; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s

; udot8_acc32 - unsigned 8-lane dot product of two <8 x i4> vectors with an i32
; accumulator. Every lane is zero-extended to i32, the lanes are multiplied
; pairwise, and the eight products are added to the i32 loaded from %dst; the
; sum is stored back to %dst.
; In the assertions below, the gfx906 and gfx1011/gfx1012 runs select a single
; v_dot8_u32_u4, while the gfx700, gfx803 and gfx900 runs emit a chain of eight
; v_mad_u32_u24 fed by s_bfe_u32 / s_and_b32 / s_lshr_b32 lane extraction.
define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1,
; GFX7-LABEL: udot8_acc32:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
; GFX7-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
; GFX7-NEXT:    s_mov_b32 s26, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x0
; GFX7-NEXT:    s_load_dword s20, s[0:1], 0x0
; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 s27, 0xe8f000
; GFX7-NEXT:    s_add_u32 s24, s24, s3
; GFX7-NEXT:    s_addc_u32 s25, s25, 0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_lshr_b32 s7, s6, 28
; GFX7-NEXT:    s_bfe_u32 s14, s6, 0x40018
; GFX7-NEXT:    s_bfe_u32 s15, s6, 0x40014
; GFX7-NEXT:    s_bfe_u32 s16, s6, 0x40010
; GFX7-NEXT:    s_bfe_u32 s17, s6, 0x4000c
; GFX7-NEXT:    s_bfe_u32 s18, s6, 0x40008
; GFX7-NEXT:    s_bfe_u32 s19, s6, 0x40004
; GFX7-NEXT:    s_and_b32 s6, s6, 15
; GFX7-NEXT:    s_lshr_b32 s5, s4, 28
; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x40018
; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x40014
; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x40010
; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x4000c
; GFX7-NEXT:    s_bfe_u32 s12, s4, 0x40008
; GFX7-NEXT:    s_bfe_u32 s13, s4, 0x40004
; GFX7-NEXT:    s_and_b32 s4, s4, 15
; GFX7-NEXT:    v_mov_b32_e32 v0, s6
; GFX7-NEXT:    v_mov_b32_e32 v1, s20
; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v0, v1
; GFX7-NEXT:    v_mov_b32_e32 v1, s19
; GFX7-NEXT:    v_mad_u32_u24 v0, s13, v1, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, s18
; GFX7-NEXT:    v_mad_u32_u24 v0, s12, v1, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, s17
; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v1, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, s16
; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v1, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, s15
; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v1, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, s14
; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, s7
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: udot8_acc32:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; GFX8-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
; GFX8-NEXT:    s_mov_b32 s22, -1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_load_dword s6, s[6:7], 0x0
; GFX8-NEXT:    s_load_dword s18, s[0:1], 0x0
; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX8-NEXT:    s_mov_b32 s23, 0xe80000
; GFX8-NEXT:    s_add_u32 s20, s20, s3
; GFX8-NEXT:    s_addc_u32 s21, s21, 0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_lshr_b32 s7, s6, 28
; GFX8-NEXT:    s_bfe_u32 s12, s6, 0x40018
; GFX8-NEXT:    s_bfe_u32 s13, s6, 0x40014
; GFX8-NEXT:    s_bfe_u32 s14, s6, 0x40010
; GFX8-NEXT:    s_bfe_u32 s15, s6, 0x4000c
; GFX8-NEXT:    s_bfe_u32 s16, s6, 0x40008
; GFX8-NEXT:    s_bfe_u32 s17, s6, 0x40004
; GFX8-NEXT:    s_and_b32 s6, s6, 15
; GFX8-NEXT:    s_lshr_b32 s3, s2, 28
; GFX8-NEXT:    s_bfe_u32 s4, s2, 0x40018
; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x40014
; GFX8-NEXT:    s_bfe_u32 s8, s2, 0x40010
; GFX8-NEXT:    s_bfe_u32 s9, s2, 0x4000c
; GFX8-NEXT:    s_bfe_u32 s10, s2, 0x40008
; GFX8-NEXT:    s_bfe_u32 s11, s2, 0x40004
; GFX8-NEXT:    s_and_b32 s2, s2, 15
; GFX8-NEXT:    v_mov_b32_e32 v0, s6
; GFX8-NEXT:    v_mov_b32_e32 v1, s18
; GFX8-NEXT:    v_mad_u32_u24 v0, s2, v0, v1
; GFX8-NEXT:    v_mov_b32_e32 v1, s17
; GFX8-NEXT:    v_mad_u32_u24 v0, s11, v1, v0
; GFX8-NEXT:    v_mov_b32_e32 v1, s16
; GFX8-NEXT:    v_mad_u32_u24 v0, s10, v1, v0
; GFX8-NEXT:    v_mov_b32_e32 v1, s15
; GFX8-NEXT:    v_mad_u32_u24 v0, s9, v1, v0
; GFX8-NEXT:    v_mov_b32_e32 v1, s14
; GFX8-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
; GFX8-NEXT:    v_mov_b32_e32 v1, s13
; GFX8-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
; GFX8-NEXT:    v_mov_b32_e32 v1, s12
; GFX8-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
; GFX8-NEXT:    v_mov_b32_e32 v1, s7
; GFX8-NEXT:    v_mad_u32_u24 v2, s3, v1, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: udot8_acc32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s22, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s6, s[6:7], 0x0
; GFX9-NEXT:    s_load_dword s18, s[0:1], 0x0
; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX9-NEXT:    s_mov_b32 s23, 0xe00000
; GFX9-NEXT:    s_add_u32 s20, s20, s3
; GFX9-NEXT:    s_addc_u32 s21, s21, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_lshr_b32 s7, s6, 28
; GFX9-NEXT:    s_bfe_u32 s12, s6, 0x40018
; GFX9-NEXT:    s_bfe_u32 s13, s6, 0x40014
; GFX9-NEXT:    s_bfe_u32 s14, s6, 0x40010
; GFX9-NEXT:    s_bfe_u32 s15, s6, 0x4000c
; GFX9-NEXT:    s_bfe_u32 s16, s6, 0x40008
; GFX9-NEXT:    s_bfe_u32 s17, s6, 0x40004
; GFX9-NEXT:    s_and_b32 s6, s6, 15
; GFX9-NEXT:    s_lshr_b32 s3, s2, 28
; GFX9-NEXT:    s_bfe_u32 s4, s2, 0x40018
; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40014
; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40010
; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x4000c
; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x40008
; GFX9-NEXT:    s_bfe_u32 s11, s2, 0x40004
; GFX9-NEXT:    s_and_b32 s2, s2, 15
; GFX9-NEXT:    v_mov_b32_e32 v1, s6
; GFX9-NEXT:    v_mov_b32_e32 v2, s18
; GFX9-NEXT:    v_mad_u32_u24 v1, s2, v1, v2
; GFX9-NEXT:    v_mov_b32_e32 v2, s17
; GFX9-NEXT:    v_mad_u32_u24 v1, s11, v2, v1
; GFX9-NEXT:    v_mov_b32_e32 v2, s16
; GFX9-NEXT:    v_mad_u32_u24 v1, s10, v2, v1
; GFX9-NEXT:    v_mov_b32_e32 v2, s15
; GFX9-NEXT:    v_mad_u32_u24 v1, s9, v2, v1
; GFX9-NEXT:    v_mov_b32_e32 v2, s14
; GFX9-NEXT:    v_mad_u32_u24 v1, s8, v2, v1
; GFX9-NEXT:    v_mov_b32_e32 v2, s13
; GFX9-NEXT:    v_mad_u32_u24 v1, s5, v2, v1
; GFX9-NEXT:    v_mov_b32_e32 v2, s12
; GFX9-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
; GFX9-NEXT:    v_mov_b32_e32 v2, s7
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: udot8_acc32:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT:    s_mov_b32 s10, -1
; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
; GFX9-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
; GFX9-DL-NEXT:    v_dot8_u32_u4 v1, s4, v1, v2
; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-LABEL: udot8_acc32:
; GFX10-DL:       ; %bb.0: ; %entry
; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NEXT:    s_mov_b32 s10, -1
; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
; GFX10-DL-NEXT:    s_clause 0x1
; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
; GFX10-DL-NEXT:    v_dot8_u32_u4 v0, s0, s1, v0
; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
; GFX10-DL-NEXT:    s_endpgm
                                       <8 x i4> addrspace(1)* %src2,
                                       i32 addrspace(1)* nocapture %dst) {
entry:
  ; Each operand is one <8 x i4> value, i.e. eight 4-bit lanes.
  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2

  ; Lane k: extract from both vectors, zero-extend to i32, multiply.
  %v1e0 = extractelement <8 x i4> %vec1, i64 0
  %cv1e0 = zext i4 %v1e0 to i32
  %v2e0 = extractelement <8 x i4> %vec2, i64 0
  %cv2e0 = zext i4 %v2e0 to i32
  %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0

  %v1e1 = extractelement <8 x i4> %vec1, i64 1
  %cv1e1 = zext i4 %v1e1 to i32
  %v2e1 = extractelement <8 x i4> %vec2, i64 1
  %cv2e1 = zext i4 %v2e1 to i32
  %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1

  %v1e2 = extractelement <8 x i4> %vec1, i64 2
  %cv1e2 = zext i4 %v1e2 to i32
  %v2e2 = extractelement <8 x i4> %vec2, i64 2
  %cv2e2 = zext i4 %v2e2 to i32
  %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2

  %v1e3 = extractelement <8 x i4> %vec1, i64 3
  %cv1e3 = zext i4 %v1e3 to i32
  %v2e3 = extractelement <8 x i4> %vec2, i64 3
  %cv2e3 = zext i4 %v2e3 to i32
  %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3

  %v1e4 = extractelement <8 x i4> %vec1, i64 4
  %cv1e4 = zext i4 %v1e4 to i32
  %v2e4 = extractelement <8 x i4> %vec2, i64 4
  %cv2e4 = zext i4 %v2e4 to i32
  %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4

  %v1e5 = extractelement <8 x i4> %vec1, i64 5
  %cv1e5 = zext i4 %v1e5 to i32
  %v2e5 = extractelement <8 x i4> %vec2, i64 5
  %cv2e5 = zext i4 %v2e5 to i32
  %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5

  %v1e6 = extractelement <8 x i4> %vec1, i64 6
  %cv1e6 = zext i4 %v1e6 to i32
  %v2e6 = extractelement <8 x i4> %vec2, i64 6
  %cv2e6 = zext i4 %v2e6 to i32
  %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6

  %v1e7 = extractelement <8 x i4> %vec1, i64 7
  %cv1e7 = zext i4 %v1e7 to i32
  %v2e7 = extractelement <8 x i4> %vec2, i64 7
  %cv2e7 = zext i4 %v2e7 to i32
  %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7

  ; The accumulator is the value already stored at %dst; the products are
  ; added to it in lane order.
  %acc = load i32, i32 addrspace(1)* %dst, align 4
  %add1 = add i32 %mul0, %acc
  %add2 = add i32 %add1, %mul1
  %add3 = add i32 %add2, %mul2
  %add4 = add i32 %add3, %mul3
  %add5 = add i32 %add4, %mul4
  %add6 = add i32 %add5, %mul5
  %add7 = add i32 %add6, %mul6
  %add8 = add i32 %add7, %mul7

  store i32 %add8, i32 addrspace(1)* %dst, align 4
  ret void
}

; TODO: Remove the unnecessary instruction (the one zero-extending the result of
; the 2nd MAD) so that the pattern recognizer can kick in.
; udot8_acc16 - unsigned 8-lane dot product of two <8 x i4> vectors with an i16
; accumulator. Every lane is zero-extended to i16, the lanes are multiplied
; pairwise, and the eight products are added to the i16 loaded from %dst; the
; sum is stored back to %dst.
; In the assertions below no run selects v_dot8_u32_u4; every run emits a chain
; of eight v_mad_u32_u24. The gfx803 and later runs additionally mask the
; running sum with 0xffff (v_and_b32) right after the second MAD.
define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1,
; GFX7-LABEL: udot8_acc16:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
; GFX7-NEXT:    s_mov_b32 s22, -1
; GFX7-NEXT:    s_mov_b32 s23, 0xe8f000
; GFX7-NEXT:    s_add_u32 s20, s20, s3
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT:    s_addc_u32 s21, s21, 0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_lshr_b32 s6, s4, 28
; GFX7-NEXT:    s_bfe_u32 s14, s5, 0x40018
; GFX7-NEXT:    s_bfe_u32 s15, s5, 0x40014
; GFX7-NEXT:    s_bfe_u32 s16, s5, 0x40010
; GFX7-NEXT:    s_bfe_u32 s17, s5, 0x4000c
; GFX7-NEXT:    s_bfe_u32 s18, s5, 0x40008
; GFX7-NEXT:    s_bfe_u32 s19, s5, 0x40004
; GFX7-NEXT:    s_lshr_b32 s13, s5, 28
; GFX7-NEXT:    s_and_b32 s5, s5, 15
; GFX7-NEXT:    s_bfe_u32 s7, s4, 0x40018
; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x40014
; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x40010
; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x4000c
; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x40008
; GFX7-NEXT:    s_bfe_u32 s12, s4, 0x40004
; GFX7-NEXT:    s_and_b32 s4, s4, 15
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    v_mov_b32_e32 v2, s19
; GFX7-NEXT:    v_mov_b32_e32 v3, s18
; GFX7-NEXT:    v_mov_b32_e32 v4, s17
; GFX7-NEXT:    v_mov_b32_e32 v5, s16
; GFX7-NEXT:    v_mov_b32_e32 v6, s15
; GFX7-NEXT:    v_mov_b32_e32 v7, s14
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, s12, v2, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v4, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v5, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v6, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v7, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, s13
; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: udot8_acc16:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    s_mov_b32 s16, SCRATCH_RSRC_DWORD0
; GFX8-NEXT:    s_mov_b32 s17, SCRATCH_RSRC_DWORD1
; GFX8-NEXT:    s_mov_b32 s18, -1
; GFX8-NEXT:    s_mov_b32 s19, 0xe80000
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
; GFX8-NEXT:    s_add_u32 s16, s16, s3
; GFX8-NEXT:    s_addc_u32 s17, s17, 0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_lshr_b32 s2, s0, 28
; GFX8-NEXT:    s_bfe_u32 s10, s1, 0x40018
; GFX8-NEXT:    s_bfe_u32 s11, s1, 0x40014
; GFX8-NEXT:    s_bfe_u32 s12, s1, 0x40010
; GFX8-NEXT:    s_bfe_u32 s13, s1, 0x4000c
; GFX8-NEXT:    s_bfe_u32 s14, s1, 0x40008
; GFX8-NEXT:    s_bfe_u32 s15, s1, 0x40004
; GFX8-NEXT:    s_lshr_b32 s9, s1, 28
; GFX8-NEXT:    s_and_b32 s1, s1, 15
; GFX8-NEXT:    s_bfe_u32 s3, s0, 0x40018
; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x40014
; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x40010
; GFX8-NEXT:    s_bfe_u32 s6, s0, 0x4000c
; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x40008
; GFX8-NEXT:    s_bfe_u32 s8, s0, 0x40004
; GFX8-NEXT:    s_and_b32 s0, s0, 15
; GFX8-NEXT:    v_mov_b32_e32 v3, s1
; GFX8-NEXT:    v_mov_b32_e32 v4, s15
; GFX8-NEXT:    v_mov_b32_e32 v5, s14
; GFX8-NEXT:    v_mov_b32_e32 v6, s13
; GFX8-NEXT:    v_mov_b32_e32 v7, s12
; GFX8-NEXT:    v_mov_b32_e32 v8, s11
; GFX8-NEXT:    v_mov_b32_e32 v9, s10
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
; GFX8-NEXT:    v_mad_u32_u24 v2, s8, v4, v2
; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v6, v2
; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v7, v2
; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v8, v2
; GFX8-NEXT:    v_mad_u32_u24 v2, s3, v9, v2
; GFX8-NEXT:    v_mov_b32_e32 v3, s9
; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
; GFX8-NEXT:    flat_store_short v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: udot8_acc16:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s22, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1]
; GFX9-NEXT:    s_mov_b32 s23, 0xe00000
; GFX9-NEXT:    s_add_u32 s20, s20, s3
; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX9-NEXT:    s_load_dword s3, s[6:7], 0x0
; GFX9-NEXT:    s_addc_u32 s21, s21, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_lshr_b32 s4, s2, 28
; GFX9-NEXT:    s_bfe_u32 s12, s3, 0x40018
; GFX9-NEXT:    s_bfe_u32 s13, s3, 0x40014
; GFX9-NEXT:    s_bfe_u32 s14, s3, 0x40010
; GFX9-NEXT:    s_bfe_u32 s15, s3, 0x4000c
; GFX9-NEXT:    s_bfe_u32 s16, s3, 0x40008
; GFX9-NEXT:    s_bfe_u32 s17, s3, 0x40004
; GFX9-NEXT:    s_lshr_b32 s11, s3, 28
; GFX9-NEXT:    s_and_b32 s3, s3, 15
; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40018
; GFX9-NEXT:    s_bfe_u32 s6, s2, 0x40014
; GFX9-NEXT:    s_bfe_u32 s7, s2, 0x40010
; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x4000c
; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x40008
; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x40004
; GFX9-NEXT:    s_and_b32 s2, s2, 15
; GFX9-NEXT:    v_mov_b32_e32 v2, s3
; GFX9-NEXT:    v_mov_b32_e32 v3, s17
; GFX9-NEXT:    v_mov_b32_e32 v4, s16
; GFX9-NEXT:    v_mov_b32_e32 v5, s15
; GFX9-NEXT:    v_mov_b32_e32 v6, s14
; GFX9-NEXT:    v_mov_b32_e32 v7, s13
; GFX9-NEXT:    v_mov_b32_e32 v8, s12
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
; GFX9-NEXT:    v_mad_u32_u24 v1, s10, v3, v1
; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT:    v_mad_u32_u24 v1, s9, v4, v1
; GFX9-NEXT:    v_mad_u32_u24 v1, s8, v5, v1
; GFX9-NEXT:    v_mad_u32_u24 v1, s7, v6, v1
; GFX9-NEXT:    v_mad_u32_u24 v1, s6, v7, v1
; GFX9-NEXT:    v_mad_u32_u24 v1, s5, v8, v1
; GFX9-NEXT:    v_mov_b32_e32 v2, s11
; GFX9-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: udot8_acc16:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; GFX9-DL-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT:    s_mov_b32 s22, -1
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    global_load_ushort v1, v0, s[0:1]
; GFX9-DL-NEXT:    s_mov_b32 s23, 0xe00000
; GFX9-DL-NEXT:    s_add_u32 s20, s20, s3
; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT:    s_addc_u32 s21, s21, 0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    s_lshr_b32 s4, s2, 28
; GFX9-DL-NEXT:    s_bfe_u32 s12, s3, 0x40018
; GFX9-DL-NEXT:    s_bfe_u32 s13, s3, 0x40014
; GFX9-DL-NEXT:    s_bfe_u32 s14, s3, 0x40010
; GFX9-DL-NEXT:    s_bfe_u32 s15, s3, 0x4000c
; GFX9-DL-NEXT:    s_bfe_u32 s16, s3, 0x40008
; GFX9-DL-NEXT:    s_bfe_u32 s17, s3, 0x40004
; GFX9-DL-NEXT:    s_lshr_b32 s11, s3, 28
; GFX9-DL-NEXT:    s_and_b32 s3, s3, 15
; GFX9-DL-NEXT:    s_bfe_u32 s5, s2, 0x40018
; GFX9-DL-NEXT:    s_bfe_u32 s6, s2, 0x40014
; GFX9-DL-NEXT:    s_bfe_u32 s7, s2, 0x40010
; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x4000c
; GFX9-DL-NEXT:    s_bfe_u32 s9, s2, 0x40008
; GFX9-DL-NEXT:    s_bfe_u32 s10, s2, 0x40004
; GFX9-DL-NEXT:    s_and_b32 s2, s2, 15
; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s17
; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s16
; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s15
; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s14
; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s13
; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s12
; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s10, v3, v1
; GFX9-DL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s9, v4, v1
; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s8, v5, v1
; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s7, v6, v1
; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s6, v7, v1
; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s5, v8, v1
; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s11
; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
; GFX9-DL-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-LABEL: udot8_acc16:
; GFX10-DL:       ; %bb.0: ; %entry
; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NEXT:    s_mov_b32 s10, -1
; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    s_and_b32 s2, s0, 15
; GFX10-DL-NEXT:    s_and_b32 s3, s1, 15
; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40004
; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40004
; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40008
; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40008
; GFX10-DL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x4000c
; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x4000c
; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40010
; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40010
; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40014
; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40014
; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40018
; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x40018
; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 28
; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 28
; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s0, s1, v1
; GFX10-DL-NEXT:    global_store_short v0, v1, s[4:5]
; GFX10-DL-NEXT:    s_endpgm
                                       <8 x i4> addrspace(1)* %src2,
                                       i16 addrspace(1)* nocapture %dst) {
entry:
  ; Each operand is one <8 x i4> value, i.e. eight 4-bit lanes.
  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1
  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2

  ; Lane k: extract from both vectors, zero-extend to i16, multiply.
  %v1e0 = extractelement <8 x i4> %vec1, i64 0
  %cv1e0 = zext i4 %v1e0 to i16
  %v2e0 = extractelement <8 x i4> %vec2, i64 0
  %cv2e0 = zext i4 %v2e0 to i16
  %mul0 = mul nuw nsw i16 %cv1e0, %cv2e0

  %v1e1 = extractelement <8 x i4> %vec1, i64 1
  %cv1e1 = zext i4 %v1e1 to i16
  %v2e1 = extractelement <8 x i4> %vec2, i64 1
  %cv2e1 = zext i4 %v2e1 to i16
  %mul1 = mul nuw nsw i16 %cv1e1, %cv2e1

  %v1e2 = extractelement <8 x i4> %vec1, i64 2
  %cv1e2 = zext i4 %v1e2 to i16
  %v2e2 = extractelement <8 x i4> %vec2, i64 2
  %cv2e2 = zext i4 %v2e2 to i16
  %mul2 = mul nuw nsw i16 %cv1e2, %cv2e2

  %v1e3 = extractelement <8 x i4> %vec1, i64 3
  %cv1e3 = zext i4 %v1e3 to i16
  %v2e3 = extractelement <8 x i4> %vec2, i64 3
  %cv2e3 = zext i4 %v2e3 to i16
  %mul3 = mul nuw nsw i16 %cv1e3, %cv2e3

  %v1e4 = extractelement <8 x i4> %vec1, i64 4
  %cv1e4 = zext i4 %v1e4 to i16
  %v2e4 = extractelement <8 x i4> %vec2, i64 4
  %cv2e4 = zext i4 %v2e4 to i16
  %mul4 = mul nuw nsw i16 %cv1e4, %cv2e4

  %v1e5 = extractelement <8 x i4> %vec1, i64 5
  %cv1e5 = zext i4 %v1e5 to i16
  %v2e5 = extractelement <8 x i4> %vec2, i64 5
  %cv2e5 = zext i4 %v2e5 to i16
  %mul5 = mul nuw nsw i16 %cv1e5, %cv2e5

  %v1e6 = extractelement <8 x i4> %vec1, i64 6
  %cv1e6 = zext i4 %v1e6 to i16
  %v2e6 = extractelement <8 x i4> %vec2, i64 6
  %cv2e6 = zext i4 %v2e6 to i16
  %mul6 = mul nuw nsw i16 %cv1e6, %cv2e6

  %v1e7 = extractelement <8 x i4> %vec1, i64 7
  %cv1e7 = zext i4 %v1e7 to i16
  %v2e7 = extractelement <8 x i4> %vec2, i64 7
  %cv2e7 = zext i4 %v2e7 to i16
  %mul7 = mul nuw nsw i16 %cv1e7, %cv2e7

  ; The accumulator is the i16 already stored at %dst; the products are added
  ; to it in lane order.
  %acc = load i16, i16 addrspace(1)* %dst, align 4
  %add1 = add i16 %mul0, %acc
  %add2 = add i16 %add1, %mul1
  %add3 = add i16 %add2, %mul2
  %add4 = add i16 %add3, %mul3
  %add5 = add i16 %add4, %mul4
  %add6 = add i16 %add5, %mul5
  %add7 = add i16 %add6, %mul6
  %add8 = add i16 %add7, %mul7

  store i16 %add8, i16 addrspace(1)* %dst, align 4
  ret void
}

; TODO: Remove the unnecessary instruction (the one zero-extending the result of
; the 2nd MAD) so that the pattern recognizer can kick in.
609define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1, 610; GFX7-LABEL: udot8_acc8: 611; GFX7: ; %bb.0: ; %entry 612; GFX7-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 613; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 614; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 615; GFX7-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 616; GFX7-NEXT: s_mov_b32 s22, -1 617; GFX7-NEXT: s_mov_b32 s23, 0xe8f000 618; GFX7-NEXT: s_add_u32 s20, s20, s3 619; GFX7-NEXT: s_mov_b32 s3, 0xf000 620; GFX7-NEXT: s_mov_b32 s2, -1 621; GFX7-NEXT: s_waitcnt lgkmcnt(0) 622; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 623; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 624; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 625; GFX7-NEXT: s_addc_u32 s21, s21, 0 626; GFX7-NEXT: s_waitcnt lgkmcnt(0) 627; GFX7-NEXT: s_lshr_b32 s6, s4, 28 628; GFX7-NEXT: s_bfe_u32 s14, s5, 0x40018 629; GFX7-NEXT: s_bfe_u32 s15, s5, 0x40014 630; GFX7-NEXT: s_bfe_u32 s16, s5, 0x40010 631; GFX7-NEXT: s_bfe_u32 s17, s5, 0x4000c 632; GFX7-NEXT: s_bfe_u32 s18, s5, 0x40008 633; GFX7-NEXT: s_bfe_u32 s19, s5, 0x40004 634; GFX7-NEXT: s_lshr_b32 s13, s5, 28 635; GFX7-NEXT: s_and_b32 s5, s5, 15 636; GFX7-NEXT: s_bfe_u32 s7, s4, 0x40018 637; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40014 638; GFX7-NEXT: s_bfe_u32 s9, s4, 0x40010 639; GFX7-NEXT: s_bfe_u32 s10, s4, 0x4000c 640; GFX7-NEXT: s_bfe_u32 s11, s4, 0x40008 641; GFX7-NEXT: s_bfe_u32 s12, s4, 0x40004 642; GFX7-NEXT: s_and_b32 s4, s4, 15 643; GFX7-NEXT: v_mov_b32_e32 v1, s5 644; GFX7-NEXT: v_mov_b32_e32 v2, s19 645; GFX7-NEXT: v_mov_b32_e32 v3, s18 646; GFX7-NEXT: v_mov_b32_e32 v4, s17 647; GFX7-NEXT: v_mov_b32_e32 v5, s16 648; GFX7-NEXT: v_mov_b32_e32 v6, s15 649; GFX7-NEXT: v_mov_b32_e32 v7, s14 650; GFX7-NEXT: s_waitcnt vmcnt(0) 651; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 652; GFX7-NEXT: v_mad_u32_u24 v0, s12, v2, v0 653; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 654; GFX7-NEXT: v_mad_u32_u24 v0, s10, v4, v0 655; GFX7-NEXT: v_mad_u32_u24 v0, s9, v5, v0 656; GFX7-NEXT: v_mad_u32_u24 v0, 
s8, v6, v0 657; GFX7-NEXT: v_mad_u32_u24 v0, s7, v7, v0 658; GFX7-NEXT: v_mov_b32_e32 v1, s13 659; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 660; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 661; GFX7-NEXT: s_endpgm 662; 663; GFX8-LABEL: udot8_acc8: 664; GFX8: ; %bb.0: ; %entry 665; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 666; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 667; GFX8-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 668; GFX8-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 669; GFX8-NEXT: s_mov_b32 s18, -1 670; GFX8-NEXT: s_mov_b32 s19, 0xe80000 671; GFX8-NEXT: s_waitcnt lgkmcnt(0) 672; GFX8-NEXT: v_mov_b32_e32 v0, s0 673; GFX8-NEXT: v_mov_b32_e32 v1, s1 674; GFX8-NEXT: flat_load_ubyte v2, v[0:1] 675; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 676; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 677; GFX8-NEXT: s_add_u32 s16, s16, s3 678; GFX8-NEXT: s_addc_u32 s17, s17, 0 679; GFX8-NEXT: s_waitcnt lgkmcnt(0) 680; GFX8-NEXT: s_lshr_b32 s2, s0, 28 681; GFX8-NEXT: s_bfe_u32 s10, s1, 0x40018 682; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40014 683; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40010 684; GFX8-NEXT: s_bfe_u32 s13, s1, 0x4000c 685; GFX8-NEXT: s_bfe_u32 s14, s1, 0x40008 686; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40004 687; GFX8-NEXT: s_lshr_b32 s9, s1, 28 688; GFX8-NEXT: s_and_b32 s1, s1, 15 689; GFX8-NEXT: s_bfe_u32 s3, s0, 0x40018 690; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40014 691; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40010 692; GFX8-NEXT: s_bfe_u32 s6, s0, 0x4000c 693; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40008 694; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40004 695; GFX8-NEXT: s_and_b32 s0, s0, 15 696; GFX8-NEXT: v_mov_b32_e32 v3, s1 697; GFX8-NEXT: v_mov_b32_e32 v4, s15 698; GFX8-NEXT: v_mov_b32_e32 v5, s14 699; GFX8-NEXT: v_mov_b32_e32 v6, s13 700; GFX8-NEXT: v_mov_b32_e32 v7, s12 701; GFX8-NEXT: v_mov_b32_e32 v8, s11 702; GFX8-NEXT: v_mov_b32_e32 v9, s10 703; GFX8-NEXT: s_waitcnt vmcnt(0) 704; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 705; GFX8-NEXT: v_mad_u32_u24 v2, s8, v4, v2 706; GFX8-NEXT: v_and_b32_e32 
v2, 0xff, v2 707; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2 708; GFX8-NEXT: v_mad_u32_u24 v2, s6, v6, v2 709; GFX8-NEXT: v_mad_u32_u24 v2, s5, v7, v2 710; GFX8-NEXT: v_mad_u32_u24 v2, s4, v8, v2 711; GFX8-NEXT: v_mad_u32_u24 v2, s3, v9, v2 712; GFX8-NEXT: v_mov_b32_e32 v3, s9 713; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 714; GFX8-NEXT: flat_store_byte v[0:1], v2 715; GFX8-NEXT: s_endpgm 716; 717; GFX9-LABEL: udot8_acc8: 718; GFX9: ; %bb.0: ; %entry 719; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 720; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 721; GFX9-NEXT: v_mov_b32_e32 v0, 0 722; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 723; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 724; GFX9-NEXT: s_mov_b32 s22, -1 725; GFX9-NEXT: s_waitcnt lgkmcnt(0) 726; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] 727; GFX9-NEXT: s_mov_b32 s23, 0xe00000 728; GFX9-NEXT: s_add_u32 s20, s20, s3 729; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 730; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 731; GFX9-NEXT: s_addc_u32 s21, s21, 0 732; GFX9-NEXT: s_waitcnt lgkmcnt(0) 733; GFX9-NEXT: s_lshr_b32 s4, s2, 28 734; GFX9-NEXT: s_bfe_u32 s12, s3, 0x40018 735; GFX9-NEXT: s_bfe_u32 s13, s3, 0x40014 736; GFX9-NEXT: s_bfe_u32 s14, s3, 0x40010 737; GFX9-NEXT: s_bfe_u32 s15, s3, 0x4000c 738; GFX9-NEXT: s_bfe_u32 s16, s3, 0x40008 739; GFX9-NEXT: s_bfe_u32 s17, s3, 0x40004 740; GFX9-NEXT: s_lshr_b32 s11, s3, 28 741; GFX9-NEXT: s_and_b32 s3, s3, 15 742; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40018 743; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014 744; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 745; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c 746; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 747; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40004 748; GFX9-NEXT: s_and_b32 s2, s2, 15 749; GFX9-NEXT: v_mov_b32_e32 v2, s3 750; GFX9-NEXT: v_mov_b32_e32 v3, s17 751; GFX9-NEXT: v_mov_b32_e32 v4, s16 752; GFX9-NEXT: v_mov_b32_e32 v5, s15 753; GFX9-NEXT: v_mov_b32_e32 v6, s14 754; GFX9-NEXT: v_mov_b32_e32 v7, s13 755; GFX9-NEXT: v_mov_b32_e32 v8, s12 756; 
GFX9-NEXT: s_waitcnt vmcnt(0) 757; GFX9-NEXT: v_mad_u32_u24 v1, s2, v2, v1 758; GFX9-NEXT: v_mad_u32_u24 v1, s10, v3, v1 759; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1 760; GFX9-NEXT: v_mad_u32_u24 v1, s9, v4, v1 761; GFX9-NEXT: v_mad_u32_u24 v1, s8, v5, v1 762; GFX9-NEXT: v_mad_u32_u24 v1, s7, v6, v1 763; GFX9-NEXT: v_mad_u32_u24 v1, s6, v7, v1 764; GFX9-NEXT: v_mad_u32_u24 v1, s5, v8, v1 765; GFX9-NEXT: v_mov_b32_e32 v2, s11 766; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1 767; GFX9-NEXT: global_store_byte v0, v1, s[0:1] 768; GFX9-NEXT: s_endpgm 769; 770; GFX9-DL-LABEL: udot8_acc8: 771; GFX9-DL: ; %bb.0: ; %entry 772; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 773; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 774; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 775; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 776; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 777; GFX9-DL-NEXT: s_mov_b32 s22, -1 778; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 779; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] 780; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 781; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 782; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 783; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 784; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 785; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 786; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28 787; GFX9-DL-NEXT: s_bfe_u32 s12, s3, 0x40018 788; GFX9-DL-NEXT: s_bfe_u32 s13, s3, 0x40014 789; GFX9-DL-NEXT: s_bfe_u32 s14, s3, 0x40010 790; GFX9-DL-NEXT: s_bfe_u32 s15, s3, 0x4000c 791; GFX9-DL-NEXT: s_bfe_u32 s16, s3, 0x40008 792; GFX9-DL-NEXT: s_bfe_u32 s17, s3, 0x40004 793; GFX9-DL-NEXT: s_lshr_b32 s11, s3, 28 794; GFX9-DL-NEXT: s_and_b32 s3, s3, 15 795; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40018 796; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40014 797; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40010 798; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c 799; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40008 800; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40004 801; GFX9-DL-NEXT: s_and_b32 s2, s2, 15 802; GFX9-DL-NEXT: 
v_mov_b32_e32 v2, s3 803; GFX9-DL-NEXT: v_mov_b32_e32 v3, s17 804; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 805; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 806; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 807; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 808; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 809; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 810; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 811; GFX9-DL-NEXT: v_mad_u32_u24 v1, s10, v3, v1 812; GFX9-DL-NEXT: v_and_b32_e32 v1, 0xff, v1 813; GFX9-DL-NEXT: v_mad_u32_u24 v1, s9, v4, v1 814; GFX9-DL-NEXT: v_mad_u32_u24 v1, s8, v5, v1 815; GFX9-DL-NEXT: v_mad_u32_u24 v1, s7, v6, v1 816; GFX9-DL-NEXT: v_mad_u32_u24 v1, s6, v7, v1 817; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v8, v1 818; GFX9-DL-NEXT: v_mov_b32_e32 v2, s11 819; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 820; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] 821; GFX9-DL-NEXT: s_endpgm 822; 823; GFX10-DL-LABEL: udot8_acc8: 824; GFX10-DL: ; %bb.0: ; %entry 825; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 826; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 827; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 828; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 829; GFX10-DL-NEXT: s_mov_b32 s10, -1 830; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 831; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 832; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 833; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 834; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 835; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] 836; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 837; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 838; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 839; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 840; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 841; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 842; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 843; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004 844; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40004 845; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 846; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 847; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40008 848; 
GFX10-DL-NEXT: v_and_b32_e32 v1, 0xff, v1 849; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 850; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x4000c 851; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x4000c 852; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 853; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 854; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 855; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 856; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40014 857; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40014 858; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 859; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 860; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 861; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 862; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 863; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 864; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 865; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5] 866; GFX10-DL-NEXT: s_endpgm 867 <8 x i4> addrspace(1)* %src2, 868 i8 addrspace(1)* nocapture %dst) { 869entry: 870 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1 871 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2 872 873 %v1e0 = extractelement <8 x i4> %vec1, i64 0 874 %cv1e0 = zext i4 %v1e0 to i8 875 %v2e0 = extractelement <8 x i4> %vec2, i64 0 876 %cv2e0 = zext i4 %v2e0 to i8 877 %mul0 = mul nuw nsw i8 %cv1e0, %cv2e0 878 879 %v1e1 = extractelement <8 x i4> %vec1, i64 1 880 %cv1e1 = zext i4 %v1e1 to i8 881 %v2e1 = extractelement <8 x i4> %vec2, i64 1 882 %cv2e1 = zext i4 %v2e1 to i8 883 %mul1 = mul nuw nsw i8 %cv1e1, %cv2e1 884 885 %v1e2 = extractelement <8 x i4> %vec1, i64 2 886 %cv1e2 = zext i4 %v1e2 to i8 887 %v2e2 = extractelement <8 x i4> %vec2, i64 2 888 %cv2e2 = zext i4 %v2e2 to i8 889 %mul2 = mul nuw nsw i8 %cv1e2, %cv2e2 890 891 %v1e3 = extractelement <8 x i4> %vec1, i64 3 892 %cv1e3 = zext i4 %v1e3 to i8 893 %v2e3 = extractelement <8 x i4> %vec2, i64 3 894 %cv2e3 = zext i4 %v2e3 to i8 895 %mul3 = mul nuw nsw i8 %cv1e3, %cv2e3 896 897 %v1e4 = extractelement <8 x i4> %vec1, i64 4 898 %cv1e4 = zext i4 %v1e4 to i8 899 
%v2e4 = extractelement <8 x i4> %vec2, i64 4 900 %cv2e4 = zext i4 %v2e4 to i8 901 %mul4 = mul nuw nsw i8 %cv1e4, %cv2e4 902 903 %v1e5 = extractelement <8 x i4> %vec1, i64 5 904 %cv1e5 = zext i4 %v1e5 to i8 905 %v2e5 = extractelement <8 x i4> %vec2, i64 5 906 %cv2e5 = zext i4 %v2e5 to i8 907 %mul5 = mul nuw nsw i8 %cv1e5, %cv2e5 908 909 %v1e6 = extractelement <8 x i4> %vec1, i64 6 910 %cv1e6 = zext i4 %v1e6 to i8 911 %v2e6 = extractelement <8 x i4> %vec2, i64 6 912 %cv2e6 = zext i4 %v2e6 to i8 913 %mul6 = mul nuw nsw i8 %cv1e6, %cv2e6 914 915 %v1e7 = extractelement <8 x i4> %vec1, i64 7 916 %cv1e7 = zext i4 %v1e7 to i8 917 %v2e7 = extractelement <8 x i4> %vec2, i64 7 918 %cv2e7 = zext i4 %v2e7 to i8 919 %mul7 = mul nuw nsw i8 %cv1e7, %cv2e7 920 921 %acc = load i8, i8 addrspace(1)* %dst, align 4 922 %add1 = add i8 %mul0, %acc 923 %add2 = add i8 %add1, %mul1 924 %add3 = add i8 %add2, %mul2 925 %add4 = add i8 %add3, %mul3 926 %add5 = add i8 %add4, %mul4 927 %add6 = add i8 %add5, %mul5 928 %add7 = add i8 %add6, %mul6 929 %add8 = add i8 %add7, %mul7 930 931 store i8 %add8, i8 addrspace(1)* %dst, align 4 932 ret void 933} 934 935; TODO: Remove the two unnecessary instructions (and + add after the 2nd MAD) 936; so that the pattern recognizer can kick in. 
; udot8_acc4 - kernel exercising an 8-element i4 multiply/accumulate pattern
; whose accumulator is itself i4.
;   * Loads two <8 x i4> vectors from %src1 and %src2.
;   * Multiplies the eight element pairs directly in i4 (mul nuw nsw, with no
;     extension to a wider type).
;   * Loads an i4 accumulator from %dst, adds the eight products onto it one
;     after another (%add1 .. %add8), and stores the i4 result back to %dst.
; The assertions that follow cover the GFX7, GFX8, GFX9, GFX9-DL and GFX10-DL
; check prefixes. All of them expect a chain of v_mad_u32_u24 instructions
; ending in a mask with 15 before the byte store; GFX8 and newer additionally
; expect one separate v_mul_u32_u24 whose masked result is added back in.
; None of the expected outputs contains a dot instruction.
; The assertions are autogenerated (see the note on utils/update_llc_test_checks.py
; at the top of the file); regenerate them instead of editing them by hand.
937define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1, 938; GFX7-LABEL: udot8_acc4: 939; GFX7: ; %bb.0: ; %entry 940; GFX7-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 941; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 942; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 943; GFX7-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 944; GFX7-NEXT: s_mov_b32 s22, -1 945; GFX7-NEXT: s_mov_b32 s23, 0xe8f000 946; GFX7-NEXT: s_add_u32 s20, s20, s3 947; GFX7-NEXT: s_mov_b32 s3, 0xf000 948; GFX7-NEXT: s_mov_b32 s2, -1 949; GFX7-NEXT: s_waitcnt lgkmcnt(0) 950; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 951; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 952; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 953; GFX7-NEXT: s_addc_u32 s21, s21, 0 954; GFX7-NEXT: s_waitcnt lgkmcnt(0) 955; GFX7-NEXT: s_lshr_b32 s6, s4, 28 956; GFX7-NEXT: s_bfe_u32 s14, s5, 0x40018 957; GFX7-NEXT: s_bfe_u32 s15, s5, 0x40014 958; GFX7-NEXT: s_bfe_u32 s16, s5, 0x40010 959; GFX7-NEXT: s_bfe_u32 s17, s5, 0x4000c 960; GFX7-NEXT: s_bfe_u32 s18, s5, 0x40008 961; GFX7-NEXT: s_bfe_u32 s19, s5, 0x40004 962; GFX7-NEXT: s_lshr_b32 s13, s5, 28 963; GFX7-NEXT: s_and_b32 s5, s5, 15 964; GFX7-NEXT: s_bfe_u32 s7, s4, 0x40018 965; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40014 966; GFX7-NEXT: s_bfe_u32 s9, s4, 0x40010 967; GFX7-NEXT: s_bfe_u32 s10, s4, 0x4000c 968; GFX7-NEXT: s_bfe_u32 s11, s4, 0x40008 969; GFX7-NEXT: s_bfe_u32 s12, s4, 0x40004 970; GFX7-NEXT: s_and_b32 s4, s4, 15 971; GFX7-NEXT: v_mov_b32_e32 v1, s5 972; GFX7-NEXT: v_mov_b32_e32 v2, s19 973; GFX7-NEXT: v_mov_b32_e32 v3, s18 974; GFX7-NEXT: v_mov_b32_e32 v4, s17 975; GFX7-NEXT: v_mov_b32_e32 v5, s16 976; GFX7-NEXT: v_mov_b32_e32 v6, s15 977; GFX7-NEXT: v_mov_b32_e32 v7, s14 978; GFX7-NEXT: s_waitcnt vmcnt(0) 979; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 980; GFX7-NEXT: v_mad_u32_u24 v0, s12, v2, v0 981; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 982; GFX7-NEXT: v_mad_u32_u24 v0, s10, v4, v0 983; GFX7-NEXT: v_mad_u32_u24 v0, s9, v5, v0 984; GFX7-NEXT: v_mad_u32_u24 v0, 
s8, v6, v0 985; GFX7-NEXT: v_mad_u32_u24 v0, s7, v7, v0 986; GFX7-NEXT: v_mov_b32_e32 v1, s13 987; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 988; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 989; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 990; GFX7-NEXT: s_endpgm 991; 992; GFX8-LABEL: udot8_acc4: 993; GFX8: ; %bb.0: ; %entry 994; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 995; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 996; GFX8-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 997; GFX8-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 998; GFX8-NEXT: s_mov_b32 s18, -1 999; GFX8-NEXT: s_mov_b32 s19, 0xe80000 1000; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1001; GFX8-NEXT: v_mov_b32_e32 v0, s0 1002; GFX8-NEXT: v_mov_b32_e32 v1, s1 1003; GFX8-NEXT: flat_load_ubyte v2, v[0:1] 1004; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 1005; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 1006; GFX8-NEXT: s_add_u32 s16, s16, s3 1007; GFX8-NEXT: s_addc_u32 s17, s17, 0 1008; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1009; GFX8-NEXT: s_and_b32 s8, s0, 15 1010; GFX8-NEXT: s_and_b32 s15, s1, 15 1011; GFX8-NEXT: s_bfe_u32 s14, s1, 0x40004 1012; GFX8-NEXT: v_mov_b32_e32 v4, s15 1013; GFX8-NEXT: s_bfe_u32 s10, s1, 0x40018 1014; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40014 1015; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40010 1016; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40008 1017; GFX8-NEXT: s_lshr_b32 s9, s1, 28 1018; GFX8-NEXT: s_bfe_u32 s1, s1, 0x4000c 1019; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40004 1020; GFX8-NEXT: v_mov_b32_e32 v5, s14 1021; GFX8-NEXT: s_lshr_b32 s2, s0, 28 1022; GFX8-NEXT: s_bfe_u32 s3, s0, 0x40018 1023; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40014 1024; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40010 1025; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40008 1026; GFX8-NEXT: s_bfe_u32 s0, s0, 0x4000c 1027; GFX8-NEXT: v_mov_b32_e32 v3, s1 1028; GFX8-NEXT: v_mov_b32_e32 v6, s13 1029; GFX8-NEXT: v_mul_u32_u24_e32 v3, s0, v3 1030; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 1031; GFX8-NEXT: v_mov_b32_e32 v7, s12 1032; GFX8-NEXT: v_mov_b32_e32 v8, s11 1033; GFX8-NEXT: v_mov_b32_e32 
v9, s10 1034; GFX8-NEXT: s_waitcnt vmcnt(0) 1035; GFX8-NEXT: v_mad_u32_u24 v2, s8, v4, v2 1036; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2 1037; GFX8-NEXT: v_mad_u32_u24 v2, s6, v6, v2 1038; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 1039; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 1040; GFX8-NEXT: v_mad_u32_u24 v2, s5, v7, v2 1041; GFX8-NEXT: v_mad_u32_u24 v2, s4, v8, v2 1042; GFX8-NEXT: v_mad_u32_u24 v2, s3, v9, v2 1043; GFX8-NEXT: v_mov_b32_e32 v3, s9 1044; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 1045; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 1046; GFX8-NEXT: flat_store_byte v[0:1], v2 1047; GFX8-NEXT: s_endpgm 1048; 1049; GFX9-LABEL: udot8_acc4: 1050; GFX9: ; %bb.0: ; %entry 1051; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1052; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1053; GFX9-NEXT: v_mov_b32_e32 v0, 0 1054; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 1055; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 1056; GFX9-NEXT: s_mov_b32 s22, -1 1057; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1058; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] 1059; GFX9-NEXT: s_mov_b32 s23, 0xe00000 1060; GFX9-NEXT: s_add_u32 s20, s20, s3 1061; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 1062; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 1063; GFX9-NEXT: s_addc_u32 s21, s21, 0 1064; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1065; GFX9-NEXT: s_and_b32 s10, s2, 15 1066; GFX9-NEXT: s_and_b32 s17, s3, 15 1067; GFX9-NEXT: s_bfe_u32 s16, s3, 0x40004 1068; GFX9-NEXT: v_mov_b32_e32 v3, s17 1069; GFX9-NEXT: s_bfe_u32 s12, s3, 0x40018 1070; GFX9-NEXT: s_bfe_u32 s13, s3, 0x40014 1071; GFX9-NEXT: s_bfe_u32 s14, s3, 0x40010 1072; GFX9-NEXT: s_bfe_u32 s15, s3, 0x40008 1073; GFX9-NEXT: s_lshr_b32 s11, s3, 28 1074; GFX9-NEXT: s_bfe_u32 s3, s3, 0x4000c 1075; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40004 1076; GFX9-NEXT: v_mov_b32_e32 v4, s16 1077; GFX9-NEXT: s_lshr_b32 s4, s2, 28 1078; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40018 1079; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014 1080; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 1081; GFX9-NEXT: s_bfe_u32 s8, s2, 
0x40008 1082; GFX9-NEXT: s_bfe_u32 s2, s2, 0x4000c 1083; GFX9-NEXT: v_mov_b32_e32 v2, s3 1084; GFX9-NEXT: v_mov_b32_e32 v5, s15 1085; GFX9-NEXT: v_mul_u32_u24_e32 v2, s2, v2 1086; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 1087; GFX9-NEXT: v_mov_b32_e32 v6, s14 1088; GFX9-NEXT: v_mov_b32_e32 v7, s13 1089; GFX9-NEXT: v_mov_b32_e32 v8, s12 1090; GFX9-NEXT: s_waitcnt vmcnt(0) 1091; GFX9-NEXT: v_mad_u32_u24 v1, s10, v3, v1 1092; GFX9-NEXT: v_mad_u32_u24 v1, s9, v4, v1 1093; GFX9-NEXT: v_mad_u32_u24 v1, s8, v5, v1 1094; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 1095; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 1096; GFX9-NEXT: v_mad_u32_u24 v1, s7, v6, v1 1097; GFX9-NEXT: v_mad_u32_u24 v1, s6, v7, v1 1098; GFX9-NEXT: v_mad_u32_u24 v1, s5, v8, v1 1099; GFX9-NEXT: v_mov_b32_e32 v2, s11 1100; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1 1101; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 1102; GFX9-NEXT: global_store_byte v0, v1, s[0:1] 1103; GFX9-NEXT: s_endpgm 1104; 1105; GFX9-DL-LABEL: udot8_acc4: 1106; GFX9-DL: ; %bb.0: ; %entry 1107; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1108; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1109; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1110; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 1111; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 1112; GFX9-DL-NEXT: s_mov_b32 s22, -1 1113; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1114; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] 1115; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 1116; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 1117; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 1118; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 1119; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 1120; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1121; GFX9-DL-NEXT: s_and_b32 s10, s2, 15 1122; GFX9-DL-NEXT: s_and_b32 s17, s3, 15 1123; GFX9-DL-NEXT: s_bfe_u32 s16, s3, 0x40004 1124; GFX9-DL-NEXT: v_mov_b32_e32 v3, s17 1125; GFX9-DL-NEXT: s_bfe_u32 s12, s3, 0x40018 1126; GFX9-DL-NEXT: s_bfe_u32 s13, s3, 0x40014 1127; GFX9-DL-NEXT: s_bfe_u32 s14, s3, 0x40010 1128; 
GFX9-DL-NEXT: s_bfe_u32 s15, s3, 0x40008 1129; GFX9-DL-NEXT: s_lshr_b32 s11, s3, 28 1130; GFX9-DL-NEXT: s_bfe_u32 s3, s3, 0x4000c 1131; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40004 1132; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 1133; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28 1134; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40018 1135; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40014 1136; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40010 1137; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40008 1138; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x4000c 1139; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 1140; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 1141; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, s2, v2 1142; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 1143; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 1144; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 1145; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 1146; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 1147; GFX9-DL-NEXT: v_mad_u32_u24 v1, s10, v3, v1 1148; GFX9-DL-NEXT: v_mad_u32_u24 v1, s9, v4, v1 1149; GFX9-DL-NEXT: v_mad_u32_u24 v1, s8, v5, v1 1150; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 1151; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v2 1152; GFX9-DL-NEXT: v_mad_u32_u24 v1, s7, v6, v1 1153; GFX9-DL-NEXT: v_mad_u32_u24 v1, s6, v7, v1 1154; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v8, v1 1155; GFX9-DL-NEXT: v_mov_b32_e32 v2, s11 1156; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 1157; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 1158; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] 1159; GFX9-DL-NEXT: s_endpgm 1160; 1161; GFX10-DL-LABEL: udot8_acc4: 1162; GFX10-DL: ; %bb.0: ; %entry 1163; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 1164; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 1165; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 1166; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 1167; GFX10-DL-NEXT: s_mov_b32 s10, -1 1168; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 1169; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 1170; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1171; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 1172; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1173; 
GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] 1174; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 1175; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 1176; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1177; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 1178; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 1179; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40008 1180; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x4000c 1181; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 1182; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 1183; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004 1184; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40004 1185; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 1186; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 1187; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x4000c 1188; GFX10-DL-NEXT: v_mul_u32_u24_e64 v2, s3, s7 1189; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s6, v1 1190; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 1191; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 1192; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 1193; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1 1194; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v2 1195; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 1196; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40014 1197; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40014 1198; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 1199; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 1200; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 1201; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 1202; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 1203; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 1204; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 1205; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1 1206; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5] 1207; GFX10-DL-NEXT: s_endpgm 1208 <8 x i4> addrspace(1)* %src2, 1209 i4 addrspace(1)* nocapture %dst) { 1210entry: 1211 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1 1212 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2 1213 1214 %v1e0 = extractelement <8 x i4> %vec1, i64 0 1215 %v2e0 = extractelement <8 x i4> %vec2, i64 0 1216 %mul0 = mul nuw nsw i4 %v1e0, %v2e0 1217 1218 %v1e1 = 
extractelement <8 x i4> %vec1, i64 1 1219 %v2e1 = extractelement <8 x i4> %vec2, i64 1 1220 %mul1 = mul nuw nsw i4 %v1e1, %v2e1 1221 1222 %v1e2 = extractelement <8 x i4> %vec1, i64 2 1223 %v2e2 = extractelement <8 x i4> %vec2, i64 2 1224 %mul2 = mul nuw nsw i4 %v1e2, %v2e2 1225 1226 %v1e3 = extractelement <8 x i4> %vec1, i64 3 1227 %v2e3 = extractelement <8 x i4> %vec2, i64 3 1228 %mul3 = mul nuw nsw i4 %v1e3, %v2e3 1229 1230 %v1e4 = extractelement <8 x i4> %vec1, i64 4 1231 %v2e4 = extractelement <8 x i4> %vec2, i64 4 1232 %mul4 = mul nuw nsw i4 %v1e4, %v2e4 1233 1234 %v1e5 = extractelement <8 x i4> %vec1, i64 5 1235 %v2e5 = extractelement <8 x i4> %vec2, i64 5 1236 %mul5 = mul nuw nsw i4 %v1e5, %v2e5 1237 1238 %v1e6 = extractelement <8 x i4> %vec1, i64 6 1239 %v2e6 = extractelement <8 x i4> %vec2, i64 6 1240 %mul6 = mul nuw nsw i4 %v1e6, %v2e6 1241 1242 %v1e7 = extractelement <8 x i4> %vec1, i64 7 1243 %v2e7 = extractelement <8 x i4> %vec2, i64 7 1244 %mul7 = mul nuw nsw i4 %v1e7, %v2e7 1245 1246 %acc = load i4, i4 addrspace(1)* %dst, align 4 1247 %add1 = add i4 %mul0, %acc 1248 %add2 = add i4 %add1, %mul1 1249 %add3 = add i4 %add2, %mul2 1250 %add4 = add i4 %add3, %mul3 1251 %add5 = add i4 %add4, %mul4 1252 %add6 = add i4 %add5, %mul5 1253 %add7 = add i4 %add6, %mul6 1254 %add8 = add i4 %add7, %mul7 1255 1256 store i4 %add8, i4 addrspace(1)* %dst, align 4 1257 ret void 1258} 1259 1260; TODO: Currently, permutation of udot8 is turned off due to a huge increase 1261; in the compile time. 
; udot8_CommutationInsideMAD - kernel with the same shape as the i4 dot/accumulate
; test: it loads two <8 x i4> vectors from %src1 and %src2, multiplies the
; eight element pairs in i4 (mul nuw nsw), adds the products onto the i4
; accumulator loaded from %dst, and stores the i4 result back to %dst.
; What distinguishes it is the operand order of the add chain: from %add2
; onwards each add takes the new product first and the running sum second
; (add i4 %mulN, %addN), i.e. the add operands are commuted.
; The assertions that follow cover the GFX7, GFX8, GFX9, GFX9-DL and GFX10-DL
; check prefixes. All of them expect v_mad_u32_u24 chains ending in a mask
; with 15 before the byte store (GFX8 and newer also expect one separate
; v_mul_u32_u24 that is masked and added in); none expects a dot instruction.
; The assertions are autogenerated (see the note on utils/update_llc_test_checks.py
; at the top of the file); regenerate them instead of editing them by hand.
1262define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %src1, 1263; GFX7-LABEL: udot8_CommutationInsideMAD: 1264; GFX7: ; %bb.0: ; %entry 1265; GFX7-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 1266; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1267; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1268; GFX7-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 1269; GFX7-NEXT: s_mov_b32 s22, -1 1270; GFX7-NEXT: s_mov_b32 s23, 0xe8f000 1271; GFX7-NEXT: s_add_u32 s20, s20, s3 1272; GFX7-NEXT: s_mov_b32 s3, 0xf000 1273; GFX7-NEXT: s_mov_b32 s2, -1 1274; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1275; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 1276; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 1277; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 1278; GFX7-NEXT: s_addc_u32 s21, s21, 0 1279; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1280; GFX7-NEXT: s_lshr_b32 s6, s4, 28 1281; GFX7-NEXT: s_bfe_u32 s14, s5, 0x40018 1282; GFX7-NEXT: s_bfe_u32 s15, s5, 0x40014 1283; GFX7-NEXT: s_bfe_u32 s16, s5, 0x40010 1284; GFX7-NEXT: s_bfe_u32 s17, s5, 0x4000c 1285; GFX7-NEXT: s_bfe_u32 s18, s5, 0x40008 1286; GFX7-NEXT: s_bfe_u32 s19, s5, 0x40004 1287; GFX7-NEXT: s_lshr_b32 s13, s5, 28 1288; GFX7-NEXT: s_and_b32 s5, s5, 15 1289; GFX7-NEXT: s_bfe_u32 s7, s4, 0x40018 1290; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40014 1291; GFX7-NEXT: s_bfe_u32 s9, s4, 0x40010 1292; GFX7-NEXT: s_bfe_u32 s10, s4, 0x4000c 1293; GFX7-NEXT: s_bfe_u32 s11, s4, 0x40008 1294; GFX7-NEXT: s_bfe_u32 s12, s4, 0x40004 1295; GFX7-NEXT: s_and_b32 s4, s4, 15 1296; GFX7-NEXT: v_mov_b32_e32 v1, s5 1297; GFX7-NEXT: v_mov_b32_e32 v2, s19 1298; GFX7-NEXT: v_mov_b32_e32 v3, s18 1299; GFX7-NEXT: v_mov_b32_e32 v4, s17 1300; GFX7-NEXT: v_mov_b32_e32 v5, s16 1301; GFX7-NEXT: v_mov_b32_e32 v6, s15 1302; GFX7-NEXT: v_mov_b32_e32 v7, s14 1303; GFX7-NEXT: s_waitcnt vmcnt(0) 1304; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 1305; GFX7-NEXT: v_mad_u32_u24 v0, s12, v2, v0 1306; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 1307; GFX7-NEXT: v_mad_u32_u24 v0, s10, v4, v0 
1308; GFX7-NEXT: v_mad_u32_u24 v0, s9, v5, v0 1309; GFX7-NEXT: v_mad_u32_u24 v0, s8, v6, v0 1310; GFX7-NEXT: v_mad_u32_u24 v0, s7, v7, v0 1311; GFX7-NEXT: v_mov_b32_e32 v1, s13 1312; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 1313; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 1314; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 1315; GFX7-NEXT: s_endpgm 1316; 1317; GFX8-LABEL: udot8_CommutationInsideMAD: 1318; GFX8: ; %bb.0: ; %entry 1319; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1320; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1321; GFX8-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 1322; GFX8-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 1323; GFX8-NEXT: s_mov_b32 s18, -1 1324; GFX8-NEXT: s_mov_b32 s19, 0xe80000 1325; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1326; GFX8-NEXT: v_mov_b32_e32 v0, s0 1327; GFX8-NEXT: v_mov_b32_e32 v1, s1 1328; GFX8-NEXT: flat_load_ubyte v2, v[0:1] 1329; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 1330; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 1331; GFX8-NEXT: s_add_u32 s16, s16, s3 1332; GFX8-NEXT: s_addc_u32 s17, s17, 0 1333; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1334; GFX8-NEXT: s_and_b32 s8, s0, 15 1335; GFX8-NEXT: s_and_b32 s15, s1, 15 1336; GFX8-NEXT: s_bfe_u32 s14, s1, 0x40004 1337; GFX8-NEXT: v_mov_b32_e32 v4, s15 1338; GFX8-NEXT: s_bfe_u32 s10, s1, 0x40018 1339; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40014 1340; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40010 1341; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40008 1342; GFX8-NEXT: s_lshr_b32 s9, s1, 28 1343; GFX8-NEXT: s_bfe_u32 s1, s1, 0x4000c 1344; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40004 1345; GFX8-NEXT: v_mov_b32_e32 v5, s14 1346; GFX8-NEXT: s_lshr_b32 s2, s0, 28 1347; GFX8-NEXT: s_bfe_u32 s3, s0, 0x40018 1348; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40014 1349; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40010 1350; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40008 1351; GFX8-NEXT: s_bfe_u32 s0, s0, 0x4000c 1352; GFX8-NEXT: v_mov_b32_e32 v3, s1 1353; GFX8-NEXT: v_mov_b32_e32 v6, s13 1354; GFX8-NEXT: v_mul_u32_u24_e32 v3, s0, v3 1355; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 
1356; GFX8-NEXT: v_mov_b32_e32 v7, s12 1357; GFX8-NEXT: v_mov_b32_e32 v8, s11 1358; GFX8-NEXT: v_mov_b32_e32 v9, s10 1359; GFX8-NEXT: s_waitcnt vmcnt(0) 1360; GFX8-NEXT: v_mad_u32_u24 v2, s8, v4, v2 1361; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2 1362; GFX8-NEXT: v_mad_u32_u24 v2, s6, v6, v2 1363; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 1364; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 1365; GFX8-NEXT: v_mad_u32_u24 v2, s5, v7, v2 1366; GFX8-NEXT: v_mad_u32_u24 v2, s4, v8, v2 1367; GFX8-NEXT: v_mad_u32_u24 v2, s3, v9, v2 1368; GFX8-NEXT: v_mov_b32_e32 v3, s9 1369; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 1370; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 1371; GFX8-NEXT: flat_store_byte v[0:1], v2 1372; GFX8-NEXT: s_endpgm 1373; 1374; GFX9-LABEL: udot8_CommutationInsideMAD: 1375; GFX9: ; %bb.0: ; %entry 1376; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1377; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1378; GFX9-NEXT: v_mov_b32_e32 v0, 0 1379; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 1380; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 1381; GFX9-NEXT: s_mov_b32 s22, -1 1382; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1383; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] 1384; GFX9-NEXT: s_mov_b32 s23, 0xe00000 1385; GFX9-NEXT: s_add_u32 s20, s20, s3 1386; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 1387; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 1388; GFX9-NEXT: s_addc_u32 s21, s21, 0 1389; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1390; GFX9-NEXT: s_and_b32 s10, s2, 15 1391; GFX9-NEXT: s_and_b32 s17, s3, 15 1392; GFX9-NEXT: s_bfe_u32 s16, s3, 0x40004 1393; GFX9-NEXT: v_mov_b32_e32 v3, s17 1394; GFX9-NEXT: s_bfe_u32 s12, s3, 0x40018 1395; GFX9-NEXT: s_bfe_u32 s13, s3, 0x40014 1396; GFX9-NEXT: s_bfe_u32 s14, s3, 0x40010 1397; GFX9-NEXT: s_bfe_u32 s15, s3, 0x40008 1398; GFX9-NEXT: s_lshr_b32 s11, s3, 28 1399; GFX9-NEXT: s_bfe_u32 s3, s3, 0x4000c 1400; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40004 1401; GFX9-NEXT: v_mov_b32_e32 v4, s16 1402; GFX9-NEXT: s_lshr_b32 s4, s2, 28 1403; GFX9-NEXT: s_bfe_u32 s5, s2, 
0x40018 1404; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014 1405; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 1406; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40008 1407; GFX9-NEXT: s_bfe_u32 s2, s2, 0x4000c 1408; GFX9-NEXT: v_mov_b32_e32 v2, s3 1409; GFX9-NEXT: v_mov_b32_e32 v5, s15 1410; GFX9-NEXT: v_mul_u32_u24_e32 v2, s2, v2 1411; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 1412; GFX9-NEXT: v_mov_b32_e32 v6, s14 1413; GFX9-NEXT: v_mov_b32_e32 v7, s13 1414; GFX9-NEXT: v_mov_b32_e32 v8, s12 1415; GFX9-NEXT: s_waitcnt vmcnt(0) 1416; GFX9-NEXT: v_mad_u32_u24 v1, s10, v3, v1 1417; GFX9-NEXT: v_mad_u32_u24 v1, s9, v4, v1 1418; GFX9-NEXT: v_mad_u32_u24 v1, s8, v5, v1 1419; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 1420; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 1421; GFX9-NEXT: v_mad_u32_u24 v1, s7, v6, v1 1422; GFX9-NEXT: v_mad_u32_u24 v1, s6, v7, v1 1423; GFX9-NEXT: v_mad_u32_u24 v1, s5, v8, v1 1424; GFX9-NEXT: v_mov_b32_e32 v2, s11 1425; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1 1426; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 1427; GFX9-NEXT: global_store_byte v0, v1, s[0:1] 1428; GFX9-NEXT: s_endpgm 1429; 1430; GFX9-DL-LABEL: udot8_CommutationInsideMAD: 1431; GFX9-DL: ; %bb.0: ; %entry 1432; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1433; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1434; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1435; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 1436; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 1437; GFX9-DL-NEXT: s_mov_b32 s22, -1 1438; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1439; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] 1440; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 1441; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 1442; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 1443; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 1444; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 1445; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1446; GFX9-DL-NEXT: s_and_b32 s10, s2, 15 1447; GFX9-DL-NEXT: s_and_b32 s17, s3, 15 1448; GFX9-DL-NEXT: s_bfe_u32 s16, s3, 0x40004 1449; GFX9-DL-NEXT: v_mov_b32_e32 v3, s17 1450; 
GFX9-DL-NEXT: s_bfe_u32 s12, s3, 0x40018 1451; GFX9-DL-NEXT: s_bfe_u32 s13, s3, 0x40014 1452; GFX9-DL-NEXT: s_bfe_u32 s14, s3, 0x40010 1453; GFX9-DL-NEXT: s_bfe_u32 s15, s3, 0x40008 1454; GFX9-DL-NEXT: s_lshr_b32 s11, s3, 28 1455; GFX9-DL-NEXT: s_bfe_u32 s3, s3, 0x4000c 1456; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40004 1457; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 1458; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28 1459; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40018 1460; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40014 1461; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40010 1462; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40008 1463; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x4000c 1464; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 1465; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 1466; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, s2, v2 1467; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 1468; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 1469; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 1470; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 1471; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 1472; GFX9-DL-NEXT: v_mad_u32_u24 v1, s10, v3, v1 1473; GFX9-DL-NEXT: v_mad_u32_u24 v1, s9, v4, v1 1474; GFX9-DL-NEXT: v_mad_u32_u24 v1, s8, v5, v1 1475; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 1476; GFX9-DL-NEXT: v_add_u32_e32 v1, v2, v1 1477; GFX9-DL-NEXT: v_mad_u32_u24 v1, s7, v6, v1 1478; GFX9-DL-NEXT: v_mad_u32_u24 v1, s6, v7, v1 1479; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v8, v1 1480; GFX9-DL-NEXT: v_mov_b32_e32 v2, s11 1481; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 1482; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 1483; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] 1484; GFX9-DL-NEXT: s_endpgm 1485; 1486; GFX10-DL-LABEL: udot8_CommutationInsideMAD: 1487; GFX10-DL: ; %bb.0: ; %entry 1488; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 1489; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 1490; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 1491; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 1492; GFX10-DL-NEXT: s_mov_b32 s10, -1 1493; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 1494; GFX10-DL-NEXT: s_add_u32 s8, s8, 
s3 1495; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1496; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 1497; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1498; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] 1499; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 1500; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 1501; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1502; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 1503; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 1504; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 1505; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40008 1506; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 1507; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 1508; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004 1509; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40004 1510; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 1511; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x4000c 1512; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x4000c 1513; GFX10-DL-NEXT: v_mul_u32_u24_e64 v2, s2, s3 1514; GFX10-DL-NEXT: v_mad_u32_u24 v1, s6, s7, v1 1515; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 1516; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 1517; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 1518; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1 1519; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v2, v1 1520; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 1521; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40014 1522; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40014 1523; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 1524; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 1525; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 1526; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 1527; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 1528; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 1529; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 1530; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1 1531; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5] 1532; GFX10-DL-NEXT: s_endpgm 1533 <8 x i4> addrspace(1)* %src2, 1534 i4 addrspace(1)* nocapture %dst) { 1535entry: 1536 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1 1537 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2 1538 1539 
%v1e0 = extractelement <8 x i4> %vec1, i64 0 1540 %v2e0 = extractelement <8 x i4> %vec2, i64 0 1541 %mul0 = mul nuw nsw i4 %v1e0, %v2e0 1542 1543 %v1e1 = extractelement <8 x i4> %vec1, i64 1 1544 %v2e1 = extractelement <8 x i4> %vec2, i64 1 1545 %mul1 = mul nuw nsw i4 %v1e1, %v2e1 1546 1547 %v1e2 = extractelement <8 x i4> %vec1, i64 2 1548 %v2e2 = extractelement <8 x i4> %vec2, i64 2 1549 %mul2 = mul nuw nsw i4 %v1e2, %v2e2 1550 1551 %v1e3 = extractelement <8 x i4> %vec1, i64 3 1552 %v2e3 = extractelement <8 x i4> %vec2, i64 3 1553 %mul3 = mul nuw nsw i4 %v1e3, %v2e3 1554 1555 %v1e4 = extractelement <8 x i4> %vec1, i64 4 1556 %v2e4 = extractelement <8 x i4> %vec2, i64 4 1557 %mul4 = mul nuw nsw i4 %v1e4, %v2e4 1558 1559 %v1e5 = extractelement <8 x i4> %vec1, i64 5 1560 %v2e5 = extractelement <8 x i4> %vec2, i64 5 1561 %mul5 = mul nuw nsw i4 %v1e5, %v2e5 1562 1563 %v1e6 = extractelement <8 x i4> %vec1, i64 6 1564 %v2e6 = extractelement <8 x i4> %vec2, i64 6 1565 %mul6 = mul nuw nsw i4 %v1e6, %v2e6 1566 1567 %v1e7 = extractelement <8 x i4> %vec1, i64 7 1568 %v2e7 = extractelement <8 x i4> %vec2, i64 7 1569 %mul7 = mul nuw nsw i4 %v1e7, %v2e7 1570 1571 %acc = load i4, i4 addrspace(1)* %dst, align 4 1572 %add1 = add i4 %mul0, %acc 1573 %add2 = add i4 %mul1, %add1 1574 %add3 = add i4 %mul2, %add2 1575 %add4 = add i4 %mul3, %add3 1576 %add5 = add i4 %mul4, %add4 1577 %add6 = add i4 %mul5, %add5 1578 %add7 = add i4 %mul6, %add6 1579 %add8 = add i4 %mul7, %add7 1580 1581 store i4 %add8, i4 addrspace(1)* %dst, align 4 1582 ret void 1583} 1584 1585define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, 1586; GFX7-LABEL: udot8_multiuses_mul1: 1587; GFX7: ; %bb.0: ; %entry 1588; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1589; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1590; GFX7-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 1591; GFX7-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 1592; GFX7-NEXT: s_mov_b32 s26, -1 1593; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1594; 
GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 1595; GFX7-NEXT: s_load_dword s20, s[0:1], 0x0 1596; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 1597; GFX7-NEXT: s_mov_b32 s27, 0xe8f000 1598; GFX7-NEXT: s_add_u32 s24, s24, s3 1599; GFX7-NEXT: s_addc_u32 s25, s25, 0 1600; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1601; GFX7-NEXT: s_bfe_u32 s19, s6, 0x40004 1602; GFX7-NEXT: s_lshr_b32 s7, s6, 28 1603; GFX7-NEXT: s_bfe_u32 s14, s6, 0x40018 1604; GFX7-NEXT: s_bfe_u32 s15, s6, 0x40014 1605; GFX7-NEXT: s_bfe_u32 s16, s6, 0x40010 1606; GFX7-NEXT: s_bfe_u32 s17, s6, 0x4000c 1607; GFX7-NEXT: s_bfe_u32 s18, s6, 0x40008 1608; GFX7-NEXT: s_and_b32 s6, s6, 15 1609; GFX7-NEXT: s_lshr_b32 s5, s4, 28 1610; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40018 1611; GFX7-NEXT: s_bfe_u32 s9, s4, 0x40014 1612; GFX7-NEXT: s_bfe_u32 s10, s4, 0x40010 1613; GFX7-NEXT: s_bfe_u32 s11, s4, 0x4000c 1614; GFX7-NEXT: s_bfe_u32 s12, s4, 0x40008 1615; GFX7-NEXT: s_bfe_u32 s13, s4, 0x40004 1616; GFX7-NEXT: s_and_b32 s4, s4, 15 1617; GFX7-NEXT: v_mov_b32_e32 v0, s6 1618; GFX7-NEXT: v_mov_b32_e32 v1, s20 1619; GFX7-NEXT: v_mad_u32_u24 v1, s4, v0, v1 1620; GFX7-NEXT: v_mov_b32_e32 v2, s19 1621; GFX7-NEXT: v_mad_u32_u24 v0, s4, v0, v1 1622; GFX7-NEXT: v_mad_u32_u24 v1, s13, v2, v1 1623; GFX7-NEXT: v_mov_b32_e32 v2, s18 1624; GFX7-NEXT: v_mad_u32_u24 v1, s12, v2, v1 1625; GFX7-NEXT: v_mov_b32_e32 v2, s17 1626; GFX7-NEXT: v_mad_u32_u24 v1, s11, v2, v1 1627; GFX7-NEXT: v_mov_b32_e32 v2, s16 1628; GFX7-NEXT: v_mad_u32_u24 v1, s10, v2, v1 1629; GFX7-NEXT: v_mov_b32_e32 v2, s15 1630; GFX7-NEXT: v_mad_u32_u24 v1, s9, v2, v1 1631; GFX7-NEXT: v_mov_b32_e32 v2, s14 1632; GFX7-NEXT: v_mad_u32_u24 v1, s8, v2, v1 1633; GFX7-NEXT: v_mov_b32_e32 v2, s7 1634; GFX7-NEXT: v_mad_u32_u24 v1, s5, v2, v1 1635; GFX7-NEXT: s_mov_b32 s3, 0xf000 1636; GFX7-NEXT: s_mov_b32 s2, -1 1637; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0 1638; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 1639; GFX7-NEXT: s_endpgm 1640; 1641; GFX8-LABEL: udot8_multiuses_mul1: 1642; GFX8: ; 
%bb.0: ; %entry 1643; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1644; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1645; GFX8-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 1646; GFX8-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 1647; GFX8-NEXT: s_mov_b32 s22, -1 1648; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1649; GFX8-NEXT: s_load_dword s6, s[6:7], 0x0 1650; GFX8-NEXT: s_load_dword s18, s[0:1], 0x0 1651; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 1652; GFX8-NEXT: s_mov_b32 s23, 0xe80000 1653; GFX8-NEXT: s_add_u32 s20, s20, s3 1654; GFX8-NEXT: s_addc_u32 s21, s21, 0 1655; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1656; GFX8-NEXT: s_bfe_u32 s17, s6, 0x40004 1657; GFX8-NEXT: s_lshr_b32 s7, s6, 28 1658; GFX8-NEXT: s_bfe_u32 s12, s6, 0x40018 1659; GFX8-NEXT: s_bfe_u32 s13, s6, 0x40014 1660; GFX8-NEXT: s_bfe_u32 s14, s6, 0x40010 1661; GFX8-NEXT: s_bfe_u32 s15, s6, 0x4000c 1662; GFX8-NEXT: s_bfe_u32 s16, s6, 0x40008 1663; GFX8-NEXT: s_and_b32 s6, s6, 15 1664; GFX8-NEXT: s_lshr_b32 s3, s2, 28 1665; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40018 1666; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40014 1667; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40010 1668; GFX8-NEXT: s_bfe_u32 s9, s2, 0x4000c 1669; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40008 1670; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40004 1671; GFX8-NEXT: s_and_b32 s2, s2, 15 1672; GFX8-NEXT: v_mov_b32_e32 v0, s6 1673; GFX8-NEXT: v_mov_b32_e32 v1, s18 1674; GFX8-NEXT: v_mad_u32_u24 v1, s2, v0, v1 1675; GFX8-NEXT: v_mov_b32_e32 v2, s17 1676; GFX8-NEXT: v_mad_u32_u24 v0, s2, v0, v1 1677; GFX8-NEXT: v_mad_u32_u24 v1, s11, v2, v1 1678; GFX8-NEXT: v_mov_b32_e32 v2, s16 1679; GFX8-NEXT: v_mad_u32_u24 v1, s10, v2, v1 1680; GFX8-NEXT: v_mov_b32_e32 v2, s15 1681; GFX8-NEXT: v_mad_u32_u24 v1, s9, v2, v1 1682; GFX8-NEXT: v_mov_b32_e32 v2, s14 1683; GFX8-NEXT: v_mad_u32_u24 v1, s8, v2, v1 1684; GFX8-NEXT: v_mov_b32_e32 v2, s13 1685; GFX8-NEXT: v_mad_u32_u24 v1, s5, v2, v1 1686; GFX8-NEXT: v_mov_b32_e32 v2, s12 1687; GFX8-NEXT: v_mad_u32_u24 v1, s4, v2, v1 1688; GFX8-NEXT: v_mov_b32_e32 v2, s7 1689; 
GFX8-NEXT: v_mad_u32_u24 v1, s3, v2, v1 1690; GFX8-NEXT: v_add_u32_e32 v2, vcc, v1, v0 1691; GFX8-NEXT: v_mov_b32_e32 v0, s0 1692; GFX8-NEXT: v_mov_b32_e32 v1, s1 1693; GFX8-NEXT: flat_store_dword v[0:1], v2 1694; GFX8-NEXT: s_endpgm 1695; 1696; GFX9-LABEL: udot8_multiuses_mul1: 1697; GFX9: ; %bb.0: ; %entry 1698; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1699; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1700; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 1701; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 1702; GFX9-NEXT: s_mov_b32 s22, -1 1703; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1704; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 1705; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 1706; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 1707; GFX9-NEXT: s_mov_b32 s23, 0xe00000 1708; GFX9-NEXT: s_add_u32 s20, s20, s3 1709; GFX9-NEXT: s_addc_u32 s21, s21, 0 1710; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1711; GFX9-NEXT: s_bfe_u32 s17, s6, 0x40004 1712; GFX9-NEXT: s_lshr_b32 s7, s6, 28 1713; GFX9-NEXT: s_bfe_u32 s12, s6, 0x40018 1714; GFX9-NEXT: s_bfe_u32 s13, s6, 0x40014 1715; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40010 1716; GFX9-NEXT: s_bfe_u32 s15, s6, 0x4000c 1717; GFX9-NEXT: s_bfe_u32 s16, s6, 0x40008 1718; GFX9-NEXT: s_and_b32 s6, s6, 15 1719; GFX9-NEXT: s_lshr_b32 s3, s2, 28 1720; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40018 1721; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40014 1722; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010 1723; GFX9-NEXT: s_bfe_u32 s9, s2, 0x4000c 1724; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40008 1725; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40004 1726; GFX9-NEXT: s_and_b32 s2, s2, 15 1727; GFX9-NEXT: v_mov_b32_e32 v1, s6 1728; GFX9-NEXT: v_mov_b32_e32 v2, s18 1729; GFX9-NEXT: v_mad_u32_u24 v2, s2, v1, v2 1730; GFX9-NEXT: v_mov_b32_e32 v3, s17 1731; GFX9-NEXT: v_mad_u32_u24 v1, s2, v1, v2 1732; GFX9-NEXT: v_mad_u32_u24 v2, s11, v3, v2 1733; GFX9-NEXT: v_mov_b32_e32 v3, s16 1734; GFX9-NEXT: v_mad_u32_u24 v2, s10, v3, v2 1735; GFX9-NEXT: v_mov_b32_e32 v3, s15 1736; GFX9-NEXT: v_mad_u32_u24 v2, s9, v3, v2 
1737; GFX9-NEXT: v_mov_b32_e32 v3, s14 1738; GFX9-NEXT: v_mad_u32_u24 v2, s8, v3, v2 1739; GFX9-NEXT: v_mov_b32_e32 v3, s13 1740; GFX9-NEXT: v_mad_u32_u24 v2, s5, v3, v2 1741; GFX9-NEXT: v_mov_b32_e32 v3, s12 1742; GFX9-NEXT: v_mad_u32_u24 v2, s4, v3, v2 1743; GFX9-NEXT: v_mov_b32_e32 v3, s7 1744; GFX9-NEXT: v_mad_u32_u24 v2, s3, v3, v2 1745; GFX9-NEXT: v_mov_b32_e32 v0, 0 1746; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 1747; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1748; GFX9-NEXT: s_endpgm 1749; 1750; GFX9-DL-LABEL: udot8_multiuses_mul1: 1751; GFX9-DL: ; %bb.0: ; %entry 1752; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1753; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1754; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 1755; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 1756; GFX9-DL-NEXT: s_mov_b32 s22, -1 1757; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1758; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 1759; GFX9-DL-NEXT: s_load_dword s18, s[0:1], 0x0 1760; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 1761; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 1762; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 1763; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 1764; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1765; GFX9-DL-NEXT: s_bfe_u32 s17, s6, 0x40004 1766; GFX9-DL-NEXT: s_lshr_b32 s7, s6, 28 1767; GFX9-DL-NEXT: s_bfe_u32 s12, s6, 0x40018 1768; GFX9-DL-NEXT: s_bfe_u32 s13, s6, 0x40014 1769; GFX9-DL-NEXT: s_bfe_u32 s14, s6, 0x40010 1770; GFX9-DL-NEXT: s_bfe_u32 s15, s6, 0x4000c 1771; GFX9-DL-NEXT: s_bfe_u32 s16, s6, 0x40008 1772; GFX9-DL-NEXT: s_and_b32 s6, s6, 15 1773; GFX9-DL-NEXT: s_lshr_b32 s3, s2, 28 1774; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40018 1775; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40014 1776; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40010 1777; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x4000c 1778; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40008 1779; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40004 1780; GFX9-DL-NEXT: s_and_b32 s2, s2, 15 1781; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 1782; GFX9-DL-NEXT: v_mov_b32_e32 
v2, s18 1783; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v1, v2 1784; GFX9-DL-NEXT: v_mov_b32_e32 v3, s17 1785; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v1, v2 1786; GFX9-DL-NEXT: v_mad_u32_u24 v2, s11, v3, v2 1787; GFX9-DL-NEXT: v_mov_b32_e32 v3, s16 1788; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v3, v2 1789; GFX9-DL-NEXT: v_mov_b32_e32 v3, s15 1790; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v3, v2 1791; GFX9-DL-NEXT: v_mov_b32_e32 v3, s14 1792; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v3, v2 1793; GFX9-DL-NEXT: v_mov_b32_e32 v3, s13 1794; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v3, v2 1795; GFX9-DL-NEXT: v_mov_b32_e32 v3, s12 1796; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v3, v2 1797; GFX9-DL-NEXT: v_mov_b32_e32 v3, s7 1798; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v3, v2 1799; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1800; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v2 1801; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] 1802; GFX9-DL-NEXT: s_endpgm 1803; 1804; GFX10-DL-LABEL: udot8_multiuses_mul1: 1805; GFX10-DL: ; %bb.0: ; %entry 1806; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 1807; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 1808; GFX10-DL-NEXT: s_mov_b32 s10, -1 1809; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 1810; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 1811; GFX10-DL-NEXT: s_clause 0x1 1812; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 1813; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1814; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 1815; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 1816; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1817; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 1818; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 1819; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 1820; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1821; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 1822; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 1823; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 1824; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40004 1825; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40004 1826; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0 1827; GFX10-DL-NEXT: 
v_mad_u32_u24 v1, s6, s7, v0 1828; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 1829; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40008 1830; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0 1831; GFX10-DL-NEXT: v_mad_u32_u24 v1, s6, s7, v1 1832; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x4000c 1833; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x4000c 1834; GFX10-DL-NEXT: v_mad_u32_u24 v1, s6, s7, v1 1835; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40010 1836; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40010 1837; GFX10-DL-NEXT: v_mad_u32_u24 v1, s6, s7, v1 1838; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40014 1839; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40014 1840; GFX10-DL-NEXT: v_mad_u32_u24 v1, s6, s7, v1 1841; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40018 1842; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40018 1843; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 1844; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 1845; GFX10-DL-NEXT: v_mad_u32_u24 v1, s6, s7, v1 1846; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 1847; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, v0, v1 1848; GFX10-DL-NEXT: global_store_dword v2, v0, s[4:5] 1849; GFX10-DL-NEXT: s_endpgm 1850 <8 x i4> addrspace(1)* %src2, 1851 i32 addrspace(1)* nocapture %dst) { 1852entry: 1853 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1 1854 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2 1855 1856 %v1e0 = extractelement <8 x i4> %vec1, i64 0 1857 %cv1e0 = zext i4 %v1e0 to i32 1858 %v2e0 = extractelement <8 x i4> %vec2, i64 0 1859 %cv2e0 = zext i4 %v2e0 to i32 1860 %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0 1861 1862 %v1e1 = extractelement <8 x i4> %vec1, i64 1 1863 %cv1e1 = zext i4 %v1e1 to i32 1864 %v2e1 = extractelement <8 x i4> %vec2, i64 1 1865 %cv2e1 = zext i4 %v2e1 to i32 1866 %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1 1867 1868 %v1e2 = extractelement <8 x i4> %vec1, i64 2 1869 %cv1e2 = zext i4 %v1e2 to i32 1870 %v2e2 = extractelement <8 x i4> %vec2, i64 2 1871 %cv2e2 = zext i4 %v2e2 to i32 1872 %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2 1873 1874 %v1e3 = extractelement <8 x i4> %vec1, i64 3 1875 %cv1e3 = 
zext i4 %v1e3 to i32 1876 %v2e3 = extractelement <8 x i4> %vec2, i64 3 1877 %cv2e3 = zext i4 %v2e3 to i32 1878 %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3 1879 1880 %v1e4 = extractelement <8 x i4> %vec1, i64 4 1881 %cv1e4 = zext i4 %v1e4 to i32 1882 %v2e4 = extractelement <8 x i4> %vec2, i64 4 1883 %cv2e4 = zext i4 %v2e4 to i32 1884 %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4 1885 1886 %v1e5 = extractelement <8 x i4> %vec1, i64 5 1887 %cv1e5 = zext i4 %v1e5 to i32 1888 %v2e5 = extractelement <8 x i4> %vec2, i64 5 1889 %cv2e5 = zext i4 %v2e5 to i32 1890 %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5 1891 1892 %v1e6 = extractelement <8 x i4> %vec1, i64 6 1893 %cv1e6 = zext i4 %v1e6 to i32 1894 %v2e6 = extractelement <8 x i4> %vec2, i64 6 1895 %cv2e6 = zext i4 %v2e6 to i32 1896 %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6 1897 1898 %v1e7 = extractelement <8 x i4> %vec1, i64 7 1899 %cv1e7 = zext i4 %v1e7 to i32 1900 %v2e7 = extractelement <8 x i4> %vec2, i64 7 1901 %cv2e7 = zext i4 %v2e7 to i32 1902 %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7 1903 1904 %acc = load i32, i32 addrspace(1)* %dst, align 4 1905 %add1 = add i32 %mul0, %acc 1906 %add = add i32 %mul0, %add1 1907 %add2 = add i32 %add1, %mul1 1908 %add3 = add i32 %add2, %mul2 1909 %add4 = add i32 %add3, %mul3 1910 %add5 = add i32 %add4, %mul4 1911 %add6 = add i32 %add5, %mul5 1912 %add7 = add i32 %add6, %mul6 1913 %add8 = add i32 %add7, %mul7 1914 1915 %res = add i32 %add, %add8 1916 store i32 %res, i32 addrspace(1)* %dst, align 4 1917 ret void 1918} 1919 1920define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, 1921; GFX7-LABEL: udot8_acc32_vecMul: 1922; GFX7: ; %bb.0: ; %entry 1923; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1924; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1925; GFX7-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 1926; GFX7-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 1927; GFX7-NEXT: s_mov_b32 s26, -1 1928; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1929; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 1930; GFX7-NEXT: 
s_load_dword s20, s[0:1], 0x0 1931; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 1932; GFX7-NEXT: s_mov_b32 s27, 0xe8f000 1933; GFX7-NEXT: s_add_u32 s24, s24, s3 1934; GFX7-NEXT: s_addc_u32 s25, s25, 0 1935; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1936; GFX7-NEXT: s_lshr_b32 s7, s6, 28 1937; GFX7-NEXT: s_bfe_u32 s14, s6, 0x40018 1938; GFX7-NEXT: s_bfe_u32 s15, s6, 0x40014 1939; GFX7-NEXT: s_bfe_u32 s16, s6, 0x40010 1940; GFX7-NEXT: s_bfe_u32 s17, s6, 0x4000c 1941; GFX7-NEXT: s_bfe_u32 s18, s6, 0x40008 1942; GFX7-NEXT: s_bfe_u32 s19, s6, 0x40004 1943; GFX7-NEXT: s_and_b32 s6, s6, 15 1944; GFX7-NEXT: s_lshr_b32 s5, s4, 28 1945; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40018 1946; GFX7-NEXT: s_bfe_u32 s9, s4, 0x40014 1947; GFX7-NEXT: s_bfe_u32 s10, s4, 0x40010 1948; GFX7-NEXT: s_bfe_u32 s11, s4, 0x4000c 1949; GFX7-NEXT: s_bfe_u32 s12, s4, 0x40008 1950; GFX7-NEXT: s_bfe_u32 s13, s4, 0x40004 1951; GFX7-NEXT: s_and_b32 s4, s4, 15 1952; GFX7-NEXT: v_mov_b32_e32 v0, s6 1953; GFX7-NEXT: v_mov_b32_e32 v1, s20 1954; GFX7-NEXT: v_mad_u32_u24 v0, s4, v0, v1 1955; GFX7-NEXT: v_mov_b32_e32 v1, s19 1956; GFX7-NEXT: v_mad_u32_u24 v0, s13, v1, v0 1957; GFX7-NEXT: v_mov_b32_e32 v1, s18 1958; GFX7-NEXT: v_mad_u32_u24 v0, s12, v1, v0 1959; GFX7-NEXT: v_mov_b32_e32 v1, s17 1960; GFX7-NEXT: v_mad_u32_u24 v0, s11, v1, v0 1961; GFX7-NEXT: v_mov_b32_e32 v1, s16 1962; GFX7-NEXT: v_mad_u32_u24 v0, s10, v1, v0 1963; GFX7-NEXT: v_mov_b32_e32 v1, s15 1964; GFX7-NEXT: v_mad_u32_u24 v0, s9, v1, v0 1965; GFX7-NEXT: v_mov_b32_e32 v1, s14 1966; GFX7-NEXT: v_mad_u32_u24 v0, s8, v1, v0 1967; GFX7-NEXT: v_mov_b32_e32 v1, s7 1968; GFX7-NEXT: s_mov_b32 s3, 0xf000 1969; GFX7-NEXT: s_mov_b32 s2, -1 1970; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0 1971; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 1972; GFX7-NEXT: s_endpgm 1973; 1974; GFX8-LABEL: udot8_acc32_vecMul: 1975; GFX8: ; %bb.0: ; %entry 1976; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1977; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1978; GFX8-NEXT: s_mov_b32 
s20, SCRATCH_RSRC_DWORD0 1979; GFX8-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 1980; GFX8-NEXT: s_mov_b32 s22, -1 1981; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1982; GFX8-NEXT: s_load_dword s6, s[6:7], 0x0 1983; GFX8-NEXT: s_load_dword s18, s[0:1], 0x0 1984; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 1985; GFX8-NEXT: s_mov_b32 s23, 0xe80000 1986; GFX8-NEXT: s_add_u32 s20, s20, s3 1987; GFX8-NEXT: s_addc_u32 s21, s21, 0 1988; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1989; GFX8-NEXT: s_lshr_b32 s7, s6, 28 1990; GFX8-NEXT: s_bfe_u32 s12, s6, 0x40018 1991; GFX8-NEXT: s_bfe_u32 s13, s6, 0x40014 1992; GFX8-NEXT: s_bfe_u32 s14, s6, 0x40010 1993; GFX8-NEXT: s_bfe_u32 s15, s6, 0x4000c 1994; GFX8-NEXT: s_bfe_u32 s16, s6, 0x40008 1995; GFX8-NEXT: s_bfe_u32 s17, s6, 0x40004 1996; GFX8-NEXT: s_and_b32 s6, s6, 15 1997; GFX8-NEXT: s_lshr_b32 s3, s2, 28 1998; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40018 1999; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40014 2000; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40010 2001; GFX8-NEXT: s_bfe_u32 s9, s2, 0x4000c 2002; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40008 2003; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40004 2004; GFX8-NEXT: s_and_b32 s2, s2, 15 2005; GFX8-NEXT: v_mov_b32_e32 v0, s6 2006; GFX8-NEXT: v_mov_b32_e32 v1, s18 2007; GFX8-NEXT: v_mad_u32_u24 v0, s2, v0, v1 2008; GFX8-NEXT: v_mov_b32_e32 v1, s17 2009; GFX8-NEXT: v_mad_u32_u24 v0, s11, v1, v0 2010; GFX8-NEXT: v_mov_b32_e32 v1, s16 2011; GFX8-NEXT: v_mad_u32_u24 v0, s10, v1, v0 2012; GFX8-NEXT: v_mov_b32_e32 v1, s15 2013; GFX8-NEXT: v_mad_u32_u24 v0, s9, v1, v0 2014; GFX8-NEXT: v_mov_b32_e32 v1, s14 2015; GFX8-NEXT: v_mad_u32_u24 v0, s8, v1, v0 2016; GFX8-NEXT: v_mov_b32_e32 v1, s13 2017; GFX8-NEXT: v_mad_u32_u24 v0, s5, v1, v0 2018; GFX8-NEXT: v_mov_b32_e32 v1, s12 2019; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0 2020; GFX8-NEXT: v_mov_b32_e32 v1, s7 2021; GFX8-NEXT: v_mad_u32_u24 v2, s3, v1, v0 2022; GFX8-NEXT: v_mov_b32_e32 v0, s0 2023; GFX8-NEXT: v_mov_b32_e32 v1, s1 2024; GFX8-NEXT: flat_store_dword v[0:1], v2 2025; GFX8-NEXT: s_endpgm 2026; 
2027; GFX9-LABEL: udot8_acc32_vecMul: 2028; GFX9: ; %bb.0: ; %entry 2029; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2030; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 2031; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 2032; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 2033; GFX9-NEXT: s_mov_b32 s22, -1 2034; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2035; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 2036; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 2037; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 2038; GFX9-NEXT: s_mov_b32 s23, 0xe00000 2039; GFX9-NEXT: s_add_u32 s20, s20, s3 2040; GFX9-NEXT: s_addc_u32 s21, s21, 0 2041; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2042; GFX9-NEXT: s_lshr_b32 s7, s6, 28 2043; GFX9-NEXT: s_bfe_u32 s12, s6, 0x40018 2044; GFX9-NEXT: s_bfe_u32 s13, s6, 0x40014 2045; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40010 2046; GFX9-NEXT: s_bfe_u32 s15, s6, 0x4000c 2047; GFX9-NEXT: s_bfe_u32 s16, s6, 0x40008 2048; GFX9-NEXT: s_bfe_u32 s17, s6, 0x40004 2049; GFX9-NEXT: s_and_b32 s6, s6, 15 2050; GFX9-NEXT: s_lshr_b32 s3, s2, 28 2051; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40018 2052; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40014 2053; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010 2054; GFX9-NEXT: s_bfe_u32 s9, s2, 0x4000c 2055; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40008 2056; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40004 2057; GFX9-NEXT: s_and_b32 s2, s2, 15 2058; GFX9-NEXT: v_mov_b32_e32 v1, s6 2059; GFX9-NEXT: v_mov_b32_e32 v2, s18 2060; GFX9-NEXT: v_mad_u32_u24 v1, s2, v1, v2 2061; GFX9-NEXT: v_mov_b32_e32 v2, s17 2062; GFX9-NEXT: v_mad_u32_u24 v1, s11, v2, v1 2063; GFX9-NEXT: v_mov_b32_e32 v2, s16 2064; GFX9-NEXT: v_mad_u32_u24 v1, s10, v2, v1 2065; GFX9-NEXT: v_mov_b32_e32 v2, s15 2066; GFX9-NEXT: v_mad_u32_u24 v1, s9, v2, v1 2067; GFX9-NEXT: v_mov_b32_e32 v2, s14 2068; GFX9-NEXT: v_mad_u32_u24 v1, s8, v2, v1 2069; GFX9-NEXT: v_mov_b32_e32 v2, s13 2070; GFX9-NEXT: v_mad_u32_u24 v1, s5, v2, v1 2071; GFX9-NEXT: v_mov_b32_e32 v2, s12 2072; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1 2073; GFX9-NEXT: v_mov_b32_e32 v2, s7 
2074; GFX9-NEXT: v_mov_b32_e32 v0, 0 2075; GFX9-NEXT: v_mad_u32_u24 v1, s3, v2, v1 2076; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 2077; GFX9-NEXT: s_endpgm 2078; 2079; GFX9-DL-LABEL: udot8_acc32_vecMul: 2080; GFX9-DL: ; %bb.0: ; %entry 2081; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2082; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 2083; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 2084; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 2085; GFX9-DL-NEXT: s_mov_b32 s10, -1 2086; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 2087; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 2088; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2089; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 2090; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 2091; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 2092; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 2093; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 2094; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2095; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 2096; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 2097; GFX9-DL-NEXT: v_dot8_u32_u4 v1, s4, v1, v2 2098; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] 2099; GFX9-DL-NEXT: s_endpgm 2100; 2101; GFX10-DL-LABEL: udot8_acc32_vecMul: 2102; GFX10-DL: ; %bb.0: ; %entry 2103; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 2104; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 2105; GFX10-DL-NEXT: s_mov_b32 s10, -1 2106; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 2107; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 2108; GFX10-DL-NEXT: s_clause 0x1 2109; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 2110; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2111; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 2112; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 2113; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2114; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 2115; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 2116; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 2117; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2118; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 2119; GFX10-DL-NEXT: v_dot8_u32_u4 v0, s0, s1, v0 2120; 
GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] 2121; GFX10-DL-NEXT: s_endpgm 2122 <8 x i4> addrspace(1)* %src2, 2123 i32 addrspace(1)* nocapture %dst) { 2124entry: 2125 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1 2126 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2 2127 2128 %cvec1 = zext <8 x i4> %vec1 to <8 x i32> 2129 %cvec2 = zext <8 x i4> %vec2 to <8 x i32> 2130 2131 %mul = mul <8 x i32> %cvec1, %cvec2 2132 %mul0 = extractelement <8 x i32> %mul, i64 0 2133 %mul1 = extractelement <8 x i32> %mul, i64 1 2134 %mul2 = extractelement <8 x i32> %mul, i64 2 2135 %mul3 = extractelement <8 x i32> %mul, i64 3 2136 %mul4 = extractelement <8 x i32> %mul, i64 4 2137 %mul5 = extractelement <8 x i32> %mul, i64 5 2138 %mul6 = extractelement <8 x i32> %mul, i64 6 2139 %mul7 = extractelement <8 x i32> %mul, i64 7 2140 2141 %acc = load i32, i32 addrspace(1)* %dst, align 4 2142 %add1 = add i32 %mul0, %acc 2143 %add2 = add i32 %add1, %mul1 2144 %add3 = add i32 %add2, %mul2 2145 %add4 = add i32 %add3, %mul3 2146 %add5 = add i32 %add4, %mul4 2147 %add6 = add i32 %add5, %mul5 2148 %add7 = add i32 %add6, %mul6 2149 %add8 = add i32 %add7, %mul7 2150 2151 store i32 %add8, i32 addrspace(1)* %dst, align 4 2152 ret void 2153} 2154 2155; TODO: Clean up the code(by default pk_mad_I16 should be generated), then 2156; support the pattern. 
2157define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, 2158; GFX7-LABEL: udot8_acc16_vecMul: 2159; GFX7: ; %bb.0: ; %entry 2160; GFX7-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 2161; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 2162; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 2163; GFX7-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 2164; GFX7-NEXT: s_mov_b32 s22, -1 2165; GFX7-NEXT: s_mov_b32 s23, 0xe8f000 2166; GFX7-NEXT: s_add_u32 s20, s20, s3 2167; GFX7-NEXT: s_mov_b32 s3, 0xf000 2168; GFX7-NEXT: s_mov_b32 s2, -1 2169; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2170; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 2171; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 2172; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 2173; GFX7-NEXT: s_addc_u32 s21, s21, 0 2174; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2175; GFX7-NEXT: s_bfe_u32 s10, s4, 0x4000c 2176; GFX7-NEXT: s_bfe_u32 s17, s5, 0x4000c 2177; GFX7-NEXT: s_bfe_u32 s19, s5, 0x40004 2178; GFX7-NEXT: v_mov_b32_e32 v4, s17 2179; GFX7-NEXT: s_bfe_u32 s14, s5, 0x40018 2180; GFX7-NEXT: s_bfe_u32 s15, s5, 0x40014 2181; GFX7-NEXT: s_bfe_u32 s16, s5, 0x40010 2182; GFX7-NEXT: s_bfe_u32 s18, s5, 0x40008 2183; GFX7-NEXT: s_lshr_b32 s13, s5, 28 2184; GFX7-NEXT: s_and_b32 s5, s5, 15 2185; GFX7-NEXT: s_bfe_u32 s12, s4, 0x40004 2186; GFX7-NEXT: v_mov_b32_e32 v2, s19 2187; GFX7-NEXT: v_mul_u32_u24_e32 v2, s12, v2 2188; GFX7-NEXT: v_mul_u32_u24_e32 v4, s10, v4 2189; GFX7-NEXT: s_lshr_b32 s6, s4, 28 2190; GFX7-NEXT: s_bfe_u32 s7, s4, 0x40018 2191; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40014 2192; GFX7-NEXT: s_bfe_u32 s9, s4, 0x40010 2193; GFX7-NEXT: s_bfe_u32 s11, s4, 0x40008 2194; GFX7-NEXT: v_mov_b32_e32 v3, s18 2195; GFX7-NEXT: s_and_b32 s4, s4, 15 2196; GFX7-NEXT: v_mov_b32_e32 v1, s5 2197; GFX7-NEXT: v_mul_u32_u24_e32 v1, s4, v1 2198; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 2199; GFX7-NEXT: v_mul_u32_u24_e32 v3, s11, v3 2200; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 2201; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 2202; GFX7-NEXT: v_or_b32_e32 v1, v1, 
v2 2203; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16 2204; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 2205; GFX7-NEXT: v_mov_b32_e32 v5, s16 2206; GFX7-NEXT: v_mov_b32_e32 v6, s15 2207; GFX7-NEXT: v_mov_b32_e32 v7, s14 2208; GFX7-NEXT: s_waitcnt vmcnt(0) 2209; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 2210; GFX7-NEXT: v_add_i32_e32 v0, vcc, v2, v0 2211; GFX7-NEXT: v_add_i32_e32 v0, vcc, v3, v0 2212; GFX7-NEXT: v_add_i32_e32 v0, vcc, v4, v0 2213; GFX7-NEXT: v_mad_u32_u24 v0, s9, v5, v0 2214; GFX7-NEXT: v_mad_u32_u24 v0, s8, v6, v0 2215; GFX7-NEXT: v_mad_u32_u24 v0, s7, v7, v0 2216; GFX7-NEXT: v_mov_b32_e32 v1, s13 2217; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 2218; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 2219; GFX7-NEXT: s_endpgm 2220; 2221; GFX8-LABEL: udot8_acc16_vecMul: 2222; GFX8: ; %bb.0: ; %entry 2223; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2224; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 2225; GFX8-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 2226; GFX8-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 2227; GFX8-NEXT: s_mov_b32 s18, -1 2228; GFX8-NEXT: s_mov_b32 s19, 0xe80000 2229; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2230; GFX8-NEXT: v_mov_b32_e32 v0, s0 2231; GFX8-NEXT: v_mov_b32_e32 v1, s1 2232; GFX8-NEXT: flat_load_ushort v2, v[0:1] 2233; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 2234; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 2235; GFX8-NEXT: s_add_u32 s16, s16, s3 2236; GFX8-NEXT: s_addc_u32 s17, s17, 0 2237; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2238; GFX8-NEXT: s_lshr_b32 s2, s0, 28 2239; GFX8-NEXT: s_bfe_u32 s10, s1, 0x40018 2240; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40014 2241; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40010 2242; GFX8-NEXT: s_bfe_u32 s13, s1, 0x4000c 2243; GFX8-NEXT: s_bfe_u32 s14, s1, 0x40008 2244; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40004 2245; GFX8-NEXT: s_lshr_b32 s9, s1, 28 2246; GFX8-NEXT: s_and_b32 s1, s1, 15 2247; GFX8-NEXT: s_bfe_u32 s3, s0, 0x40018 2248; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40014 2249; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40010 2250; GFX8-NEXT: 
s_bfe_u32 s6, s0, 0x4000c 2251; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40008 2252; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40004 2253; GFX8-NEXT: s_and_b32 s0, s0, 15 2254; GFX8-NEXT: v_mov_b32_e32 v3, s1 2255; GFX8-NEXT: v_mov_b32_e32 v4, s15 2256; GFX8-NEXT: v_mov_b32_e32 v5, s14 2257; GFX8-NEXT: v_mov_b32_e32 v6, s13 2258; GFX8-NEXT: v_mov_b32_e32 v7, s12 2259; GFX8-NEXT: v_mov_b32_e32 v8, s11 2260; GFX8-NEXT: v_mov_b32_e32 v9, s10 2261; GFX8-NEXT: s_waitcnt vmcnt(0) 2262; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 2263; GFX8-NEXT: v_mad_u32_u24 v2, s8, v4, v2 2264; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2 2265; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2 2266; GFX8-NEXT: v_mad_u32_u24 v2, s6, v6, v2 2267; GFX8-NEXT: v_mad_u32_u24 v2, s5, v7, v2 2268; GFX8-NEXT: v_mad_u32_u24 v2, s4, v8, v2 2269; GFX8-NEXT: v_mad_u32_u24 v2, s3, v9, v2 2270; GFX8-NEXT: v_mov_b32_e32 v3, s9 2271; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 2272; GFX8-NEXT: flat_store_short v[0:1], v2 2273; GFX8-NEXT: s_endpgm 2274; 2275; GFX9-LABEL: udot8_acc16_vecMul: 2276; GFX9: ; %bb.0: ; %entry 2277; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2278; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 2279; GFX9-NEXT: v_mov_b32_e32 v0, 0 2280; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 2281; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 2282; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2283; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 2284; GFX9-NEXT: global_load_ushort v5, v0, s[0:1] 2285; GFX9-NEXT: s_mov_b32 s22, -1 2286; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 2287; GFX9-NEXT: s_mov_b32 s23, 0xe00000 2288; GFX9-NEXT: s_add_u32 s20, s20, s3 2289; GFX9-NEXT: s_addc_u32 s21, s21, 0 2290; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2291; GFX9-NEXT: s_bfe_u32 s7, s6, 0x40018 2292; GFX9-NEXT: s_lshr_b32 s12, s6, 28 2293; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s12 2294; GFX9-NEXT: s_bfe_u32 s3, s2, 0x40018 2295; GFX9-NEXT: s_lshr_b32 s4, s2, 28 2296; GFX9-NEXT: s_bfe_u32 s13, s6, 0x40010 2297; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40014 2298; GFX9-NEXT: 
s_pack_ll_b32_b16 s3, s3, s4 2299; GFX9-NEXT: v_mov_b32_e32 v1, s7 2300; GFX9-NEXT: v_pk_mul_lo_u16 v1, s3, v1 2301; GFX9-NEXT: s_pack_ll_b32_b16 s3, s13, s14 2302; GFX9-NEXT: s_bfe_u32 s15, s6, 0x40008 2303; GFX9-NEXT: s_bfe_u32 s16, s6, 0x4000c 2304; GFX9-NEXT: s_and_b32 s17, s6, 15 2305; GFX9-NEXT: v_mov_b32_e32 v2, s3 2306; GFX9-NEXT: s_pack_ll_b32_b16 s3, s15, s16 2307; GFX9-NEXT: s_bfe_u32 s6, s6, 0x40004 2308; GFX9-NEXT: v_mov_b32_e32 v3, s3 2309; GFX9-NEXT: s_pack_ll_b32_b16 s3, s17, s6 2310; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40010 2311; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 2312; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 2313; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c 2314; GFX9-NEXT: s_and_b32 s11, s2, 15 2315; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40004 2316; GFX9-NEXT: s_pack_ll_b32_b16 s2, s11, s2 2317; GFX9-NEXT: v_mov_b32_e32 v4, s3 2318; GFX9-NEXT: s_pack_ll_b32_b16 s4, s5, s8 2319; GFX9-NEXT: v_pk_mul_lo_u16 v4, s2, v4 2320; GFX9-NEXT: v_pk_mul_lo_u16 v2, s4, v2 2321; GFX9-NEXT: s_pack_ll_b32_b16 s4, s9, s10 2322; GFX9-NEXT: v_pk_mul_lo_u16 v3, s4, v3 2323; GFX9-NEXT: s_waitcnt vmcnt(0) 2324; GFX9-NEXT: v_add_u32_e32 v5, v4, v5 2325; GFX9-NEXT: v_add_u32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2326; GFX9-NEXT: v_add_u32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 2327; GFX9-NEXT: v_add_u32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2328; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 2329; GFX9-NEXT: v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2330; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 2331; GFX9-NEXT: v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2332; GFX9-NEXT: global_store_short v0, v1, s[0:1] 2333; GFX9-NEXT: s_endpgm 2334; 2335; GFX9-DL-LABEL: udot8_acc16_vecMul: 2336; GFX9-DL: ; %bb.0: ; %entry 2337; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2338; 
GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 2339; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 2340; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 2341; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 2342; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2343; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 2344; GFX9-DL-NEXT: global_load_ushort v5, v0, s[0:1] 2345; GFX9-DL-NEXT: s_mov_b32 s22, -1 2346; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 2347; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 2348; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 2349; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 2350; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2351; GFX9-DL-NEXT: s_bfe_u32 s7, s6, 0x40018 2352; GFX9-DL-NEXT: s_lshr_b32 s12, s6, 28 2353; GFX9-DL-NEXT: s_pack_ll_b32_b16 s7, s7, s12 2354; GFX9-DL-NEXT: s_bfe_u32 s3, s2, 0x40018 2355; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28 2356; GFX9-DL-NEXT: s_bfe_u32 s13, s6, 0x40010 2357; GFX9-DL-NEXT: s_bfe_u32 s14, s6, 0x40014 2358; GFX9-DL-NEXT: s_pack_ll_b32_b16 s3, s3, s4 2359; GFX9-DL-NEXT: v_mov_b32_e32 v1, s7 2360; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, s3, v1 2361; GFX9-DL-NEXT: s_pack_ll_b32_b16 s3, s13, s14 2362; GFX9-DL-NEXT: s_bfe_u32 s15, s6, 0x40008 2363; GFX9-DL-NEXT: s_bfe_u32 s16, s6, 0x4000c 2364; GFX9-DL-NEXT: s_and_b32 s17, s6, 15 2365; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 2366; GFX9-DL-NEXT: s_pack_ll_b32_b16 s3, s15, s16 2367; GFX9-DL-NEXT: s_bfe_u32 s6, s6, 0x40004 2368; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 2369; GFX9-DL-NEXT: s_pack_ll_b32_b16 s3, s17, s6 2370; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40010 2371; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 2372; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40008 2373; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x4000c 2374; GFX9-DL-NEXT: s_and_b32 s11, s2, 15 2375; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40004 2376; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s11, s2 2377; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3 2378; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s5, s8 2379; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, s2, v4 2380; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, s4, v2 2381; 
GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s9, s10 2382; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, s4, v3 2383; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 2384; GFX9-DL-NEXT: v_add_u32_e32 v5, v4, v5 2385; GFX9-DL-NEXT: v_add_u32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2386; GFX9-DL-NEXT: v_add_u32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 2387; GFX9-DL-NEXT: v_add_u32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2388; GFX9-DL-NEXT: v_add_u32_e32 v3, v3, v2 2389; GFX9-DL-NEXT: v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2390; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v1 2391; GFX9-DL-NEXT: v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2392; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] 2393; GFX9-DL-NEXT: s_endpgm 2394; 2395; GFX10-DL-LABEL: udot8_acc16_vecMul: 2396; GFX10-DL: ; %bb.0: ; %entry 2397; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 2398; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 2399; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 2400; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 2401; GFX10-DL-NEXT: s_mov_b32 s10, -1 2402; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 2403; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 2404; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2405; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 2406; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2407; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5] 2408; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 2409; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 2410; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2411; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 2412; GFX10-DL-NEXT: s_bfe_u32 s7, s0, 0x40004 2413; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 2414; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40004 2415; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s7 2416; GFX10-DL-NEXT: s_pack_ll_b32_b16 s3, s3, s6 2417; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40008 
2418; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, s2, s3 2419; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x4000c 2420; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 2421; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x4000c 2422; GFX10-DL-NEXT: s_pack_ll_b32_b16 s6, s6, s7 2423; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s3 2424; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x40014 2425; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s2, s6 2426; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 2427; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40010 2428; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40014 2429; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s3 2430; GFX10-DL-NEXT: s_pack_ll_b32_b16 s6, s6, s7 2431; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 2432; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 2433; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s3, s1 2434; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 2435; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v2, v1 2436; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2437; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, s2, s6 2438; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 2439; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 2440; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 2441; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s2, s0 2442; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2443; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s0, s1 2444; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v2 2445; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2446; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v3 2447; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2448; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5] 2449; GFX10-DL-NEXT: s_endpgm 2450 <8 x i4> addrspace(1)* %src2, 2451 i16 addrspace(1)* nocapture %dst) { 2452entry: 2453 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* 
%src1 2454 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2 2455 2456 %cvec1 = zext <8 x i4> %vec1 to <8 x i16> 2457 %cvec2 = zext <8 x i4> %vec2 to <8 x i16> 2458 2459 %mul = mul <8 x i16> %cvec1, %cvec2 2460 %mul0 = extractelement <8 x i16> %mul, i64 0 2461 %mul1 = extractelement <8 x i16> %mul, i64 1 2462 %mul2 = extractelement <8 x i16> %mul, i64 2 2463 %mul3 = extractelement <8 x i16> %mul, i64 3 2464 %mul4 = extractelement <8 x i16> %mul, i64 4 2465 %mul5 = extractelement <8 x i16> %mul, i64 5 2466 %mul6 = extractelement <8 x i16> %mul, i64 6 2467 %mul7 = extractelement <8 x i16> %mul, i64 7 2468 2469 %acc = load i16, i16 addrspace(1)* %dst, align 4 2470 %add1 = add i16 %mul0, %acc 2471 %add2 = add i16 %add1, %mul1 2472 %add3 = add i16 %add2, %mul2 2473 %add4 = add i16 %add3, %mul3 2474 %add5 = add i16 %add4, %mul4 2475 %add6 = add i16 %add5, %mul5 2476 %add7 = add i16 %add6, %mul6 2477 %add8 = add i16 %add7, %mul7 2478 2479 store i16 %add8, i16 addrspace(1)* %dst, align 4 2480 ret void 2481} 2482 2483; TODO: Cleanup the code to generate MAD; pattern should be recognized then. 
; udot8_acc8_vecMul: unsigned 8-element dot product of two <8 x i4> vectors
; with an i8 accumulator. Both inputs are loaded from %src1/%src2, zero-extended
; to <8 x i8> and multiplied with a single vector mul; the eight products are
; then extracted and added one at a time to the i8 value loaded from %dst, and
; the final sum is stored back to %dst.
; The GFX7/GFX8/GFX9/GFX9-DL/GFX10-DL check lines are autogenerated by
; utils/update_llc_test_checks.py -- regenerate them rather than editing by hand.
2484define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, 2485; GFX7-LABEL: udot8_acc8_vecMul: 2486; GFX7: ; %bb.0: ; %entry 2487; GFX7-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 2488; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 2489; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 2490; GFX7-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 2491; GFX7-NEXT: s_mov_b32 s22, -1 2492; GFX7-NEXT: s_mov_b32 s23, 0xe8f000 2493; GFX7-NEXT: s_add_u32 s20, s20, s3 2494; GFX7-NEXT: s_mov_b32 s3, 0xf000 2495; GFX7-NEXT: s_mov_b32 s2, -1 2496; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2497; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 2498; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 2499; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 2500; GFX7-NEXT: s_addc_u32 s21, s21, 0 2501; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2502; GFX7-NEXT: s_bfe_u32 s6, s4, 0x4000c 2503; GFX7-NEXT: s_bfe_u32 s13, s5, 0x4000c 2504; GFX7-NEXT: s_bfe_u32 s15, s5, 0x40004 2505; GFX7-NEXT: s_lshr_b32 s17, s5, 28 2506; GFX7-NEXT: v_mov_b32_e32 v8, s13 2507; GFX7-NEXT: s_bfe_u32 s14, s5, 0x40008 2508; GFX7-NEXT: s_and_b32 s16, s5, 15 2509; GFX7-NEXT: s_bfe_u32 s18, s5, 0x40018 2510; GFX7-NEXT: s_bfe_u32 s19, s5, 0x40014 2511; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40004 2512; GFX7-NEXT: v_mov_b32_e32 v6, s15 2513; GFX7-NEXT: s_lshr_b32 s10, s4, 28 2514; GFX7-NEXT: v_mov_b32_e32 v4, s17 2515; GFX7-NEXT: v_mul_u32_u24_e32 v4, s10, v4 2516; GFX7-NEXT: v_mul_u32_u24_e32 v6, s8, v6 2517; GFX7-NEXT: v_mul_u32_u24_e32 v8, s6, v8 2518; GFX7-NEXT: s_bfe_u32 s5, s5, 0x40010 2519; GFX7-NEXT: s_bfe_u32 s7, s4, 0x40008 2520; GFX7-NEXT: v_mov_b32_e32 v7, s14 2521; GFX7-NEXT: s_and_b32 s9, s4, 15 2522; GFX7-NEXT: v_mov_b32_e32 v5, s16 2523; GFX7-NEXT: s_bfe_u32 s11, s4, 0x40018 2524; GFX7-NEXT: v_mov_b32_e32 v3, s18 2525; GFX7-NEXT: s_bfe_u32 s12, s4, 0x40014 2526; GFX7-NEXT: v_mov_b32_e32 v2, s19 2527; GFX7-NEXT: v_mul_u32_u24_e32 v2, s12, v2 2528; GFX7-NEXT: s_bfe_u32 s4, s4, 0x40010 2529; GFX7-NEXT: v_mov_b32_e32 v1, s5 2530; GFX7-NEXT: 
v_mul_u32_u24_e32 v3, s11, v3 2531; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 2532; GFX7-NEXT: v_mul_u32_u24_e32 v5, s9, v5 2533; GFX7-NEXT: v_mul_u32_u24_e32 v7, s7, v7 2534; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 2535; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 2536; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 2537; GFX7-NEXT: v_or_b32_e32 v4, v5, v6 2538; GFX7-NEXT: v_or_b32_e32 v5, v7, v8 2539; GFX7-NEXT: v_mul_u32_u24_e32 v9, s4, v1 2540; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 2541; GFX7-NEXT: v_or_b32_e32 v2, v9, v2 2542; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 2543; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 2544; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 2545; GFX7-NEXT: v_or_b32_e32 v3, v4, v5 2546; GFX7-NEXT: v_alignbit_b32 v4, v2, v3, 8 2547; GFX7-NEXT: v_alignbit_b32 v5, v2, v3, 16 2548; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v3 2549; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v2 2550; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v2 2551; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 2552; GFX7-NEXT: s_waitcnt vmcnt(0) 2553; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v3 2554; GFX7-NEXT: v_add_i32_e32 v0, vcc, v4, v0 2555; GFX7-NEXT: v_add_i32_e32 v0, vcc, v5, v0 2556; GFX7-NEXT: v_add_i32_e32 v0, vcc, v6, v0 2557; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 2558; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v7 2559; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v8 2560; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 2561; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 2562; GFX7-NEXT: s_endpgm 2563; 2564; GFX8-LABEL: udot8_acc8_vecMul: 2565; GFX8: ; %bb.0: ; %entry 2566; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2567; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 2568; GFX8-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 2569; GFX8-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 2570; GFX8-NEXT: s_mov_b32 s22, -1 2571; GFX8-NEXT: s_mov_b32 s23, 0xe80000 2572; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2573; GFX8-NEXT: v_mov_b32_e32 v0, s0 2574; GFX8-NEXT: v_mov_b32_e32 v1, s1 2575; GFX8-NEXT: flat_load_ubyte v2, v[0:1] 2576; GFX8-NEXT: s_load_dword 
s1, s[4:5], 0x0 2577; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 2578; GFX8-NEXT: s_add_u32 s20, s20, s3 2579; GFX8-NEXT: s_addc_u32 s21, s21, 0 2580; GFX8-NEXT: s_mov_b32 s0, 0xffff 2581; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2582; GFX8-NEXT: s_bfe_u32 s7, s1, 0x40004 2583; GFX8-NEXT: s_bfe_u32 s9, s1, 0x4000c 2584; GFX8-NEXT: s_bfe_u32 s14, s2, 0x40004 2585; GFX8-NEXT: s_and_b32 s15, s2, 15 2586; GFX8-NEXT: s_bfe_u32 s16, s2, 0x4000c 2587; GFX8-NEXT: s_bfe_u32 s3, s1, 0x40014 2588; GFX8-NEXT: s_lshr_b32 s5, s1, 28 2589; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40014 2590; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40010 2591; GFX8-NEXT: s_lshr_b32 s12, s2, 28 2592; GFX8-NEXT: s_bfe_u32 s13, s2, 0x40018 2593; GFX8-NEXT: s_bfe_u32 s2, s2, 0x40008 2594; GFX8-NEXT: s_and_b32 s8, s1, 15 2595; GFX8-NEXT: v_mov_b32_e32 v4, s16 2596; GFX8-NEXT: v_mov_b32_e32 v5, s9 2597; GFX8-NEXT: v_mov_b32_e32 v6, s15 2598; GFX8-NEXT: v_mov_b32_e32 v7, s14 2599; GFX8-NEXT: v_mov_b32_e32 v8, s7 2600; GFX8-NEXT: v_mul_u32_u24_sdwa v4, v5, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2601; GFX8-NEXT: v_mul_u32_u24_e32 v5, s8, v6 2602; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v8, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2603; GFX8-NEXT: s_bfe_u32 s4, s1, 0x40010 2604; GFX8-NEXT: s_bfe_u32 s6, s1, 0x40018 2605; GFX8-NEXT: v_mov_b32_e32 v9, s13 2606; GFX8-NEXT: s_bfe_u32 s1, s1, 0x40008 2607; GFX8-NEXT: v_mov_b32_e32 v3, s2 2608; GFX8-NEXT: v_mov_b32_e32 v10, s12 2609; GFX8-NEXT: v_mov_b32_e32 v11, s5 2610; GFX8-NEXT: v_mov_b32_e32 v12, s11 2611; GFX8-NEXT: v_mov_b32_e32 v13, s10 2612; GFX8-NEXT: v_mov_b32_e32 v14, s3 2613; GFX8-NEXT: v_mul_u32_u24_e32 v3, s1, v3 2614; GFX8-NEXT: v_or_b32_e32 v5, v5, v6 2615; GFX8-NEXT: v_mul_u32_u24_e32 v7, s6, v9 2616; GFX8-NEXT: v_mul_u32_u24_sdwa v8, v11, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2617; GFX8-NEXT: v_mul_u32_u24_e32 v9, s4, v12 2618; GFX8-NEXT: v_mul_u32_u24_sdwa v10, v14, v13 dst_sel:BYTE_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2619; GFX8-NEXT: v_and_b32_e32 v5, s0, v5 2620; GFX8-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2621; GFX8-NEXT: v_or_b32_e32 v9, v9, v10 2622; GFX8-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2623; GFX8-NEXT: v_and_b32_e32 v4, s0, v9 2624; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 2625; GFX8-NEXT: v_or_b32_e32 v6, v4, v7 2626; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 2627; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v6 2628; GFX8-NEXT: s_waitcnt vmcnt(0) 2629; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 2630; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2 2631; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 2632; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD 2633; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 2634; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 2635; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 2636; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD 2637; GFX8-NEXT: flat_store_byte v[0:1], v2 2638; GFX8-NEXT: s_endpgm 2639; 2640; GFX9-LABEL: udot8_acc8_vecMul: 2641; GFX9: ; %bb.0: ; %entry 2642; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2643; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 2644; GFX9-NEXT: v_mov_b32_e32 v0, 0 2645; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 2646; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 2647; GFX9-NEXT: s_mov_b32 s22, -1 2648; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2649; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] 2650; GFX9-NEXT: s_mov_b32 s23, 0xe00000 2651; GFX9-NEXT: s_add_u32 s20, s20, s3 2652; GFX9-NEXT: s_load_dword s3, s[4:5], 0x0 2653; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 2654; GFX9-NEXT: s_addc_u32 s21, s21, 0 2655; GFX9-NEXT: 
s_mov_b32 s2, 0xffff 2656; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2657; GFX9-NEXT: s_bfe_u32 s5, s3, 0x40010 2658; GFX9-NEXT: s_bfe_u32 s12, s4, 0x40010 2659; GFX9-NEXT: s_bfe_u32 s13, s4, 0x40014 2660; GFX9-NEXT: s_bfe_u32 s14, s4, 0x40018 2661; GFX9-NEXT: s_lshr_b32 s15, s4, 28 2662; GFX9-NEXT: s_and_b32 s16, s4, 15 2663; GFX9-NEXT: s_bfe_u32 s17, s4, 0x40004 2664; GFX9-NEXT: s_bfe_u32 s18, s4, 0x40008 2665; GFX9-NEXT: v_mov_b32_e32 v2, s12 2666; GFX9-NEXT: s_bfe_u32 s4, s4, 0x4000c 2667; GFX9-NEXT: s_bfe_u32 s6, s3, 0x40014 2668; GFX9-NEXT: v_mov_b32_e32 v3, s13 2669; GFX9-NEXT: s_bfe_u32 s7, s3, 0x40018 2670; GFX9-NEXT: v_mov_b32_e32 v4, s14 2671; GFX9-NEXT: s_lshr_b32 s8, s3, 28 2672; GFX9-NEXT: v_mov_b32_e32 v5, s15 2673; GFX9-NEXT: s_and_b32 s9, s3, 15 2674; GFX9-NEXT: v_mov_b32_e32 v6, s16 2675; GFX9-NEXT: s_bfe_u32 s10, s3, 0x40004 2676; GFX9-NEXT: v_mov_b32_e32 v7, s17 2677; GFX9-NEXT: s_bfe_u32 s11, s3, 0x40008 2678; GFX9-NEXT: v_mov_b32_e32 v8, s18 2679; GFX9-NEXT: s_bfe_u32 s3, s3, 0x4000c 2680; GFX9-NEXT: v_mov_b32_e32 v9, s4 2681; GFX9-NEXT: v_mul_lo_u16_e32 v2, s5, v2 2682; GFX9-NEXT: v_mul_lo_u16_sdwa v3, s6, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2683; GFX9-NEXT: v_mul_lo_u16_e32 v4, s7, v4 2684; GFX9-NEXT: v_mul_lo_u16_sdwa v5, s8, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2685; GFX9-NEXT: v_mul_lo_u16_e32 v6, s9, v6 2686; GFX9-NEXT: v_mul_lo_u16_sdwa v7, s10, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2687; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 2688; GFX9-NEXT: v_or_b32_sdwa v3, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2689; GFX9-NEXT: v_or_b32_e32 v4, v6, v7 2690; GFX9-NEXT: v_mul_lo_u16_e32 v8, s11, v8 2691; GFX9-NEXT: v_mul_lo_u16_sdwa v9, s3, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2692; GFX9-NEXT: v_and_b32_e32 v4, s2, v4 2693; GFX9-NEXT: v_or_b32_sdwa v5, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:DWORD 2694; GFX9-NEXT: v_or_b32_e32 v5, v4, v5 2695; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v5 2696; GFX9-NEXT: v_and_b32_e32 v2, s2, v2 2697; GFX9-NEXT: v_or_b32_e32 v3, v2, v3 2698; GFX9-NEXT: s_waitcnt vmcnt(0) 2699; GFX9-NEXT: v_add_u32_e32 v1, v4, v1 2700; GFX9-NEXT: v_add_u32_e32 v1, v1, v6 2701; GFX9-NEXT: v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 2702; GFX9-NEXT: v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 2703; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 2704; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v3 2705; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 2706; GFX9-NEXT: v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2707; GFX9-NEXT: v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 2708; GFX9-NEXT: global_store_byte v0, v1, s[0:1] 2709; GFX9-NEXT: s_endpgm 2710; 2711; GFX9-DL-LABEL: udot8_acc8_vecMul: 2712; GFX9-DL: ; %bb.0: ; %entry 2713; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2714; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 2715; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 2716; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 2717; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 2718; GFX9-DL-NEXT: s_mov_b32 s22, -1 2719; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2720; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] 2721; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 2722; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 2723; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 2724; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 2725; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 2726; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff 2727; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2728; GFX9-DL-NEXT: s_bfe_u32 s5, s3, 0x40010 2729; GFX9-DL-NEXT: s_bfe_u32 s12, s4, 0x40010 2730; GFX9-DL-NEXT: s_bfe_u32 s13, s4, 0x40014 2731; GFX9-DL-NEXT: s_bfe_u32 s14, s4, 0x40018 2732; GFX9-DL-NEXT: s_lshr_b32 s15, s4, 28 2733; GFX9-DL-NEXT: 
s_and_b32 s16, s4, 15 2734; GFX9-DL-NEXT: s_bfe_u32 s17, s4, 0x40004 2735; GFX9-DL-NEXT: s_bfe_u32 s18, s4, 0x40008 2736; GFX9-DL-NEXT: v_mov_b32_e32 v2, s12 2737; GFX9-DL-NEXT: s_bfe_u32 s4, s4, 0x4000c 2738; GFX9-DL-NEXT: s_bfe_u32 s6, s3, 0x40014 2739; GFX9-DL-NEXT: v_mov_b32_e32 v3, s13 2740; GFX9-DL-NEXT: s_bfe_u32 s7, s3, 0x40018 2741; GFX9-DL-NEXT: v_mov_b32_e32 v4, s14 2742; GFX9-DL-NEXT: s_lshr_b32 s8, s3, 28 2743; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 2744; GFX9-DL-NEXT: s_and_b32 s9, s3, 15 2745; GFX9-DL-NEXT: v_mov_b32_e32 v6, s16 2746; GFX9-DL-NEXT: s_bfe_u32 s10, s3, 0x40004 2747; GFX9-DL-NEXT: v_mov_b32_e32 v7, s17 2748; GFX9-DL-NEXT: s_bfe_u32 s11, s3, 0x40008 2749; GFX9-DL-NEXT: v_mov_b32_e32 v8, s18 2750; GFX9-DL-NEXT: s_bfe_u32 s3, s3, 0x4000c 2751; GFX9-DL-NEXT: v_mov_b32_e32 v9, s4 2752; GFX9-DL-NEXT: v_mul_lo_u16_e32 v2, s5, v2 2753; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v3, s6, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2754; GFX9-DL-NEXT: v_mul_lo_u16_e32 v4, s7, v4 2755; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v5, s8, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2756; GFX9-DL-NEXT: v_mul_lo_u16_e32 v6, s9, v6 2757; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v7, s10, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2758; GFX9-DL-NEXT: v_or_b32_e32 v2, v2, v3 2759; GFX9-DL-NEXT: v_or_b32_sdwa v3, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2760; GFX9-DL-NEXT: v_or_b32_e32 v4, v6, v7 2761; GFX9-DL-NEXT: v_mul_lo_u16_e32 v8, s11, v8 2762; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v9, s3, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2763; GFX9-DL-NEXT: v_and_b32_e32 v4, s2, v4 2764; GFX9-DL-NEXT: v_or_b32_sdwa v5, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2765; GFX9-DL-NEXT: v_or_b32_e32 v5, v4, v5 2766; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v5 2767; GFX9-DL-NEXT: v_and_b32_e32 v2, s2, v2 2768; GFX9-DL-NEXT: v_or_b32_e32 
v3, v2, v3 2769; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 2770; GFX9-DL-NEXT: v_add_u32_e32 v1, v4, v1 2771; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v6 2772; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 2773; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 2774; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v2 2775; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v3 2776; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v2 2777; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2778; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 2779; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] 2780; GFX9-DL-NEXT: s_endpgm 2781; 2782; GFX10-DL-LABEL: udot8_acc8_vecMul: 2783; GFX10-DL: ; %bb.0: ; %entry 2784; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 2785; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 2786; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 2787; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 2788; GFX10-DL-NEXT: s_mov_b32 s14, -1 2789; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 2790; GFX10-DL-NEXT: s_add_u32 s12, s12, s3 2791; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2792; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 2793; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2794; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] 2795; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 2796; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 2797; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2798; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x40004 2799; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40004 2800; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 2801; GFX10-DL-NEXT: v_mul_lo_u16_e64 v2, s3, s7 2802; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 2803; GFX10-DL-NEXT: s_bfe_u32 s8, s0, 0x4000c 2804; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x4000c 2805; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, s2, s3 2806; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s8, s7 2807; 
GFX10-DL-NEXT: v_lshlrev_b16_e64 v2, 8, v2 2808; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 2809; GFX10-DL-NEXT: s_bfe_u32 s2, s1, 0x40008 2810; GFX10-DL-NEXT: s_mov_b32 s3, 0xffff 2811; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s6, s2 2812; GFX10-DL-NEXT: v_or_b32_e32 v2, v3, v2 2813; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 8, v4 2814; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40014 2815; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40014 2816; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 2817; GFX10-DL-NEXT: v_and_b32_e32 v2, s3, v2 2818; GFX10-DL-NEXT: v_or_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2819; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s6, s8 2820; GFX10-DL-NEXT: s_bfe_u32 s7, s0, 0x40018 2821; GFX10-DL-NEXT: s_bfe_u32 s9, s1, 0x40010 2822; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 2823; GFX10-DL-NEXT: v_or_b32_e32 v3, v2, v3 2824; GFX10-DL-NEXT: s_lshr_b32 s6, s1, 28 2825; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s2, s9 2826; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s0, s6 2827; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 8, v4 2828; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v3 2829; GFX10-DL-NEXT: s_bfe_u32 s0, s1, 0x40018 2830; GFX10-DL-NEXT: v_mul_lo_u16_e64 v11, s7, s0 2831; GFX10-DL-NEXT: v_or_b32_e32 v4, v5, v4 2832; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 8, v6 2833; GFX10-DL-NEXT: v_and_b32_e32 v4, s3, v4 2834; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 2835; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v2, v1 2836; GFX10-DL-NEXT: v_or_b32_sdwa v2, v11, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2837; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v7 2838; GFX10-DL-NEXT: v_or_b32_e32 v2, v4, v2 2839; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 2840; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 2841; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 8, v2 2842; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v4 2843; GFX10-DL-NEXT: v_add_nc_u32_e32 
v1, v1, v3 2844; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2845; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 2846; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5] 2847; GFX10-DL-NEXT: s_endpgm 2848 <8 x i4> addrspace(1)* %src2, 2849 i8 addrspace(1)* nocapture %dst) { 2850entry: 2851 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1 2852 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2 2853 2854 %cvec1 = zext <8 x i4> %vec1 to <8 x i8> 2855 %cvec2 = zext <8 x i4> %vec2 to <8 x i8> 2856 2857 %mul = mul <8 x i8> %cvec1, %cvec2 2858 %mul0 = extractelement <8 x i8> %mul, i64 0 2859 %mul1 = extractelement <8 x i8> %mul, i64 1 2860 %mul2 = extractelement <8 x i8> %mul, i64 2 2861 %mul3 = extractelement <8 x i8> %mul, i64 3 2862 %mul4 = extractelement <8 x i8> %mul, i64 4 2863 %mul5 = extractelement <8 x i8> %mul, i64 5 2864 %mul6 = extractelement <8 x i8> %mul, i64 6 2865 %mul7 = extractelement <8 x i8> %mul, i64 7 2866 2867 %acc = load i8, i8 addrspace(1)* %dst, align 4 2868 %add1 = add i8 %mul0, %acc 2869 %add2 = add i8 %add1, %mul1 2870 %add3 = add i8 %add2, %mul2 2871 %add4 = add i8 %add3, %mul3 2872 %add5 = add i8 %add4, %mul4 2873 %add6 = add i8 %add5, %mul5 2874 %add7 = add i8 %add6, %mul6 2875 %add8 = add i8 %add7, %mul7 2876 2877 store i8 %add8, i8 addrspace(1)* %dst, align 4 2878 ret void 2879} 2880 2881; TODO: Once the additional "and+add" are removed, the pattern will be recognized. 
2882define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1, 2883; GFX7-LABEL: udot8_acc4_vecMul: 2884; GFX7: ; %bb.0: ; %entry 2885; GFX7-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 2886; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 2887; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 2888; GFX7-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 2889; GFX7-NEXT: s_mov_b32 s22, -1 2890; GFX7-NEXT: s_mov_b32 s23, 0xe8f000 2891; GFX7-NEXT: s_add_u32 s20, s20, s3 2892; GFX7-NEXT: s_mov_b32 s3, 0xf000 2893; GFX7-NEXT: s_mov_b32 s2, -1 2894; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2895; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 2896; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 2897; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 2898; GFX7-NEXT: s_addc_u32 s21, s21, 0 2899; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2900; GFX7-NEXT: s_lshr_b32 s6, s4, 28 2901; GFX7-NEXT: s_bfe_u32 s14, s5, 0x40018 2902; GFX7-NEXT: s_bfe_u32 s15, s5, 0x40014 2903; GFX7-NEXT: s_bfe_u32 s16, s5, 0x40010 2904; GFX7-NEXT: s_bfe_u32 s17, s5, 0x4000c 2905; GFX7-NEXT: s_bfe_u32 s18, s5, 0x40008 2906; GFX7-NEXT: s_bfe_u32 s19, s5, 0x40004 2907; GFX7-NEXT: s_lshr_b32 s13, s5, 28 2908; GFX7-NEXT: s_and_b32 s5, s5, 15 2909; GFX7-NEXT: s_bfe_u32 s7, s4, 0x40018 2910; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40014 2911; GFX7-NEXT: s_bfe_u32 s9, s4, 0x40010 2912; GFX7-NEXT: s_bfe_u32 s10, s4, 0x4000c 2913; GFX7-NEXT: s_bfe_u32 s11, s4, 0x40008 2914; GFX7-NEXT: s_bfe_u32 s12, s4, 0x40004 2915; GFX7-NEXT: s_and_b32 s4, s4, 15 2916; GFX7-NEXT: v_mov_b32_e32 v1, s5 2917; GFX7-NEXT: v_mov_b32_e32 v2, s19 2918; GFX7-NEXT: v_mov_b32_e32 v3, s18 2919; GFX7-NEXT: v_mov_b32_e32 v4, s17 2920; GFX7-NEXT: v_mov_b32_e32 v5, s16 2921; GFX7-NEXT: v_mov_b32_e32 v6, s15 2922; GFX7-NEXT: v_mov_b32_e32 v7, s14 2923; GFX7-NEXT: s_waitcnt vmcnt(0) 2924; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 2925; GFX7-NEXT: v_mad_u32_u24 v0, s12, v2, v0 2926; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 2927; GFX7-NEXT: v_mad_u32_u24 v0, s10, v4, v0 2928; GFX7-NEXT: 
v_mad_u32_u24 v0, s9, v5, v0 2929; GFX7-NEXT: v_mad_u32_u24 v0, s8, v6, v0 2930; GFX7-NEXT: v_mad_u32_u24 v0, s7, v7, v0 2931; GFX7-NEXT: v_mov_b32_e32 v1, s13 2932; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 2933; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 2934; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 2935; GFX7-NEXT: s_endpgm 2936; 2937; GFX8-LABEL: udot8_acc4_vecMul: 2938; GFX8: ; %bb.0: ; %entry 2939; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2940; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 2941; GFX8-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 2942; GFX8-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 2943; GFX8-NEXT: s_mov_b32 s18, -1 2944; GFX8-NEXT: s_mov_b32 s19, 0xe80000 2945; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2946; GFX8-NEXT: v_mov_b32_e32 v0, s0 2947; GFX8-NEXT: v_mov_b32_e32 v1, s1 2948; GFX8-NEXT: flat_load_ubyte v2, v[0:1] 2949; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 2950; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 2951; GFX8-NEXT: s_add_u32 s16, s16, s3 2952; GFX8-NEXT: s_addc_u32 s17, s17, 0 2953; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2954; GFX8-NEXT: s_and_b32 s8, s0, 15 2955; GFX8-NEXT: s_and_b32 s15, s1, 15 2956; GFX8-NEXT: s_bfe_u32 s14, s1, 0x40004 2957; GFX8-NEXT: v_mov_b32_e32 v4, s15 2958; GFX8-NEXT: s_bfe_u32 s10, s1, 0x40018 2959; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40014 2960; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40010 2961; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40008 2962; GFX8-NEXT: s_lshr_b32 s9, s1, 28 2963; GFX8-NEXT: s_bfe_u32 s1, s1, 0x4000c 2964; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40004 2965; GFX8-NEXT: v_mov_b32_e32 v5, s14 2966; GFX8-NEXT: s_lshr_b32 s2, s0, 28 2967; GFX8-NEXT: s_bfe_u32 s3, s0, 0x40018 2968; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40014 2969; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40010 2970; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40008 2971; GFX8-NEXT: s_bfe_u32 s0, s0, 0x4000c 2972; GFX8-NEXT: v_mov_b32_e32 v3, s1 2973; GFX8-NEXT: v_mov_b32_e32 v6, s13 2974; GFX8-NEXT: v_mul_u32_u24_e32 v3, s0, v3 2975; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 2976; GFX8-NEXT: 
v_mov_b32_e32 v7, s12 2977; GFX8-NEXT: v_mov_b32_e32 v8, s11 2978; GFX8-NEXT: v_mov_b32_e32 v9, s10 2979; GFX8-NEXT: s_waitcnt vmcnt(0) 2980; GFX8-NEXT: v_mad_u32_u24 v2, s8, v4, v2 2981; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2 2982; GFX8-NEXT: v_mad_u32_u24 v2, s6, v6, v2 2983; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 2984; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 2985; GFX8-NEXT: v_mad_u32_u24 v2, s5, v7, v2 2986; GFX8-NEXT: v_mad_u32_u24 v2, s4, v8, v2 2987; GFX8-NEXT: v_mad_u32_u24 v2, s3, v9, v2 2988; GFX8-NEXT: v_mov_b32_e32 v3, s9 2989; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 2990; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 2991; GFX8-NEXT: flat_store_byte v[0:1], v2 2992; GFX8-NEXT: s_endpgm 2993; 2994; GFX9-LABEL: udot8_acc4_vecMul: 2995; GFX9: ; %bb.0: ; %entry 2996; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2997; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 2998; GFX9-NEXT: v_mov_b32_e32 v0, 0 2999; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 3000; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 3001; GFX9-NEXT: s_mov_b32 s22, -1 3002; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3003; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] 3004; GFX9-NEXT: s_mov_b32 s23, 0xe00000 3005; GFX9-NEXT: s_add_u32 s20, s20, s3 3006; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 3007; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 3008; GFX9-NEXT: s_addc_u32 s21, s21, 0 3009; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3010; GFX9-NEXT: s_and_b32 s10, s2, 15 3011; GFX9-NEXT: s_and_b32 s17, s3, 15 3012; GFX9-NEXT: s_bfe_u32 s16, s3, 0x40004 3013; GFX9-NEXT: v_mov_b32_e32 v3, s17 3014; GFX9-NEXT: s_bfe_u32 s12, s3, 0x40018 3015; GFX9-NEXT: s_bfe_u32 s13, s3, 0x40014 3016; GFX9-NEXT: s_bfe_u32 s14, s3, 0x40010 3017; GFX9-NEXT: s_bfe_u32 s15, s3, 0x40008 3018; GFX9-NEXT: s_lshr_b32 s11, s3, 28 3019; GFX9-NEXT: s_bfe_u32 s3, s3, 0x4000c 3020; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40004 3021; GFX9-NEXT: v_mov_b32_e32 v4, s16 3022; GFX9-NEXT: s_lshr_b32 s4, s2, 28 3023; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40018 3024; GFX9-NEXT: 
s_bfe_u32 s6, s2, 0x40014 3025; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 3026; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40008 3027; GFX9-NEXT: s_bfe_u32 s2, s2, 0x4000c 3028; GFX9-NEXT: v_mov_b32_e32 v2, s3 3029; GFX9-NEXT: v_mov_b32_e32 v5, s15 3030; GFX9-NEXT: v_mul_u32_u24_e32 v2, s2, v2 3031; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 3032; GFX9-NEXT: v_mov_b32_e32 v6, s14 3033; GFX9-NEXT: v_mov_b32_e32 v7, s13 3034; GFX9-NEXT: v_mov_b32_e32 v8, s12 3035; GFX9-NEXT: s_waitcnt vmcnt(0) 3036; GFX9-NEXT: v_mad_u32_u24 v1, s10, v3, v1 3037; GFX9-NEXT: v_mad_u32_u24 v1, s9, v4, v1 3038; GFX9-NEXT: v_mad_u32_u24 v1, s8, v5, v1 3039; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 3040; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 3041; GFX9-NEXT: v_mad_u32_u24 v1, s7, v6, v1 3042; GFX9-NEXT: v_mad_u32_u24 v1, s6, v7, v1 3043; GFX9-NEXT: v_mad_u32_u24 v1, s5, v8, v1 3044; GFX9-NEXT: v_mov_b32_e32 v2, s11 3045; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1 3046; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 3047; GFX9-NEXT: global_store_byte v0, v1, s[0:1] 3048; GFX9-NEXT: s_endpgm 3049; 3050; GFX9-DL-LABEL: udot8_acc4_vecMul: 3051; GFX9-DL: ; %bb.0: ; %entry 3052; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 3053; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 3054; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 3055; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 3056; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 3057; GFX9-DL-NEXT: s_mov_b32 s22, -1 3058; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 3059; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] 3060; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 3061; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 3062; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 3063; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 3064; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 3065; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 3066; GFX9-DL-NEXT: s_and_b32 s10, s2, 15 3067; GFX9-DL-NEXT: s_and_b32 s17, s3, 15 3068; GFX9-DL-NEXT: s_bfe_u32 s16, s3, 0x40004 3069; GFX9-DL-NEXT: v_mov_b32_e32 v3, s17 3070; GFX9-DL-NEXT: s_bfe_u32 s12, s3, 0x40018 
3071; GFX9-DL-NEXT: s_bfe_u32 s13, s3, 0x40014 3072; GFX9-DL-NEXT: s_bfe_u32 s14, s3, 0x40010 3073; GFX9-DL-NEXT: s_bfe_u32 s15, s3, 0x40008 3074; GFX9-DL-NEXT: s_lshr_b32 s11, s3, 28 3075; GFX9-DL-NEXT: s_bfe_u32 s3, s3, 0x4000c 3076; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40004 3077; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 3078; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28 3079; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40018 3080; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40014 3081; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40010 3082; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40008 3083; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x4000c 3084; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 3085; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 3086; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, s2, v2 3087; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 3088; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 3089; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 3090; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 3091; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 3092; GFX9-DL-NEXT: v_mad_u32_u24 v1, s10, v3, v1 3093; GFX9-DL-NEXT: v_mad_u32_u24 v1, s9, v4, v1 3094; GFX9-DL-NEXT: v_mad_u32_u24 v1, s8, v5, v1 3095; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 3096; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v2 3097; GFX9-DL-NEXT: v_mad_u32_u24 v1, s7, v6, v1 3098; GFX9-DL-NEXT: v_mad_u32_u24 v1, s6, v7, v1 3099; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v8, v1 3100; GFX9-DL-NEXT: v_mov_b32_e32 v2, s11 3101; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 3102; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 3103; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] 3104; GFX9-DL-NEXT: s_endpgm 3105; 3106; GFX10-DL-LABEL: udot8_acc4_vecMul: 3107; GFX10-DL: ; %bb.0: ; %entry 3108; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 3109; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 3110; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 3111; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 3112; GFX10-DL-NEXT: s_mov_b32 s10, -1 3113; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 3114; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 3115; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x24 3116; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 3117; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 3118; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] 3119; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 3120; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 3121; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 3122; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 3123; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 3124; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40008 3125; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x4000c 3126; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 3127; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 3128; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004 3129; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40004 3130; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 3131; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 3132; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x4000c 3133; GFX10-DL-NEXT: v_mul_u32_u24_e64 v2, s3, s7 3134; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s6, v1 3135; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 3136; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 3137; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 3138; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1 3139; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v2 3140; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 3141; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40014 3142; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40014 3143; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 3144; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 3145; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 3146; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 3147; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 3148; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 3149; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 3150; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1 3151; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5] 3152; GFX10-DL-NEXT: s_endpgm 3153 <8 x i4> addrspace(1)* %src2, 3154 i4 addrspace(1)* nocapture %dst) { 3155entry: 3156 %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1 3157 %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2 3158 3159 %mul = mul <8 x i4> %vec1, %vec2 3160 %mul0 = 
extractelement <8 x i4> %mul, i64 0 3161 %mul1 = extractelement <8 x i4> %mul, i64 1 3162 %mul2 = extractelement <8 x i4> %mul, i64 2 3163 %mul3 = extractelement <8 x i4> %mul, i64 3 3164 %mul4 = extractelement <8 x i4> %mul, i64 4 3165 %mul5 = extractelement <8 x i4> %mul, i64 5 3166 %mul6 = extractelement <8 x i4> %mul, i64 6 3167 %mul7 = extractelement <8 x i4> %mul, i64 7 3168 3169 %acc = load i4, i4 addrspace(1)* %dst, align 4 3170 %add1 = add i4 %mul0, %acc 3171 %add2 = add i4 %add1, %mul1 3172 %add3 = add i4 %add2, %mul2 3173 %add4 = add i4 %add3, %mul3 3174 %add5 = add i4 %add4, %mul4 3175 %add6 = add i4 %add5, %mul5 3176 %add7 = add i4 %add6, %mul6 3177 %add8 = add i4 %add7, %mul7 3178 3179 store i4 %add8, i4 addrspace(1)* %dst, align 4 3180 ret void 3181} 3182
; udot8_variant1 reads one i32 from %v1addr and one from %v2addr, splits each
; into eight 4-bit fields with lshr/and, multiplies matching fields, and adds
; the eight products to the i32 loaded from %dst before storing the sum back
; to %dst. Every multiply takes the %v2 field as its first operand, and the
; product of the top fields (%mul8) is added right after %mul1 instead of last.
; The checks under the GFX9-DL and GFX10-DL prefixes expect one v_dot8_u32_u4;
; the GFX7, GFX8 and GFX9 checks expect a chain of eight v_mad_u32_u24.
3183define amdgpu_kernel void @udot8_variant1(i32 addrspace(1)* %v1addr, 3184; GFX7-LABEL: udot8_variant1: 3185; GFX7: ; %bb.0: ; %entry 3186; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 3187; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 3188; GFX7-NEXT: s_mov_b32 s3, 0xf000 3189; GFX7-NEXT: s_mov_b32 s2, -1 3190; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3191; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 3192; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 3193; GFX7-NEXT: s_load_dword s20, s[0:1], 0x0 3194; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3195; GFX7-NEXT: s_and_b32 s6, s4, 15 3196; GFX7-NEXT: s_and_b32 s7, s5, 15 3197; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40004 3198; GFX7-NEXT: s_bfe_u32 s10, s4, 0x40008 3199; GFX7-NEXT: s_bfe_u32 s12, s4, 0x4000c 3200; GFX7-NEXT: s_bfe_u32 s14, s4, 0x40010 3201; GFX7-NEXT: s_bfe_u32 s16, s4, 0x40014 3202; GFX7-NEXT: s_bfe_u32 s18, s4, 0x40018 3203; GFX7-NEXT: s_lshr_b32 s4, s4, 28 3204; GFX7-NEXT: v_mov_b32_e32 v0, s6 3205; GFX7-NEXT: v_mov_b32_e32 v1, s20 3206; GFX7-NEXT: v_mad_u32_u24 v0, s7, v0, v1 3207; GFX7-NEXT: s_bfe_u32 s9, s5, 0x40004 3208; GFX7-NEXT: s_bfe_u32 s11, s5, 0x40008 3209; GFX7-NEXT: s_bfe_u32 s13, s5, 0x4000c 3210; GFX7-NEXT: s_bfe_u32 s15, s5, 0x40010 3211; 
GFX7-NEXT: s_bfe_u32 s17, s5, 0x40014 3212; GFX7-NEXT: s_bfe_u32 s19, s5, 0x40018 3213; GFX7-NEXT: s_lshr_b32 s5, s5, 28 3214; GFX7-NEXT: v_mov_b32_e32 v1, s4 3215; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0 3216; GFX7-NEXT: v_mov_b32_e32 v1, s8 3217; GFX7-NEXT: v_mad_u32_u24 v0, s9, v1, v0 3218; GFX7-NEXT: v_mov_b32_e32 v1, s10 3219; GFX7-NEXT: v_mad_u32_u24 v0, s11, v1, v0 3220; GFX7-NEXT: v_mov_b32_e32 v1, s12 3221; GFX7-NEXT: v_mad_u32_u24 v0, s13, v1, v0 3222; GFX7-NEXT: v_mov_b32_e32 v1, s14 3223; GFX7-NEXT: v_mad_u32_u24 v0, s15, v1, v0 3224; GFX7-NEXT: v_mov_b32_e32 v1, s16 3225; GFX7-NEXT: v_mad_u32_u24 v0, s17, v1, v0 3226; GFX7-NEXT: v_mov_b32_e32 v1, s18 3227; GFX7-NEXT: v_mad_u32_u24 v0, s19, v1, v0 3228; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 3229; GFX7-NEXT: s_endpgm 3230; 3231; GFX8-LABEL: udot8_variant1: 3232; GFX8: ; %bb.0: ; %entry 3233; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 3234; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 3235; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3236; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 3237; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 3238; GFX8-NEXT: s_load_dword s18, s[0:1], 0x0 3239; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3240; GFX8-NEXT: s_and_b32 s4, s2, 15 3241; GFX8-NEXT: s_and_b32 s5, s3, 15 3242; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40004 3243; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40008 3244; GFX8-NEXT: s_bfe_u32 s10, s2, 0x4000c 3245; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40010 3246; GFX8-NEXT: s_bfe_u32 s14, s2, 0x40014 3247; GFX8-NEXT: s_bfe_u32 s16, s2, 0x40018 3248; GFX8-NEXT: s_lshr_b32 s2, s2, 28 3249; GFX8-NEXT: v_mov_b32_e32 v0, s4 3250; GFX8-NEXT: v_mov_b32_e32 v1, s18 3251; GFX8-NEXT: v_mad_u32_u24 v0, s5, v0, v1 3252; GFX8-NEXT: s_bfe_u32 s7, s3, 0x40004 3253; GFX8-NEXT: s_bfe_u32 s9, s3, 0x40008 3254; GFX8-NEXT: s_bfe_u32 s11, s3, 0x4000c 3255; GFX8-NEXT: s_bfe_u32 s13, s3, 0x40010 3256; GFX8-NEXT: s_bfe_u32 s15, s3, 0x40014 3257; GFX8-NEXT: s_bfe_u32 s17, s3, 0x40018 3258; GFX8-NEXT: s_lshr_b32 s3, s3, 28 3259; 
GFX8-NEXT: v_mov_b32_e32 v1, s2 3260; GFX8-NEXT: v_mad_u32_u24 v0, s3, v1, v0 3261; GFX8-NEXT: v_mov_b32_e32 v1, s6 3262; GFX8-NEXT: v_mad_u32_u24 v0, s7, v1, v0 3263; GFX8-NEXT: v_mov_b32_e32 v1, s8 3264; GFX8-NEXT: v_mad_u32_u24 v0, s9, v1, v0 3265; GFX8-NEXT: v_mov_b32_e32 v1, s10 3266; GFX8-NEXT: v_mad_u32_u24 v0, s11, v1, v0 3267; GFX8-NEXT: v_mov_b32_e32 v1, s12 3268; GFX8-NEXT: v_mad_u32_u24 v0, s13, v1, v0 3269; GFX8-NEXT: v_mov_b32_e32 v1, s14 3270; GFX8-NEXT: v_mad_u32_u24 v0, s15, v1, v0 3271; GFX8-NEXT: v_mov_b32_e32 v1, s16 3272; GFX8-NEXT: v_mad_u32_u24 v2, s17, v1, v0 3273; GFX8-NEXT: v_mov_b32_e32 v0, s0 3274; GFX8-NEXT: v_mov_b32_e32 v1, s1 3275; GFX8-NEXT: flat_store_dword v[0:1], v2 3276; GFX8-NEXT: s_endpgm 3277; 3278; GFX9-LABEL: udot8_variant1: 3279; GFX9: ; %bb.0: ; %entry 3280; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 3281; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 3282; GFX9-NEXT: v_mov_b32_e32 v0, 0 3283; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3284; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 3285; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 3286; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 3287; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3288; GFX9-NEXT: s_and_b32 s4, s2, 15 3289; GFX9-NEXT: s_and_b32 s5, s3, 15 3290; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40004 3291; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40008 3292; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c 3293; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40010 3294; GFX9-NEXT: s_bfe_u32 s14, s2, 0x40014 3295; GFX9-NEXT: s_bfe_u32 s16, s2, 0x40018 3296; GFX9-NEXT: s_lshr_b32 s2, s2, 28 3297; GFX9-NEXT: v_mov_b32_e32 v1, s4 3298; GFX9-NEXT: v_mov_b32_e32 v2, s18 3299; GFX9-NEXT: v_mad_u32_u24 v1, s5, v1, v2 3300; GFX9-NEXT: s_bfe_u32 s7, s3, 0x40004 3301; GFX9-NEXT: s_bfe_u32 s9, s3, 0x40008 3302; GFX9-NEXT: s_bfe_u32 s11, s3, 0x4000c 3303; GFX9-NEXT: s_bfe_u32 s13, s3, 0x40010 3304; GFX9-NEXT: s_bfe_u32 s15, s3, 0x40014 3305; GFX9-NEXT: s_bfe_u32 s17, s3, 0x40018 3306; GFX9-NEXT: s_lshr_b32 s3, s3, 28 3307; GFX9-NEXT: v_mov_b32_e32 
v2, s2 3308; GFX9-NEXT: v_mad_u32_u24 v1, s3, v2, v1 3309; GFX9-NEXT: v_mov_b32_e32 v2, s6 3310; GFX9-NEXT: v_mad_u32_u24 v1, s7, v2, v1 3311; GFX9-NEXT: v_mov_b32_e32 v2, s8 3312; GFX9-NEXT: v_mad_u32_u24 v1, s9, v2, v1 3313; GFX9-NEXT: v_mov_b32_e32 v2, s10 3314; GFX9-NEXT: v_mad_u32_u24 v1, s11, v2, v1 3315; GFX9-NEXT: v_mov_b32_e32 v2, s12 3316; GFX9-NEXT: v_mad_u32_u24 v1, s13, v2, v1 3317; GFX9-NEXT: v_mov_b32_e32 v2, s14 3318; GFX9-NEXT: v_mad_u32_u24 v1, s15, v2, v1 3319; GFX9-NEXT: v_mov_b32_e32 v2, s16 3320; GFX9-NEXT: v_mad_u32_u24 v1, s17, v2, v1 3321; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 3322; GFX9-NEXT: s_endpgm 3323; 3324; GFX9-DL-LABEL: udot8_variant1: 3325; GFX9-DL: ; %bb.0: ; %entry 3326; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 3327; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 3328; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 3329; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 3330; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 3331; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 3332; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 3333; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 3334; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 3335; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 3336; GFX9-DL-NEXT: v_dot8_u32_u4 v1, s4, v1, v2 3337; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] 3338; GFX9-DL-NEXT: s_endpgm 3339; 3340; GFX10-DL-LABEL: udot8_variant1: 3341; GFX10-DL: ; %bb.0: ; %entry 3342; GFX10-DL-NEXT: s_clause 0x1 3343; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 3344; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 3345; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 3346; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 3347; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 3348; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 3349; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 3350; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 3351; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 3352; GFX10-DL-NEXT: v_dot8_u32_u4 v0, s1, s0, v0 3353; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] 3354; GFX10-DL-NEXT: s_endpgm 3355 i32 
addrspace(1)* %v2addr, 3356 i32 addrspace(1)* %dst) { 3357entry:
; Load the two i32 inputs; each is handled below as eight 4-bit fields.
3358 %v1 = load i32, i32 addrspace(1)* %v1addr, align 4 3359 %v2 = load i32, i32 addrspace(1)* %v2addr, align 4
; Field 0 (bits 3:0): mask each input with 15 and multiply.
3360 %and = and i32 %v1, 15 3361 %and1 = and i32 %v2, 15 3362 %mul1 = mul nuw nsw i32 %and1, %and 3363
; Field 1 (bits 7:4): shift right by 4, mask with 15, multiply.
3364 %shr = lshr i32 %v1, 4 3365 %and2 = and i32 %shr, 15 3366 %shr3 = lshr i32 %v2, 4 3367 %and4 = and i32 %shr3, 15 3368 %mul2 = mul nuw nsw i32 %and4, %and2 3369
; Field 2 (bits 11:8).
3370 %shr6 = lshr i32 %v1, 8 3371 %and7 = and i32 %shr6, 15 3372 %shr8 = lshr i32 %v2, 8 3373 %and9 = and i32 %shr8, 15 3374 %mul3 = mul nuw nsw i32 %and9, %and7 3375
; Field 3 (bits 15:12).
3376 %shr12 = lshr i32 %v1, 12 3377 %and13 = and i32 %shr12, 15 3378 %shr14 = lshr i32 %v2, 12 3379 %and15 = and i32 %shr14, 15 3380 %mul4 = mul nuw nsw i32 %and15, %and13 3381
; Field 4 (bits 19:16).
3382 %shr18 = lshr i32 %v1, 16 3383 %and19 = and i32 %shr18, 15 3384 %shr20 = lshr i32 %v2, 16 3385 %and21 = and i32 %shr20, 15 3386 %mul5 = mul nuw nsw i32 %and21, %and19 3387
; Field 5 (bits 23:20).
3388 %shr24 = lshr i32 %v1, 20 3389 %and25 = and i32 %shr24, 15 3390 %shr26 = lshr i32 %v2, 20 3391 %and27 = and i32 %shr26, 15 3392 %mul6 = mul nuw nsw i32 %and27, %and25 3393
; Field 6 (bits 27:24).
3394 %shr30 = lshr i32 %v1, 24 3395 %and31 = and i32 %shr30, 15 3396 %shr32 = lshr i32 %v2, 24 3397 %and33 = and i32 %shr32, 15 3398 %mul7 = mul nuw nsw i32 %and33, %and31 3399
; Field 7 (bits 31:28): lshr by 28 leaves only the top four bits, so no mask is applied.
3400 %shr36 = lshr i32 %v1, 28 3401 %shr37 = lshr i32 %v2, 28 3402 %mul8 = mul nuw nsw i32 %shr37, %shr36
; Accumulator: the i32 currently stored at %dst.
3403 %acc = load i32, i32 addrspace(1)* %dst, align 4 3404
; Add all eight products to the accumulator; %mul8 is added second, ahead of %mul2..%mul7.
3405 %add1 = add i32 %mul1, %acc 3406 %add2 = add i32 %add1, %mul8 3407 %add3 = add i32 %add2, %mul2 3408 %add4 = add i32 %add3, %mul3 3409 %add5 = add i32 %add4, %mul4 3410 %add6 = add i32 %add5, %mul5 3411 %add7 = add i32 %add6, %mul6 3412 %add8 = add i32 %add7, %mul7
; Write the final sum back over the accumulator at %dst.
3413 store i32 %add8, i32 addrspace(1)* %dst, align 4 3414 ret void 3415} 3416