; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-NODL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s

; udot4_acc32: loads two <4 x i8> vectors, zero-extends every lane to i32,
; multiplies the lanes pairwise and adds the four products to the i32 value
; loaded from %dst, then stores the sum back to %dst.
; The gfx700, gfx803 and gfx900 runs expect a chain of four v_mad_u32_u24;
; the gfx906 and gfx1011/gfx1012 runs expect a single v_dot4_u32_u8.
define amdgpu_kernel void @udot4_acc32(<4 x i8> addrspace(1)* %src1,
; GFX7-LABEL: udot4_acc32:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_movk_i32 s8, 0xff
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT:    s_load_dword s12, s[0:1], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_and_b32 s6, s4, s8
; GFX7-NEXT:    s_and_b32 s7, s5, s8
; GFX7-NEXT:    s_bfe_u32 s9, s5, 0x80008
; GFX7-NEXT:    v_mov_b32_e32 v0, s7
; GFX7-NEXT:    v_mov_b32_e32 v1, s12
; GFX7-NEXT:    s_bfe_u32 s11, s5, 0x80010
; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v0, v1
; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x80008
; GFX7-NEXT:    v_mov_b32_e32 v1, s9
; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x80010
; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, s11
; GFX7-NEXT:    s_lshr_b32 s5, s5, 24
; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v1, v0
; GFX7-NEXT:    s_lshr_b32 s4, s4, 24
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: udot4_acc32:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    s_movk_i32 s2, 0xff
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_load_dword s3, s[4:5], 0x0
; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x0
; GFX8-NEXT:    s_load_dword s10, s[0:1], 0x0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_and_b32 s5, s3, s2
; GFX8-NEXT:    s_and_b32 s2, s4, s2
; GFX8-NEXT:    s_bfe_u32 s7, s4, 0x80008
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, s10
; GFX8-NEXT:    s_bfe_u32 s9, s4, 0x80010
; GFX8-NEXT:    v_mad_u32_u24 v0, s5, v0, v1
; GFX8-NEXT:    s_bfe_u32 s6, s3, 0x80008
; GFX8-NEXT:    v_mov_b32_e32 v1, s7
; GFX8-NEXT:    s_bfe_u32 s8, s3, 0x80010
; GFX8-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
; GFX8-NEXT:    v_mov_b32_e32 v1, s9
; GFX8-NEXT:    s_lshr_b32 s4, s4, 24
; GFX8-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
; GFX8-NEXT:    s_lshr_b32 s3, s3, 24
; GFX8-NEXT:    v_mov_b32_e32 v1, s4
; GFX8-NEXT:    v_mad_u32_u24 v2, s3, v1, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-NODL-LABEL: udot4_acc32:
; GFX9-NODL:       ; %bb.0: ; %entry
; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT:    s_movk_i32 s2, 0xff
; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
; GFX9-NODL-NEXT:    s_load_dword s10, s[0:1], 0x0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    s_and_b32 s5, s3, s2
; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
; GFX9-NODL-NEXT:    s_bfe_u32 s7, s4, 0x80008
; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s10
; GFX9-NODL-NEXT:    s_bfe_u32 s9, s4, 0x80010
; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s5, v1, v2
; GFX9-NODL-NEXT:    s_bfe_u32 s6, s3, 0x80008
; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s7
; GFX9-NODL-NEXT:    s_bfe_u32 s8, s3, 0x80010
; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s6, v2, v1
; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s9
; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 24
; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s8, v2, v1
; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 24
; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s4
; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: udot4_acc32:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
; GFX9-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, s4, v1, v2
; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-LABEL: udot4_acc32:
; GFX10-DL:       ; %bb.0: ; %entry
; GFX10-DL-NEXT:    s_clause 0x1
; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, s0, s1, v0
; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
; GFX10-DL-NEXT:    s_endpgm
                                       <4 x i8> addrspace(1)* %src2,
                                       i32 addrspace(1)* nocapture %dst) {
entry:
  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2

  ; Lane 0: zero-extend both bytes to i32 and multiply.
  %v1e0 = extractelement <4 x i8> %vec1, i64 0
  %cv1e0 = zext i8 %v1e0 to i32
  %v2e0 = extractelement <4 x i8> %vec2, i64 0
  %cv2e0 = zext i8 %v2e0 to i32
  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0

  ; Lane 1.
  %v1e1 = extractelement <4 x i8> %vec1, i64 1
  %cv1e1 = zext i8 %v1e1 to i32
  %v2e1 = extractelement <4 x i8> %vec2, i64 1
  %cv2e1 = zext i8 %v2e1 to i32
  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1

  ; Lane 2.
  %v1e2 = extractelement <4 x i8> %vec1, i64 2
  %cv1e2 = zext i8 %v1e2 to i32
  %v2e2 = extractelement <4 x i8> %vec2, i64 2
  %cv2e2 = zext i8 %v2e2 to i32
  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2

  ; Lane 3.
  %v1e3 = extractelement <4 x i8> %vec1, i64 3
  %cv1e3 = zext i8 %v1e3 to i32
  %v2e3 = extractelement <4 x i8> %vec2, i64 3
  %cv2e3 = zext i8 %v2e3 to i32
  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3

  ; The accumulator is the value already stored at %dst; the products are
  ; added to it in lane order.
  %acc = load i32, i32 addrspace(1)* %dst, align 4
  %mad1 = add i32 %mul1, %acc
  %mad2 = add i32 %mad1, %mul2
  %mad3 = add i32 %mad2, %mul3
  %mad4 = add i32 %mad3, %mul4

  store i32 %mad4, i32 addrspace(1)* %dst, align 4
  ret void
}

; udot4_acc16: same multiply-add chain as udot4_acc32, but the lanes are
; zero-extended to i16 and the accumulator at %dst is an i16.
; The gfx906 and gfx1011/gfx1012 runs still expect a single v_dot4_u32_u8;
; the other runs expect four v_mad_u32_u24.
define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1,
; GFX7-LABEL: udot4_acc16:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_movk_i32 s8, 0xff
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_and_b32 s7, s4, s8
; GFX7-NEXT:    s_and_b32 s6, s5, s8
; GFX7-NEXT:    s_bfe_u32 s8, s5, 0x80008
; GFX7-NEXT:    v_mov_b32_e32 v1, s6
; GFX7-NEXT:    s_bfe_u32 s10, s5, 0x80010
; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x80008
; GFX7-NEXT:    v_mov_b32_e32 v2, s8
; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x80010
; GFX7-NEXT:    s_lshr_b32 s5, s5, 24
; GFX7-NEXT:    v_mov_b32_e32 v3, s10
; GFX7-NEXT:    s_lshr_b32 s4, s4, 24
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v1, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v2, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: udot4_acc16:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
; GFX8-NEXT:    s_load_dword s1, s[4:5], 0x0
; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT:    s_movk_i32 s0, 0xff
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_and_b32 s3, s1, s0
; GFX8-NEXT:    s_and_b32 s0, s2, s0
; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x80008
; GFX8-NEXT:    v_mov_b32_e32 v3, s0
; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x80010
; GFX8-NEXT:    s_bfe_u32 s4, s1, 0x80008
; GFX8-NEXT:    v_mov_b32_e32 v4, s5
; GFX8-NEXT:    s_bfe_u32 s6, s1, 0x80010
; GFX8-NEXT:    s_lshr_b32 s2, s2, 24
; GFX8-NEXT:    v_mov_b32_e32 v5, s7
; GFX8-NEXT:    s_lshr_b32 s1, s1, 24
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mad_u32_u24 v2, s3, v3, v2
; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v4, v2
; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v5, v2
; GFX8-NEXT:    v_mov_b32_e32 v3, s2
; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
; GFX8-NEXT:    flat_store_short v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-NODL-LABEL: udot4_acc16:
; GFX9-NODL:       ; %bb.0: ; %entry
; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT:    s_movk_i32 s2, 0xff
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    global_load_ushort v1, v0, s[0:1]
; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    s_and_b32 s5, s3, s2
; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
; GFX9-NODL-NEXT:    s_bfe_u32 s7, s4, 0x80008
; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NODL-NEXT:    s_bfe_u32 s9, s4, 0x80010
; GFX9-NODL-NEXT:    s_bfe_u32 s6, s3, 0x80008
; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s7
; GFX9-NODL-NEXT:    s_bfe_u32 s8, s3, 0x80010
; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 24
; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s9
; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 24
; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s5, v2, v1
; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s6, v3, v1
; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s8, v4, v1
; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s4
; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
; GFX9-NODL-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NODL-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: udot4_acc16:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT:    global_load_ushort v1, v0, s[0:1]
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, s2, v2, v1
; GFX9-DL-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-LABEL: udot4_acc16:
; GFX10-DL:       ; %bb.0: ; %entry
; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-DL-NEXT:    v_dot4_u32_u8 v1, s0, s1, v1
; GFX10-DL-NEXT:    global_store_short v0, v1, s[4:5]
; GFX10-DL-NEXT:    s_endpgm
                                       <4 x i8> addrspace(1)* %src2,
                                       i16 addrspace(1)* nocapture %dst) {
entry:
  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2

  ; Lane 0: zero-extend both bytes to i16 and multiply.
  %v1e0 = extractelement <4 x i8> %vec1, i64 0
  %cv1e0 = zext i8 %v1e0 to i16
  %v2e0 = extractelement <4 x i8> %vec2, i64 0
  %cv2e0 = zext i8 %v2e0 to i16
  %mul1 = mul nuw nsw i16 %cv1e0, %cv2e0

  ; Lane 1.
  %v1e1 = extractelement <4 x i8> %vec1, i64 1
  %cv1e1 = zext i8 %v1e1 to i16
  %v2e1 = extractelement <4 x i8> %vec2, i64 1
  %cv2e1 = zext i8 %v2e1 to i16
  %mul2 = mul nuw nsw i16 %cv1e1, %cv2e1

  ; Lane 2.
  %v1e2 = extractelement <4 x i8> %vec1, i64 2
  %cv1e2 = zext i8 %v1e2 to i16
  %v2e2 = extractelement <4 x i8> %vec2, i64 2
  %cv2e2 = zext i8 %v2e2 to i16
  %mul3 = mul nuw nsw i16 %cv1e2, %cv2e2

  ; Lane 3.
  %v1e3 = extractelement <4 x i8> %vec1, i64 3
  %cv1e3 = zext i8 %v1e3 to i16
  %v2e3 = extractelement <4 x i8> %vec2, i64 3
  %cv2e3 = zext i8 %v2e3 to i16
  %mul4 = mul nuw nsw i16 %cv1e3, %cv2e3

  ; i16 accumulator read from and written back to %dst.
  %acc = load i16, i16 addrspace(1)* %dst, align 2
  %mad1 = add i16 %mul1, %acc
  %mad2 = add i16 %mad1, %mul2
  %mad3 = add i16 %mad2, %mul3
  %mad4 = add i16 %mad3, %mul4

  store i16 %mad4, i16 addrspace(1)* %dst, align 2
  ret void
}

; udot4_acc8: the four lane products are formed directly in i8 (no zext) and
; added to the i8 value loaded from %dst.
; The gfx906 and gfx1011/gfx1012 runs expect a single v_dot4_u32_u8 followed
; by a byte store; the other runs expect four v_mad_u32_u24.
define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1,
; GFX7-LABEL: udot4_acc8:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_movk_i32 s8, 0xff
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_and_b32 s7, s4, s8
; GFX7-NEXT:    s_and_b32 s6, s5, s8
; GFX7-NEXT:    s_bfe_u32 s8, s5, 0x80008
; GFX7-NEXT:    v_mov_b32_e32 v1, s6
; GFX7-NEXT:    s_bfe_u32 s10, s5, 0x80010
; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x80008
; GFX7-NEXT:    v_mov_b32_e32 v2, s8
; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x80010
; GFX7-NEXT:    s_lshr_b32 s5, s5, 24
; GFX7-NEXT:    v_mov_b32_e32 v3, s10
; GFX7-NEXT:    s_lshr_b32 s4, s4, 24
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v1, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v2, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: udot4_acc8:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
; GFX8-NEXT:    s_load_dword s1, s[4:5], 0x0
; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT:    s_movk_i32 s0, 0xff
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_bfe_u32 s5, s1, 0x80008
; GFX8-NEXT:    s_and_b32 s3, s2, s0
; GFX8-NEXT:    s_bfe_u32 s4, s2, 0x80008
; GFX8-NEXT:    s_and_b32 s0, s1, s0
; GFX8-NEXT:    v_mov_b32_e32 v3, s3
; GFX8-NEXT:    s_bfe_u32 s6, s2, 0x80010
; GFX8-NEXT:    v_mov_b32_e32 v4, s4
; GFX8-NEXT:    s_bfe_u32 s7, s1, 0x80010
; GFX8-NEXT:    s_lshr_b32 s2, s2, 24
; GFX8-NEXT:    v_mov_b32_e32 v5, s6
; GFX8-NEXT:    s_lshr_b32 s1, s1, 24
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
; GFX8-NEXT:    v_mov_b32_e32 v3, s2
; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
; GFX8-NEXT:    flat_store_byte v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-NODL-LABEL: udot4_acc8:
; GFX9-NODL:       ; %bb.0: ; %entry
; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT:    s_movk_i32 s2, 0xff
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    global_load_ubyte v1, v0, s[0:1]
; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    s_bfe_u32 s7, s3, 0x80008
; GFX9-NODL-NEXT:    s_and_b32 s5, s4, s2
; GFX9-NODL-NEXT:    s_bfe_u32 s6, s4, 0x80008
; GFX9-NODL-NEXT:    s_and_b32 s2, s3, s2
; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
; GFX9-NODL-NEXT:    s_bfe_u32 s8, s4, 0x80010
; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s6
; GFX9-NODL-NEXT:    s_bfe_u32 s9, s3, 0x80010
; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 24
; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s8
; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 24
; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s7, v3, v1
; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s9, v4, v1
; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s4
; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
; GFX9-NODL-NEXT:    global_store_byte v0, v1, s[0:1]
; GFX9-NODL-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: udot4_acc8:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT:    global_load_ubyte v1, v0, s[0:1]
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, s2, v2, v1
; GFX9-DL-NEXT:    global_store_byte v0, v1, s[0:1]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-LABEL: udot4_acc8:
; GFX10-DL:       ; %bb.0: ; %entry
; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    global_load_ubyte v1, v0, s[4:5]
; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-DL-NEXT:    v_dot4_u32_u8 v1, s0, s1, v1
; GFX10-DL-NEXT:    global_store_byte v0, v1, s[4:5]
; GFX10-DL-NEXT:    s_endpgm
                                      <4 x i8> addrspace(1)* %src2,
                                      i8 addrspace(1)* nocapture %dst) {
entry:
  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2

  ; Each lane is multiplied in i8, with no widening before the multiply.
  %v1e0 = extractelement <4 x i8> %vec1, i64 0
  %v2e0 = extractelement <4 x i8> %vec2, i64 0
  %mul1 = mul nuw nsw i8 %v1e0, %v2e0

  %v1e1 = extractelement <4 x i8> %vec1, i64 1
  %v2e1 = extractelement <4 x i8> %vec2, i64 1
  %mul2 = mul nuw nsw i8 %v1e1, %v2e1

  %v1e2 = extractelement <4 x i8> %vec1, i64 2
  %v2e2 = extractelement <4 x i8> %vec2, i64 2
  %mul3 = mul nuw nsw i8 %v1e2, %v2e2

  %v1e3 = extractelement <4 x i8> %vec1, i64 3
  %v2e3 = extractelement <4 x i8> %vec2, i64 3
  %mul4 = mul nuw nsw i8 %v1e3, %v2e3

  ; i8 accumulator read from and written back to %dst.
  %acc = load i8, i8 addrspace(1)* %dst, align 2
  %mad1 = add i8 %mul1, %acc
  %mad2 = add i8 %mad1, %mul2
  %mad3 = add i8 %mad2, %mul3
  %mad4 = add i8 %mad3, %mul4

  store i8 %mad4, i8 addrspace(1)* %dst, align 2
  ret void
}

; TODO: Generate udot4?
; udot2_8: only lanes 0 and 1 of the two <4 x i8> vectors are multiplied (in
; i8) and added to the i8 value loaded from %dst.
; Every run line, including gfx906 and gfx1011/gfx1012, currently expects two
; v_mad_u32_u24 and no dot instruction.
define amdgpu_kernel void @udot2_8(<4 x i8> addrspace(1)* %src1,
; GFX7-LABEL: udot2_8:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_movk_i32 s8, 0xff
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_and_b32 s7, s4, s8
; GFX7-NEXT:    s_and_b32 s6, s5, s8
; GFX7-NEXT:    v_mov_b32_e32 v1, s6
; GFX7-NEXT:    s_bfe_u32 s5, s5, 0x80008
; GFX7-NEXT:    s_bfe_u32 s4, s4, 0x80008
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v1, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: udot2_8:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
; GFX8-NEXT:    s_load_dword s1, s[4:5], 0x0
; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT:    s_movk_i32 s0, 0xff
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_and_b32 s3, s2, s0
; GFX8-NEXT:    s_and_b32 s0, s1, s0
; GFX8-NEXT:    v_mov_b32_e32 v3, s3
; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x80008
; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x80008
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
; GFX8-NEXT:    v_mov_b32_e32 v3, s2
; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
; GFX8-NEXT:    flat_store_byte v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-NODL-LABEL: udot2_8:
; GFX9-NODL:       ; %bb.0: ; %entry
; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT:    s_movk_i32 s2, 0xff
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    global_load_ubyte v1, v0, s[0:1]
; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    s_and_b32 s5, s4, s2
; GFX9-NODL-NEXT:    s_and_b32 s2, s3, s2
; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
; GFX9-NODL-NEXT:    s_bfe_u32 s4, s4, 0x80008
; GFX9-NODL-NEXT:    s_bfe_u32 s3, s3, 0x80008
; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s4
; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
; GFX9-NODL-NEXT:    global_store_byte v0, v1, s[0:1]
; GFX9-NODL-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: udot2_8:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_movk_i32 s2, 0xff
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    global_load_ubyte v1, v0, s[0:1]
; GFX9-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
; GFX9-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    s_and_b32 s5, s4, s2
; GFX9-DL-NEXT:    s_and_b32 s2, s3, s2
; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s5
; GFX9-DL-NEXT:    s_bfe_u32 s4, s4, 0x80008
; GFX9-DL-NEXT:    s_bfe_u32 s3, s3, 0x80008
; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s4
; GFX9-DL-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
; GFX9-DL-NEXT:    global_store_byte v0, v1, s[0:1]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-LABEL: udot2_8:
; GFX10-DL:       ; %bb.0: ; %entry
; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    global_load_ubyte v1, v0, s[4:5]
; GFX10-DL-NEXT:    s_load_dword s2, s[2:3], 0x0
; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT:    s_movk_i32 s1, 0xff
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    s_and_b32 s3, s2, s1
; GFX10-DL-NEXT:    s_and_b32 s1, s0, s1
; GFX10-DL-NEXT:    s_bfe_u32 s0, s0, 0x80008
; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s1, s3, v1
; GFX10-DL-NEXT:    s_bfe_u32 s1, s2, 0x80008
; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s0, s1, v1
; GFX10-DL-NEXT:    global_store_byte v0, v1, s[4:5]
; GFX10-DL-NEXT:    s_endpgm
                                   <4 x i8> addrspace(1)* %src2,
                                   i8 addrspace(1)* nocapture %dst) {
entry:
  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2

  ; Only the two low lanes take part; lanes 2 and 3 are never extracted.
  %v1e0 = extractelement <4 x i8> %vec1, i64 0
  %v2e0 = extractelement <4 x i8> %vec2, i64 0
  %mul1 = mul nuw nsw i8 %v1e0, %v2e0

  %v1e1 = extractelement <4 x i8> %vec1, i64 1
  %v2e1 = extractelement <4 x i8> %vec2, i64 1
  %mul2 = mul nuw nsw i8 %v1e1, %v2e1

  ; i8 accumulator read from and written back to %dst.
  %acc = load i8, i8 addrspace(1)* %dst, align 2
  %mad1 = add i8 %mul1, %acc
  %mad2 = add i8 %mad1, %mul2
  store i8 %mad2, i8 addrspace(1)* %dst, align 2
  ret void
}

; udot4_CommutationInsideMAD: four-lane i8 multiply-add chain in which every
; mul takes the %vec2 lane as its first operand and every add after the first
; takes the new product as its first operand.
; The gfx906 and gfx1011/gfx1012 runs still expect a single v_dot4_u32_u8;
; the other runs expect four v_mad_u32_u24.
define amdgpu_kernel void @udot4_CommutationInsideMAD(<4 x i8> addrspace(1)* %src1,
; GFX7-LABEL: udot4_CommutationInsideMAD:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_movk_i32 s8, 0xff
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_and_b32 s6, s4, s8
; GFX7-NEXT:    s_and_b32 s7, s5, s8
; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x80008
; GFX7-NEXT:    v_mov_b32_e32 v1, s6
; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x80010
; GFX7-NEXT:    s_bfe_u32 s9, s5, 0x80008
; GFX7-NEXT:    v_mov_b32_e32 v2, s8
; GFX7-NEXT:    s_bfe_u32 s11, s5, 0x80010
; GFX7-NEXT:    s_lshr_b32 s4, s4, 24
; GFX7-NEXT:    v_mov_b32_e32 v3, s10
; GFX7-NEXT:    s_lshr_b32 s5, s5, 24
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v1, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v2, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, s4
; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: udot4_CommutationInsideMAD:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
; GFX8-NEXT:    s_load_dword s1, s[4:5], 0x0
; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT:    s_movk_i32 s0, 0xff
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_and_b32 s3, s1, s0
; GFX8-NEXT:    s_bfe_u32 s4, s1, 0x80008
; GFX8-NEXT:    s_and_b32 s0, s2, s0
; GFX8-NEXT:    v_mov_b32_e32 v3, s3
; GFX8-NEXT:    s_bfe_u32 s6, s1, 0x80010
; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x80008
; GFX8-NEXT:    v_mov_b32_e32 v4, s4
; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x80010
; GFX8-NEXT:    s_lshr_b32 s1, s1, 24
; GFX8-NEXT:    v_mov_b32_e32 v5, s6
; GFX8-NEXT:    s_lshr_b32 s2, s2, 24
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
; GFX8-NEXT:    v_mov_b32_e32 v3, s1
; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
; GFX8-NEXT:    flat_store_byte v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-NODL-LABEL: udot4_CommutationInsideMAD:
; GFX9-NODL:       ; %bb.0: ; %entry
; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT:    s_movk_i32 s2, 0xff
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    global_load_ubyte v1, v0, s[0:1]
; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    s_and_b32 s5, s3, s2
; GFX9-NODL-NEXT:    s_bfe_u32 s6, s3, 0x80008
; GFX9-NODL-NEXT:    s_and_b32 s2, s4, s2
; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
; GFX9-NODL-NEXT:    s_bfe_u32 s8, s3, 0x80010
; GFX9-NODL-NEXT:    s_bfe_u32 s7, s4, 0x80008
; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s6
; GFX9-NODL-NEXT:    s_bfe_u32 s9, s4, 0x80010
; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 24
; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s8
; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 24
; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s7, v3, v1
; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s9, v4, v1
; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s3
; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s4, v2, v1
; GFX9-NODL-NEXT:    global_store_byte v0, v1, s[0:1]
; GFX9-NODL-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: udot4_CommutationInsideMAD:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT:    global_load_ubyte v1, v0, s[0:1]
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, s3, v2, v1
; GFX9-DL-NEXT:    global_store_byte v0, v1, s[0:1]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-LABEL: udot4_CommutationInsideMAD:
; GFX10-DL:       ; %bb.0: ; %entry
; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    global_load_ubyte v1, v0, s[4:5]
; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-DL-NEXT:    v_dot4_u32_u8 v1, s1, s0, v1
; GFX10-DL-NEXT:    global_store_byte v0, v1, s[4:5]
; GFX10-DL-NEXT:    s_endpgm
                                                      <4 x i8> addrspace(1)* %src2,
                                                      i8 addrspace(1)* nocapture %dst) {
entry:
  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2

  ; Multiplies are written as %vec2 lane * %vec1 lane (commuted operands).
  %v1e0 = extractelement <4 x i8> %vec1, i64 0
  %v2e0 = extractelement <4 x i8> %vec2, i64 0
  %mul1 = mul nuw nsw i8 %v2e0, %v1e0

  %v1e1 = extractelement <4 x i8> %vec1, i64 1
  %v2e1 = extractelement <4 x i8> %vec2, i64 1
  %mul2 = mul nuw nsw i8 %v2e1, %v1e1

  %v1e2 = extractelement <4 x i8> %vec1, i64 2
  %v2e2 = extractelement <4 x i8> %vec2, i64 2
  %mul3 = mul nuw nsw i8 %v2e2, %v1e2

  %v1e3 = extractelement <4 x i8> %vec1, i64 3
  %v2e3 = extractelement <4 x i8> %vec2, i64 3
  %mul4 = mul nuw nsw i8 %v2e3, %v1e3

  ; Adds are commuted as well: accumulator first for %mad1, product first for
  ; the remaining three.
  %acc = load i8, i8 addrspace(1)* %dst, align 2
  %mad1 = add i8 %acc, %mul1
  %mad2 = add i8 %mul2, %mad1
  %mad3 = add i8 %mul3, %mad2
  %mad4 = add i8 %mul4, %mad3

  store i8 %mad4, i8 addrspace(1)* %dst, align 2
  ret void
}

; TODO: Support commutation across the adds.
; ---------------------------------------------------------------------------
; Overview of the five kernels defined in the lines that follow. Each kernel
; loads two <4 x i8> vectors from %src1 and %src2, multiplies them element by
; element, adds the products to a value loaded from %dst and stores the sum
; back to %dst. The notes only summarise the IR and the autogenerated
; assertions; none of the assertions for these kernels contains v_dot4_u32_u8.
;
; * udot4_CommutationAccrossMADs - i8 products, i8 accumulator. The add chain
;   consumes the products in the order mul2, mul1, mul3, mul4. All checked
;   targets (GFX7, GFX8, GFX9-NODL, GFX9-DL, GFX10-DL) expect four
;   v_mad_u32_u24.
; * udot4_multiuse_mul1 - elements are zero-extended to i32; %mul1 is added
;   twice (once in %add, once in %add2). All checked targets expect five
;   v_mad_u32_u24.
; * udot4_multiuse_add1 - elements are zero-extended to i32; the partial sum
;   %add1 has two users (%add and %add2) and %add is added to the final
;   result. All checked targets expect four v_mad_u32_u24 plus two vector
;   adds.
; * notdot4_mixedtypes - i16 accumulator; element 0 of each vector is
;   sign-extended while elements 1-3 are zero-extended. GFX8 and newer expect
;   one v_mad_i32_i24 and three v_mad_u32_u24; GFX7 expects four
;   v_mad_u32_u24 fed by s_sext_i32_i8 results masked with 0xffff.
; * udot4_acc32_vecMul - both vectors are zero-extended to <4 x i32> and
;   multiplied with a single vector mul before the lanes are extracted and
;   summed with an i32 accumulator. All checked targets expect four
;   v_mad_u32_u24.
; ---------------------------------------------------------------------------
793define amdgpu_kernel void @udot4_CommutationAccrossMADs(<4 x i8> addrspace(1)* %src1, 794; GFX7-LABEL: udot4_CommutationAccrossMADs: 795; GFX7: ; %bb.0: ; %entry 796; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 797; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 798; GFX7-NEXT: s_mov_b32 s3, 0xf000 799; GFX7-NEXT: s_mov_b32 s2, -1 800; GFX7-NEXT: s_movk_i32 s8, 0xff 801; GFX7-NEXT: s_waitcnt lgkmcnt(0) 802; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 803; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 804; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 805; GFX7-NEXT: s_waitcnt lgkmcnt(0) 806; GFX7-NEXT: s_and_b32 s6, s4, s8 807; GFX7-NEXT: s_and_b32 s7, s5, s8 808; GFX7-NEXT: s_bfe_u32 s8, s4, 0x80008 809; GFX7-NEXT: s_bfe_u32 s9, s5, 0x80008 810; GFX7-NEXT: v_mov_b32_e32 v1, s8 811; GFX7-NEXT: s_bfe_u32 s10, s4, 0x80010 812; GFX7-NEXT: v_mov_b32_e32 v2, s6 813; GFX7-NEXT: s_bfe_u32 s11, s5, 0x80010 814; GFX7-NEXT: s_lshr_b32 s4, s4, 24 815; GFX7-NEXT: v_mov_b32_e32 v3, s10 816; GFX7-NEXT: s_lshr_b32 s5, s5, 24 817; GFX7-NEXT: s_waitcnt vmcnt(0) 818; GFX7-NEXT: v_mad_u32_u24 v0, s9, v1, v0 819; GFX7-NEXT: v_mad_u32_u24 v0, s7, v2, v0 820; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 821; GFX7-NEXT: v_mov_b32_e32 v1, s4 822; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0 823; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 824; GFX7-NEXT: s_endpgm 825; 826; GFX8-LABEL: udot4_CommutationAccrossMADs: 827; GFX8: ; %bb.0: ; %entry 828; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 829; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 830; GFX8-NEXT: s_waitcnt lgkmcnt(0) 831; GFX8-NEXT: v_mov_b32_e32 v0, s0 832; GFX8-NEXT: v_mov_b32_e32 v1, s1 833; GFX8-NEXT: flat_load_ubyte v2, v[0:1] 834; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 835; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 836; GFX8-NEXT: s_movk_i32 s0, 0xff 837; GFX8-NEXT: s_waitcnt lgkmcnt(0) 838; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008 839; GFX8-NEXT: s_and_b32 s3, s1, s0 840; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008 841; GFX8-NEXT: 
v_mov_b32_e32 v3, s4 842; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010 843; GFX8-NEXT: s_and_b32 s0, s2, s0 844; GFX8-NEXT: v_mov_b32_e32 v4, s3 845; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010 846; GFX8-NEXT: s_lshr_b32 s1, s1, 24 847; GFX8-NEXT: v_mov_b32_e32 v5, s6 848; GFX8-NEXT: s_lshr_b32 s2, s2, 24 849; GFX8-NEXT: s_waitcnt vmcnt(0) 850; GFX8-NEXT: v_mad_u32_u24 v2, s5, v3, v2 851; GFX8-NEXT: v_mad_u32_u24 v2, s0, v4, v2 852; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2 853; GFX8-NEXT: v_mov_b32_e32 v3, s1 854; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 855; GFX8-NEXT: flat_store_byte v[0:1], v2 856; GFX8-NEXT: s_endpgm 857; 858; GFX9-NODL-LABEL: udot4_CommutationAccrossMADs: 859; GFX9-NODL: ; %bb.0: ; %entry 860; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 861; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 862; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 863; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff 864; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 865; GFX9-NODL-NEXT: global_load_ubyte v1, v0, s[0:1] 866; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 867; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 868; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 869; GFX9-NODL-NEXT: s_bfe_u32 s6, s3, 0x80008 870; GFX9-NODL-NEXT: s_and_b32 s5, s3, s2 871; GFX9-NODL-NEXT: s_bfe_u32 s7, s4, 0x80008 872; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 873; GFX9-NODL-NEXT: s_bfe_u32 s8, s3, 0x80010 874; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 875; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5 876; GFX9-NODL-NEXT: s_bfe_u32 s9, s4, 0x80010 877; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 878; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s8 879; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24 880; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 881; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s7, v2, v1 882; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v3, v1 883; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s9, v4, v1 884; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 885; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 886; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1] 887; GFX9-NODL-NEXT: s_endpgm 888; 889; 
GFX9-DL-LABEL: udot4_CommutationAccrossMADs: 890; GFX9-DL: ; %bb.0: ; %entry 891; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 892; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 893; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 894; GFX9-DL-NEXT: s_movk_i32 s2, 0xff 895; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 896; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] 897; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 898; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 899; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 900; GFX9-DL-NEXT: s_bfe_u32 s6, s3, 0x80008 901; GFX9-DL-NEXT: s_and_b32 s5, s3, s2 902; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x80008 903; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6 904; GFX9-DL-NEXT: s_bfe_u32 s8, s3, 0x80010 905; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 906; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 907; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x80010 908; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 24 909; GFX9-DL-NEXT: v_mov_b32_e32 v4, s8 910; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 24 911; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 912; GFX9-DL-NEXT: v_mad_u32_u24 v1, s7, v2, v1 913; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v3, v1 914; GFX9-DL-NEXT: v_mad_u32_u24 v1, s9, v4, v1 915; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 916; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 917; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] 918; GFX9-DL-NEXT: s_endpgm 919; 920; GFX10-DL-LABEL: udot4_CommutationAccrossMADs: 921; GFX10-DL: ; %bb.0: ; %entry 922; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 923; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 924; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 925; GFX10-DL-NEXT: s_movk_i32 s6, 0xff 926; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 927; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] 928; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 929; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 930; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 931; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80008 932; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80008 933; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 934; GFX10-DL-NEXT: v_mad_u32_u24 v1, s3, s2, v1 
935; GFX10-DL-NEXT: s_and_b32 s2, s0, s6 936; GFX10-DL-NEXT: s_and_b32 s3, s1, s6 937; GFX10-DL-NEXT: v_mad_u32_u24 v1, s3, s2, v1 938; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80010 939; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80010 940; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 941; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 942; GFX10-DL-NEXT: v_mad_u32_u24 v1, s3, s2, v1 943; GFX10-DL-NEXT: v_mad_u32_u24 v1, s1, s0, v1 944; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5] 945; GFX10-DL-NEXT: s_endpgm 946 <4 x i8> addrspace(1)* %src2, 947 i8 addrspace(1)* nocapture %dst) { 948entry: 949 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1 950 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2 951 952 %v1e0 = extractelement <4 x i8> %vec1, i64 0 953 %v2e0 = extractelement <4 x i8> %vec2, i64 0 954 %mul1 = mul nuw nsw i8 %v2e0, %v1e0 955 956 %v1e1 = extractelement <4 x i8> %vec1, i64 1 957 %v2e1 = extractelement <4 x i8> %vec2, i64 1 958 %mul2 = mul nuw nsw i8 %v2e1, %v1e1 959 960 %v1e2 = extractelement <4 x i8> %vec1, i64 2 961 %v2e2 = extractelement <4 x i8> %vec2, i64 2 962 %mul3 = mul nuw nsw i8 %v2e2, %v1e2 963 964 %v1e3 = extractelement <4 x i8> %vec1, i64 3 965 %v2e3 = extractelement <4 x i8> %vec2, i64 3 966 %mul4 = mul nuw nsw i8 %v2e3, %v1e3 967 968 %acc = load i8, i8 addrspace(1)* %dst, align 2 969 %mad1 = add i8 %acc, %mul2 970 %mad2 = add i8 %mad1, %mul1 971 %mad3 = add i8 %mad2, %mul3 972 %mad4 = add i8 %mad3, %mul4 973 974 store i8 %mad4, i8 addrspace(1)* %dst, align 2 975 ret void 976} 977 978define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1, 979; GFX7-LABEL: udot4_multiuse_mul1: 980; GFX7: ; %bb.0: ; %entry 981; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 982; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 983; GFX7-NEXT: s_movk_i32 s8, 0xff 984; GFX7-NEXT: s_mov_b32 s3, 0xf000 985; GFX7-NEXT: s_mov_b32 s2, -1 986; GFX7-NEXT: s_waitcnt lgkmcnt(0) 987; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 988; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 989; 
GFX7-NEXT: s_load_dword s12, s[0:1], 0x0 990; GFX7-NEXT: s_waitcnt lgkmcnt(0) 991; GFX7-NEXT: s_and_b32 s6, s4, s8 992; GFX7-NEXT: s_and_b32 s7, s5, s8 993; GFX7-NEXT: s_bfe_u32 s9, s5, 0x80008 994; GFX7-NEXT: v_mov_b32_e32 v0, s7 995; GFX7-NEXT: v_mov_b32_e32 v1, s12 996; GFX7-NEXT: s_bfe_u32 s8, s4, 0x80008 997; GFX7-NEXT: v_mad_u32_u24 v1, s6, v0, v1 998; GFX7-NEXT: v_mov_b32_e32 v2, s9 999; GFX7-NEXT: s_bfe_u32 s11, s5, 0x80010 1000; GFX7-NEXT: v_mad_u32_u24 v1, s8, v2, v1 1001; GFX7-NEXT: s_bfe_u32 s10, s4, 0x80010 1002; GFX7-NEXT: v_mad_u32_u24 v0, s6, v0, v1 1003; GFX7-NEXT: v_mov_b32_e32 v1, s11 1004; GFX7-NEXT: s_lshr_b32 s5, s5, 24 1005; GFX7-NEXT: v_mad_u32_u24 v0, s10, v1, v0 1006; GFX7-NEXT: s_lshr_b32 s4, s4, 24 1007; GFX7-NEXT: v_mov_b32_e32 v1, s5 1008; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 1009; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 1010; GFX7-NEXT: s_endpgm 1011; 1012; GFX8-LABEL: udot4_multiuse_mul1: 1013; GFX8: ; %bb.0: ; %entry 1014; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1015; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1016; GFX8-NEXT: s_movk_i32 s2, 0xff 1017; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1018; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0 1019; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 1020; GFX8-NEXT: s_load_dword s10, s[0:1], 0x0 1021; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1022; GFX8-NEXT: s_and_b32 s5, s3, s2 1023; GFX8-NEXT: s_and_b32 s2, s4, s2 1024; GFX8-NEXT: s_bfe_u32 s7, s4, 0x80008 1025; GFX8-NEXT: v_mov_b32_e32 v0, s2 1026; GFX8-NEXT: v_mov_b32_e32 v1, s10 1027; GFX8-NEXT: s_bfe_u32 s6, s3, 0x80008 1028; GFX8-NEXT: v_mad_u32_u24 v1, s5, v0, v1 1029; GFX8-NEXT: v_mov_b32_e32 v2, s7 1030; GFX8-NEXT: s_bfe_u32 s9, s4, 0x80010 1031; GFX8-NEXT: v_mad_u32_u24 v1, s6, v2, v1 1032; GFX8-NEXT: s_bfe_u32 s8, s3, 0x80010 1033; GFX8-NEXT: v_mad_u32_u24 v0, s5, v0, v1 1034; GFX8-NEXT: v_mov_b32_e32 v1, s9 1035; GFX8-NEXT: s_lshr_b32 s4, s4, 24 1036; GFX8-NEXT: v_mad_u32_u24 v0, s8, v1, v0 1037; GFX8-NEXT: s_lshr_b32 s3, s3, 24 
1038; GFX8-NEXT: v_mov_b32_e32 v1, s4 1039; GFX8-NEXT: v_mad_u32_u24 v2, s3, v1, v0 1040; GFX8-NEXT: v_mov_b32_e32 v0, s0 1041; GFX8-NEXT: v_mov_b32_e32 v1, s1 1042; GFX8-NEXT: flat_store_dword v[0:1], v2 1043; GFX8-NEXT: s_endpgm 1044; 1045; GFX9-NODL-LABEL: udot4_multiuse_mul1: 1046; GFX9-NODL: ; %bb.0: ; %entry 1047; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1048; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1049; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff 1050; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 1051; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1052; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 1053; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 1054; GFX9-NODL-NEXT: s_load_dword s10, s[0:1], 0x0 1055; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1056; GFX9-NODL-NEXT: s_and_b32 s5, s3, s2 1057; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 1058; GFX9-NODL-NEXT: s_bfe_u32 s7, s4, 0x80008 1059; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 1060; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s10 1061; GFX9-NODL-NEXT: s_bfe_u32 s6, s3, 0x80008 1062; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v1, v2 1063; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s7 1064; GFX9-NODL-NEXT: s_bfe_u32 s9, s4, 0x80010 1065; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s6, v3, v2 1066; GFX9-NODL-NEXT: s_bfe_u32 s8, s3, 0x80010 1067; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v1, v2 1068; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s9 1069; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24 1070; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s8, v2, v1 1071; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 1072; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 1073; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s3, v2, v1 1074; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] 1075; GFX9-NODL-NEXT: s_endpgm 1076; 1077; GFX9-DL-LABEL: udot4_multiuse_mul1: 1078; GFX9-DL: ; %bb.0: ; %entry 1079; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1080; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1081; GFX9-DL-NEXT: s_movk_i32 s2, 0xff 1082; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1083; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1084; 
GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 1085; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 1086; GFX9-DL-NEXT: s_load_dword s10, s[0:1], 0x0 1087; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1088; GFX9-DL-NEXT: s_and_b32 s5, s3, s2 1089; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 1090; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x80008 1091; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 1092; GFX9-DL-NEXT: v_mov_b32_e32 v2, s10 1093; GFX9-DL-NEXT: s_bfe_u32 s6, s3, 0x80008 1094; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v1, v2 1095; GFX9-DL-NEXT: v_mov_b32_e32 v3, s7 1096; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x80010 1097; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v3, v2 1098; GFX9-DL-NEXT: s_bfe_u32 s8, s3, 0x80010 1099; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v1, v2 1100; GFX9-DL-NEXT: v_mov_b32_e32 v2, s9 1101; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 24 1102; GFX9-DL-NEXT: v_mad_u32_u24 v1, s8, v2, v1 1103; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 24 1104; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 1105; GFX9-DL-NEXT: v_mad_u32_u24 v1, s3, v2, v1 1106; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] 1107; GFX9-DL-NEXT: s_endpgm 1108; 1109; GFX10-DL-LABEL: udot4_multiuse_mul1: 1110; GFX10-DL: ; %bb.0: ; %entry 1111; GFX10-DL-NEXT: s_clause 0x1 1112; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 1113; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1114; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 1115; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1116; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 1117; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 1118; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 1119; GFX10-DL-NEXT: s_movk_i32 s2, 0xff 1120; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1121; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 1122; GFX10-DL-NEXT: s_and_b32 s3, s0, s2 1123; GFX10-DL-NEXT: s_and_b32 s2, s1, s2 1124; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x80008 1125; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x80008 1126; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0 1127; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s7, v0 1128; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0 1129; 
GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80010 1130; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80010 1131; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 1132; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 1133; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0 1134; GFX10-DL-NEXT: v_mad_u32_u24 v0, s0, s1, v0 1135; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] 1136; GFX10-DL-NEXT: s_endpgm 1137 <4 x i8> addrspace(1)* %src2, 1138 i32 addrspace(1)* nocapture %dst) { 1139entry: 1140 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1 1141 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2 1142 1143 %v1e0 = extractelement <4 x i8> %vec1, i64 0 1144 %cv1e0 = zext i8 %v1e0 to i32 1145 %v2e0 = extractelement <4 x i8> %vec2, i64 0 1146 %cv2e0 = zext i8 %v2e0 to i32 1147 %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0 1148 1149 %v1e1 = extractelement <4 x i8> %vec1, i64 1 1150 %cv1e1 = zext i8 %v1e1 to i32 1151 %v2e1 = extractelement <4 x i8> %vec2, i64 1 1152 %cv2e1 = zext i8 %v2e1 to i32 1153 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 1154 1155 %v1e2 = extractelement <4 x i8> %vec1, i64 2 1156 %cv1e2 = zext i8 %v1e2 to i32 1157 %v2e2 = extractelement <4 x i8> %vec2, i64 2 1158 %cv2e2 = zext i8 %v2e2 to i32 1159 %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2 1160 1161 %v1e3 = extractelement <4 x i8> %vec1, i64 3 1162 %cv1e3 = zext i8 %v1e3 to i32 1163 %v2e3 = extractelement <4 x i8> %vec2, i64 3 1164 %cv2e3 = zext i8 %v2e3 to i32 1165 %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3 1166 1167 %acc = load i32, i32 addrspace(1)* %dst, align 4 1168 %add = add i32 %mul1, %acc 1169 %add1 = add i32 %mul2, %add 1170 %add2 = add i32 %add1, %mul1 1171 %add3 = add i32 %add2, %mul3 1172 %add4 = add i32 %add3, %mul4 1173 1174 store i32 %add4, i32 addrspace(1)* %dst, align 4 1175 ret void 1176} 1177 1178define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1, 1179; GFX7-LABEL: udot4_multiuse_add1: 1180; GFX7: ; %bb.0: ; %entry 1181; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1182; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 
1183; GFX7-NEXT: s_movk_i32 s8, 0xff 1184; GFX7-NEXT: s_mov_b32 s3, 0xf000 1185; GFX7-NEXT: s_mov_b32 s2, -1 1186; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1187; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 1188; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 1189; GFX7-NEXT: s_load_dword s12, s[0:1], 0x0 1190; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1191; GFX7-NEXT: s_and_b32 s6, s4, s8 1192; GFX7-NEXT: s_bfe_u32 s9, s5, 0x80008 1193; GFX7-NEXT: s_and_b32 s7, s5, s8 1194; GFX7-NEXT: s_bfe_u32 s8, s4, 0x80008 1195; GFX7-NEXT: v_mov_b32_e32 v0, s9 1196; GFX7-NEXT: v_mov_b32_e32 v1, s12 1197; GFX7-NEXT: v_mad_u32_u24 v0, s8, v0, v1 1198; GFX7-NEXT: s_bfe_u32 s11, s5, 0x80010 1199; GFX7-NEXT: v_mov_b32_e32 v2, s7 1200; GFX7-NEXT: s_bfe_u32 s10, s4, 0x80010 1201; GFX7-NEXT: v_add_i32_e32 v1, vcc, s12, v0 1202; GFX7-NEXT: v_mad_u32_u24 v0, s6, v2, v0 1203; GFX7-NEXT: v_mov_b32_e32 v2, s11 1204; GFX7-NEXT: s_lshr_b32 s5, s5, 24 1205; GFX7-NEXT: v_mad_u32_u24 v0, s10, v2, v0 1206; GFX7-NEXT: s_lshr_b32 s4, s4, 24 1207; GFX7-NEXT: v_mov_b32_e32 v2, s5 1208; GFX7-NEXT: v_mad_u32_u24 v0, s4, v2, v0 1209; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 1210; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 1211; GFX7-NEXT: s_endpgm 1212; 1213; GFX8-LABEL: udot4_multiuse_add1: 1214; GFX8: ; %bb.0: ; %entry 1215; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1216; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1217; GFX8-NEXT: s_movk_i32 s2, 0xff 1218; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1219; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0 1220; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 1221; GFX8-NEXT: s_load_dword s10, s[0:1], 0x0 1222; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1223; GFX8-NEXT: s_and_b32 s5, s3, s2 1224; GFX8-NEXT: s_bfe_u32 s7, s4, 0x80008 1225; GFX8-NEXT: s_and_b32 s2, s4, s2 1226; GFX8-NEXT: s_bfe_u32 s6, s3, 0x80008 1227; GFX8-NEXT: v_mov_b32_e32 v0, s7 1228; GFX8-NEXT: v_mov_b32_e32 v1, s10 1229; GFX8-NEXT: v_mad_u32_u24 v0, s6, v0, v1 1230; GFX8-NEXT: s_bfe_u32 s9, s4, 0x80010 1231; GFX8-NEXT: v_mov_b32_e32 
v2, s2 1232; GFX8-NEXT: s_bfe_u32 s8, s3, 0x80010 1233; GFX8-NEXT: v_add_u32_e32 v1, vcc, s10, v0 1234; GFX8-NEXT: v_mad_u32_u24 v0, s5, v2, v0 1235; GFX8-NEXT: v_mov_b32_e32 v2, s9 1236; GFX8-NEXT: s_lshr_b32 s4, s4, 24 1237; GFX8-NEXT: v_mad_u32_u24 v0, s8, v2, v0 1238; GFX8-NEXT: s_lshr_b32 s3, s3, 24 1239; GFX8-NEXT: v_mov_b32_e32 v2, s4 1240; GFX8-NEXT: v_mad_u32_u24 v0, s3, v2, v0 1241; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1 1242; GFX8-NEXT: v_mov_b32_e32 v0, s0 1243; GFX8-NEXT: v_mov_b32_e32 v1, s1 1244; GFX8-NEXT: flat_store_dword v[0:1], v2 1245; GFX8-NEXT: s_endpgm 1246; 1247; GFX9-NODL-LABEL: udot4_multiuse_add1: 1248; GFX9-NODL: ; %bb.0: ; %entry 1249; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1250; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1251; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff 1252; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 1253; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1254; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 1255; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 1256; GFX9-NODL-NEXT: s_load_dword s10, s[0:1], 0x0 1257; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1258; GFX9-NODL-NEXT: s_and_b32 s5, s3, s2 1259; GFX9-NODL-NEXT: s_bfe_u32 s7, s4, 0x80008 1260; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 1261; GFX9-NODL-NEXT: s_bfe_u32 s6, s3, 0x80008 1262; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7 1263; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s10 1264; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s6, v1, v2 1265; GFX9-NODL-NEXT: s_bfe_u32 s9, s4, 0x80010 1266; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 1267; GFX9-NODL-NEXT: s_bfe_u32 s8, s3, 0x80010 1268; GFX9-NODL-NEXT: v_add_u32_e32 v2, s10, v1 1269; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v3, v1 1270; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s9 1271; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24 1272; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s8, v3, v1 1273; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 1274; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s4 1275; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s3, v3, v1 1276; GFX9-NODL-NEXT: v_add_u32_e32 v1, v1, v2 1277; 
GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] 1278; GFX9-NODL-NEXT: s_endpgm 1279; 1280; GFX9-DL-LABEL: udot4_multiuse_add1: 1281; GFX9-DL: ; %bb.0: ; %entry 1282; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1283; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1284; GFX9-DL-NEXT: s_movk_i32 s2, 0xff 1285; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1286; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1287; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 1288; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 1289; GFX9-DL-NEXT: s_load_dword s10, s[0:1], 0x0 1290; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1291; GFX9-DL-NEXT: s_and_b32 s5, s3, s2 1292; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x80008 1293; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 1294; GFX9-DL-NEXT: s_bfe_u32 s6, s3, 0x80008 1295; GFX9-DL-NEXT: v_mov_b32_e32 v1, s7 1296; GFX9-DL-NEXT: v_mov_b32_e32 v2, s10 1297; GFX9-DL-NEXT: v_mad_u32_u24 v1, s6, v1, v2 1298; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x80010 1299; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 1300; GFX9-DL-NEXT: s_bfe_u32 s8, s3, 0x80010 1301; GFX9-DL-NEXT: v_add_u32_e32 v2, s10, v1 1302; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v3, v1 1303; GFX9-DL-NEXT: v_mov_b32_e32 v3, s9 1304; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 24 1305; GFX9-DL-NEXT: v_mad_u32_u24 v1, s8, v3, v1 1306; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 24 1307; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 1308; GFX9-DL-NEXT: v_mad_u32_u24 v1, s3, v3, v1 1309; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v2 1310; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] 1311; GFX9-DL-NEXT: s_endpgm 1312; 1313; GFX10-DL-LABEL: udot4_multiuse_add1: 1314; GFX10-DL: ; %bb.0: ; %entry 1315; GFX10-DL-NEXT: s_clause 0x1 1316; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 1317; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1318; GFX10-DL-NEXT: s_movk_i32 s7, 0xff 1319; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 1320; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1321; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 1322; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 1323; GFX10-DL-NEXT: s_load_dword 
s1, s[2:3], 0x0 1324; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1325; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 1326; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80008 1327; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80008 1328; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0 1329; GFX10-DL-NEXT: s_and_b32 s2, s0, s7 1330; GFX10-DL-NEXT: s_and_b32 s3, s1, s7 1331; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v0 1332; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80010 1333; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80010 1334; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 1335; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 1336; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 1337; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, s6, v0 1338; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 1339; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, v1, v0 1340; GFX10-DL-NEXT: global_store_dword v2, v0, s[4:5] 1341; GFX10-DL-NEXT: s_endpgm 1342 <4 x i8> addrspace(1)* %src2, 1343 i32 addrspace(1)* nocapture %dst) { 1344entry: 1345 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1 1346 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2 1347 1348 %v1e0 = extractelement <4 x i8> %vec1, i64 0 1349 %cv1e0 = zext i8 %v1e0 to i32 1350 %v2e0 = extractelement <4 x i8> %vec2, i64 0 1351 %cv2e0 = zext i8 %v2e0 to i32 1352 %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0 1353 1354 %v1e1 = extractelement <4 x i8> %vec1, i64 1 1355 %cv1e1 = zext i8 %v1e1 to i32 1356 %v2e1 = extractelement <4 x i8> %vec2, i64 1 1357 %cv2e1 = zext i8 %v2e1 to i32 1358 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 1359 1360 %v1e2 = extractelement <4 x i8> %vec1, i64 2 1361 %cv1e2 = zext i8 %v1e2 to i32 1362 %v2e2 = extractelement <4 x i8> %vec2, i64 2 1363 %cv2e2 = zext i8 %v2e2 to i32 1364 %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2 1365 1366 %v1e3 = extractelement <4 x i8> %vec1, i64 3 1367 %cv1e3 = zext i8 %v1e3 to i32 1368 %v2e3 = extractelement <4 x i8> %vec2, i64 3 1369 %cv2e3 = zext i8 %v2e3 to i32 1370 %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3 1371 1372 %acc = load i32, i32 addrspace(1)* %dst, align 4 1373 %add1 = add i32 
%mul2, %acc 1374 %add = add i32 %add1, %acc 1375 %add2 = add i32 %add1, %mul1 1376 %add3 = add i32 %add2, %mul3 1377 %add4 = add i32 %add3, %mul4 1378 %res = add i32 %add4, %add 1379 store i32 %res, i32 addrspace(1)* %dst, align 4 1380 ret void 1381} 1382 1383define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1, 1384; GFX7-LABEL: notdot4_mixedtypes: 1385; GFX7: ; %bb.0: ; %entry 1386; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1387; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1388; GFX7-NEXT: s_mov_b32 s3, 0xf000 1389; GFX7-NEXT: s_mov_b32 s2, -1 1390; GFX7-NEXT: s_mov_b32 s8, 0xffff 1391; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1392; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 1393; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 1394; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 1395; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1396; GFX7-NEXT: s_sext_i32_i8 s6, s4 1397; GFX7-NEXT: s_sext_i32_i8 s7, s5 1398; GFX7-NEXT: s_bfe_u32 s9, s5, 0x80008 1399; GFX7-NEXT: s_and_b32 s7, s7, s8 1400; GFX7-NEXT: s_bfe_u32 s10, s4, 0x80008 1401; GFX7-NEXT: v_mov_b32_e32 v1, s9 1402; GFX7-NEXT: s_bfe_u32 s11, s5, 0x80010 1403; GFX7-NEXT: s_and_b32 s6, s6, s8 1404; GFX7-NEXT: v_mov_b32_e32 v3, s7 1405; GFX7-NEXT: s_bfe_u32 s12, s4, 0x80010 1406; GFX7-NEXT: s_lshr_b32 s5, s5, 24 1407; GFX7-NEXT: v_mov_b32_e32 v2, s11 1408; GFX7-NEXT: s_lshr_b32 s4, s4, 24 1409; GFX7-NEXT: s_waitcnt vmcnt(0) 1410; GFX7-NEXT: v_mad_u32_u24 v0, s10, v1, v0 1411; GFX7-NEXT: v_mad_u32_u24 v0, s6, v3, v0 1412; GFX7-NEXT: v_mad_u32_u24 v0, s12, v2, v0 1413; GFX7-NEXT: v_mov_b32_e32 v1, s5 1414; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 1415; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 1416; GFX7-NEXT: s_endpgm 1417; 1418; GFX8-LABEL: notdot4_mixedtypes: 1419; GFX8: ; %bb.0: ; %entry 1420; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1421; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1422; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1423; GFX8-NEXT: v_mov_b32_e32 v0, s0 1424; GFX8-NEXT: v_mov_b32_e32 v1, s1 
1425; GFX8-NEXT: flat_load_ushort v2, v[0:1] 1426; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 1427; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 1428; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1429; GFX8-NEXT: s_bfe_u32 s4, s0, 0x80008 1430; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008 1431; GFX8-NEXT: s_sext_i32_i8 s3, s1 1432; GFX8-NEXT: v_mov_b32_e32 v3, s5 1433; GFX8-NEXT: s_bfe_u32 s7, s1, 0x80010 1434; GFX8-NEXT: s_sext_i32_i8 s2, s0 1435; GFX8-NEXT: v_mov_b32_e32 v4, s3 1436; GFX8-NEXT: s_bfe_u32 s6, s0, 0x80010 1437; GFX8-NEXT: s_lshr_b32 s1, s1, 24 1438; GFX8-NEXT: v_mov_b32_e32 v5, s7 1439; GFX8-NEXT: s_lshr_b32 s0, s0, 24 1440; GFX8-NEXT: s_waitcnt vmcnt(0) 1441; GFX8-NEXT: v_mad_u32_u24 v2, s4, v3, v2 1442; GFX8-NEXT: v_mad_i32_i24 v2, s2, v4, v2 1443; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2 1444; GFX8-NEXT: v_mov_b32_e32 v3, s1 1445; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 1446; GFX8-NEXT: flat_store_short v[0:1], v2 1447; GFX8-NEXT: s_endpgm 1448; 1449; GFX9-NODL-LABEL: notdot4_mixedtypes: 1450; GFX9-NODL: ; %bb.0: ; %entry 1451; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1452; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1453; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 1454; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1455; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[0:1] 1456; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 1457; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 1458; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1459; GFX9-NODL-NEXT: s_bfe_u32 s6, s2, 0x80008 1460; GFX9-NODL-NEXT: s_bfe_u32 s7, s3, 0x80008 1461; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s3 1462; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s7 1463; GFX9-NODL-NEXT: s_bfe_u32 s9, s3, 0x80010 1464; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s2 1465; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5 1466; GFX9-NODL-NEXT: s_bfe_u32 s8, s2, 0x80010 1467; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 1468; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s9 1469; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 1470; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 1471; GFX9-NODL-NEXT: 
v_mad_u32_u24 v1, s6, v2, v1 1472; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s4, v3, v1 1473; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s8, v4, v1 1474; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 1475; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 1476; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] 1477; GFX9-NODL-NEXT: s_endpgm 1478; 1479; GFX9-DL-LABEL: notdot4_mixedtypes: 1480; GFX9-DL: ; %bb.0: ; %entry 1481; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1482; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1483; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1484; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1485; GFX9-DL-NEXT: global_load_ushort v1, v0, s[0:1] 1486; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 1487; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 1488; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1489; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x80008 1490; GFX9-DL-NEXT: s_bfe_u32 s7, s3, 0x80008 1491; GFX9-DL-NEXT: s_sext_i32_i8 s5, s3 1492; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7 1493; GFX9-DL-NEXT: s_bfe_u32 s9, s3, 0x80010 1494; GFX9-DL-NEXT: s_sext_i32_i8 s4, s2 1495; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 1496; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x80010 1497; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 24 1498; GFX9-DL-NEXT: v_mov_b32_e32 v4, s9 1499; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 24 1500; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 1501; GFX9-DL-NEXT: v_mad_u32_u24 v1, s6, v2, v1 1502; GFX9-DL-NEXT: v_mad_i32_i24 v1, s4, v3, v1 1503; GFX9-DL-NEXT: v_mad_u32_u24 v1, s8, v4, v1 1504; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 1505; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 1506; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] 1507; GFX9-DL-NEXT: s_endpgm 1508; 1509; GFX10-DL-LABEL: notdot4_mixedtypes: 1510; GFX10-DL: ; %bb.0: ; %entry 1511; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 1512; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 1513; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1514; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1515; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5] 1516; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 1517; 
GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 1518; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1519; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80008 1520; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80008 1521; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 1522; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 1523; GFX10-DL-NEXT: s_sext_i32_i8 s2, s0 1524; GFX10-DL-NEXT: s_sext_i32_i8 s3, s1 1525; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1 1526; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80010 1527; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80010 1528; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 1529; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 1530; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 1531; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 1532; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5] 1533; GFX10-DL-NEXT: s_endpgm 1534 <4 x i8> addrspace(1)* %src2, 1535 i16 addrspace(1)* nocapture %dst) { 1536entry: 1537 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1 1538 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2 1539 1540 %v1e0 = extractelement <4 x i8> %vec1, i64 0 1541 %cv1e0 = sext i8 %v1e0 to i16 1542 %v2e0 = extractelement <4 x i8> %vec2, i64 0 1543 %cv2e0 = sext i8 %v2e0 to i16 1544 %mul1 = mul nuw nsw i16 %cv1e0, %cv2e0 1545 1546 %v1e1 = extractelement <4 x i8> %vec1, i64 1 1547 %cv1e1 = zext i8 %v1e1 to i16 1548 %v2e1 = extractelement <4 x i8> %vec2, i64 1 1549 %cv2e1 = zext i8 %v2e1 to i16 1550 %mul2 = mul nuw nsw i16 %cv1e1, %cv2e1 1551 1552 %v1e2 = extractelement <4 x i8> %vec1, i64 2 1553 %cv1e2 = zext i8 %v1e2 to i16 1554 %v2e2 = extractelement <4 x i8> %vec2, i64 2 1555 %cv2e2 = zext i8 %v2e2 to i16 1556 %mul3 = mul nuw nsw i16 %cv1e2, %cv2e2 1557 1558 %v1e3 = extractelement <4 x i8> %vec1, i64 3 1559 %cv1e3 = zext i8 %v1e3 to i16 1560 %v2e3 = extractelement <4 x i8> %vec2, i64 3 1561 %cv2e3 = zext i8 %v2e3 to i16 1562 %mul4 = mul nuw nsw i16 %cv1e3, %cv2e3 1563 1564 %acc = load i16, i16 addrspace(1)* %dst, align 2 1565 %add1 = add i16 %mul2, %acc 1566 %add2 = add i16 %add1, %mul1 1567 %add3 = add i16 %add2, 
%mul3 1568 %add4 = add i16 %add3, %mul4 1569 1570 store i16 %add4, i16 addrspace(1)* %dst, align 2 1571 ret void 1572} 1573 1574; TODO: cleanup s_lshr_b32 and support this pattern. 1575define amdgpu_kernel void @udot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1, 1576; GFX7-LABEL: udot4_acc32_vecMul: 1577; GFX7: ; %bb.0: ; %entry 1578; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1579; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1580; GFX7-NEXT: s_movk_i32 s11, 0xff 1581; GFX7-NEXT: s_mov_b32 s3, 0xf000 1582; GFX7-NEXT: s_mov_b32 s2, -1 1583; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1584; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 1585; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 1586; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1587; GFX7-NEXT: s_lshr_b32 s6, s4, 24 1588; GFX7-NEXT: s_bfe_u32 s7, s4, 0x80008 1589; GFX7-NEXT: s_bfe_u32 s10, s4, 0x80010 1590; GFX7-NEXT: s_lshr_b32 s8, s5, 24 1591; GFX7-NEXT: s_bfe_u32 s9, s5, 0x80008 1592; GFX7-NEXT: s_bfe_u32 s12, s5, 0x80010 1593; GFX7-NEXT: s_and_b32 s5, s5, s11 1594; GFX7-NEXT: s_and_b32 s4, s4, s11 1595; GFX7-NEXT: s_load_dword s11, s[0:1], 0x0 1596; GFX7-NEXT: v_mov_b32_e32 v0, s5 1597; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1598; GFX7-NEXT: v_mov_b32_e32 v1, s11 1599; GFX7-NEXT: v_mad_u32_u24 v0, s4, v0, v1 1600; GFX7-NEXT: v_mov_b32_e32 v1, s9 1601; GFX7-NEXT: v_mad_u32_u24 v0, s7, v1, v0 1602; GFX7-NEXT: v_mov_b32_e32 v1, s12 1603; GFX7-NEXT: v_mad_u32_u24 v0, s10, v1, v0 1604; GFX7-NEXT: v_mov_b32_e32 v1, s8 1605; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 1606; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 1607; GFX7-NEXT: s_endpgm 1608; 1609; GFX8-LABEL: udot4_acc32_vecMul: 1610; GFX8: ; %bb.0: ; %entry 1611; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1612; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1613; GFX8-NEXT: s_movk_i32 s2, 0xff 1614; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1615; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0 1616; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 1617; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1618; GFX8-NEXT: s_lshr_b32 
s5, s3, 24 1619; GFX8-NEXT: s_lshr_b32 s6, s4, 24 1620; GFX8-NEXT: s_bfe_u32 s7, s3, 0x80010 1621; GFX8-NEXT: v_lshrrev_b16_e64 v0, 8, s3 1622; GFX8-NEXT: s_and_b32 s3, s3, s2 1623; GFX8-NEXT: s_and_b32 s2, s4, s2 1624; GFX8-NEXT: s_bfe_u32 s8, s4, 0x80010 1625; GFX8-NEXT: v_lshrrev_b16_e64 v1, 8, s4 1626; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 1627; GFX8-NEXT: v_mov_b32_e32 v2, s2 1628; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1629; GFX8-NEXT: v_mov_b32_e32 v3, s4 1630; GFX8-NEXT: v_mad_u32_u24 v2, s3, v2, v3 1631; GFX8-NEXT: v_mad_u32_u24 v0, v0, v1, v2 1632; GFX8-NEXT: v_mov_b32_e32 v1, s8 1633; GFX8-NEXT: v_mad_u32_u24 v0, s7, v1, v0 1634; GFX8-NEXT: v_mov_b32_e32 v1, s6 1635; GFX8-NEXT: v_mad_u32_u24 v2, s5, v1, v0 1636; GFX8-NEXT: v_mov_b32_e32 v0, s0 1637; GFX8-NEXT: v_mov_b32_e32 v1, s1 1638; GFX8-NEXT: flat_store_dword v[0:1], v2 1639; GFX8-NEXT: s_endpgm 1640; 1641; GFX9-NODL-LABEL: udot4_acc32_vecMul: 1642; GFX9-NODL: ; %bb.0: ; %entry 1643; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1644; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1645; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff 1646; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 1647; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1648; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 1649; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 1650; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1651; GFX9-NODL-NEXT: s_lshr_b32 s5, s3, 24 1652; GFX9-NODL-NEXT: s_lshr_b32 s6, s4, 24 1653; GFX9-NODL-NEXT: s_bfe_u32 s7, s3, 0x80010 1654; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s3 1655; GFX9-NODL-NEXT: s_and_b32 s3, s3, s2 1656; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 1657; GFX9-NODL-NEXT: s_bfe_u32 s8, s4, 0x80010 1658; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v2, 8, s4 1659; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 1660; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 1661; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1662; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4 1663; GFX9-NODL-NEXT: v_mad_u32_u24 v3, s3, v3, v4 1664; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v1, v2, v3 1665; 
GFX9-NODL-NEXT: v_mov_b32_e32 v2, s8 1666; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s7, v2, v1 1667; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 1668; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v2, v1 1669; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] 1670; GFX9-NODL-NEXT: s_endpgm 1671; 1672; GFX9-DL-LABEL: udot4_acc32_vecMul: 1673; GFX9-DL: ; %bb.0: ; %entry 1674; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1675; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1676; GFX9-DL-NEXT: s_movk_i32 s2, 0xff 1677; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1678; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1679; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 1680; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 1681; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1682; GFX9-DL-NEXT: s_lshr_b32 s5, s3, 24 1683; GFX9-DL-NEXT: s_lshr_b32 s6, s4, 24 1684; GFX9-DL-NEXT: s_bfe_u32 s7, s3, 0x80010 1685; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s3 1686; GFX9-DL-NEXT: s_and_b32 s3, s3, s2 1687; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 1688; GFX9-DL-NEXT: s_bfe_u32 s8, s4, 0x80010 1689; GFX9-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s4 1690; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 1691; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 1692; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1693; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4 1694; GFX9-DL-NEXT: v_mad_u32_u24 v3, s3, v3, v4 1695; GFX9-DL-NEXT: v_mad_u32_u24 v1, v1, v2, v3 1696; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 1697; GFX9-DL-NEXT: v_mad_u32_u24 v1, s7, v2, v1 1698; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6 1699; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v2, v1 1700; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] 1701; GFX9-DL-NEXT: s_endpgm 1702; 1703; GFX10-DL-LABEL: udot4_acc32_vecMul: 1704; GFX10-DL: ; %bb.0: ; %entry 1705; GFX10-DL-NEXT: s_clause 0x1 1706; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1707; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1708; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1709; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 1710; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 1711; 
GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 1712; GFX10-DL-NEXT: s_movk_i32 s6, 0xff 1713; GFX10-DL-NEXT: s_mov_b32 s5, 0xffff 1714; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1715; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 1716; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 1717; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 1718; GFX10-DL-NEXT: s_and_b32 s4, s2, s6 1719; GFX10-DL-NEXT: s_and_b32 s6, s3, s6 1720; GFX10-DL-NEXT: v_and_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 1721; GFX10-DL-NEXT: v_and_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 1722; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s6, v2 1723; GFX10-DL-NEXT: s_bfe_u32 s4, s2, 0x80010 1724; GFX10-DL-NEXT: s_bfe_u32 s5, s3, 0x80010 1725; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 24 1726; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 24 1727; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v1, v2 1728; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 1729; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 1730; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0 1731; GFX10-DL-NEXT: global_store_dword v1, v0, s[0:1] 1732; GFX10-DL-NEXT: s_endpgm 1733 <4 x i8> addrspace(1)* %src2, 1734 i32 addrspace(1)* nocapture %dst) { 1735entry: 1736 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1 1737 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2 1738 1739 %cvec1 = zext <4 x i8> %vec1 to <4 x i32> 1740 %cvec2 = zext <4 x i8> %vec2 to <4 x i32> 1741 1742 %mul = mul <4 x i32> %cvec1, %cvec2 1743 %mul0 = extractelement <4 x i32> %mul, i64 0 1744 %mul1 = extractelement <4 x i32> %mul, i64 1 1745 %mul2 = extractelement <4 x i32> %mul, i64 2 1746 %mul3 = extractelement <4 x i32> %mul, i64 3 1747 1748 %acc = load i32, i32 addrspace(1)* %dst, align 4 1749 %add1 = add i32 %mul0, %acc 1750 %add2 = add i32 %add1, %mul1 1751 %add3 = add i32 %add2, %mul2 1752 %add4 = add i32 %add3, %mul3 1753 1754 store i32 %add4, i32 addrspace(1)* %dst, align 4 1755 ret void 1756} 1757 1758; TODO: This pattern should be recognized. 
; udot4_acc16_vecMul: 4 x i8 multiply-accumulate where the multiply is written
; as one vector operation. Both <4 x i8> inputs are zero-extended to
; <4 x i16>, multiplied as vectors, and the four product lanes are extracted
; and added one at a time onto the i16 accumulator loaded from %dst. The final
; sum is stored back to %dst.
;
; The expected instruction sequences listed below use separate multiply /
; multiply-add and add instructions for every configuration that is checked.
; These check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
; GFX7-LABEL: udot4_acc16_vecMul:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_movk_i32 s8, 0xff
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_lshr_b32 s6, s4, 24
; GFX7-NEXT:    s_bfe_u32 s10, s5, 0x80008
; GFX7-NEXT:    s_bfe_u32 s12, s5, 0x80010
; GFX7-NEXT:    s_lshr_b32 s9, s5, 24
; GFX7-NEXT:    s_and_b32 s5, s5, s8
; GFX7-NEXT:    s_bfe_u32 s7, s4, 0x80008
; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x80010
; GFX7-NEXT:    s_and_b32 s4, s4, s8
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    v_mov_b32_e32 v2, s10
; GFX7-NEXT:    v_mov_b32_e32 v3, s12
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v2, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, s9
; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: udot4_acc16_vecMul:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
; GFX8-NEXT:    s_load_dword s1, s[4:5], 0x0
; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT:    s_movk_i32 s0, 0xff
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_and_b32 s6, s1, s0
; GFX8-NEXT:    s_and_b32 s0, s2, s0
; GFX8-NEXT:    v_mov_b32_e32 v5, s0
; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x80010
; GFX8-NEXT:    v_lshrrev_b16_e64 v3, 8, s2
; GFX8-NEXT:    v_lshrrev_b16_e64 v4, 8, s1
; GFX8-NEXT:    s_lshr_b32 s4, s2, 24
; GFX8-NEXT:    s_bfe_u32 s5, s1, 0x80010
; GFX8-NEXT:    v_mov_b32_e32 v6, s7
; GFX8-NEXT:    s_lshr_b32 s3, s1, 24
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v5, v2
; GFX8-NEXT:    v_mad_u32_u24 v2, v4, v3, v2
; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v6, v2
; GFX8-NEXT:    v_mov_b32_e32 v3, s4
; GFX8-NEXT:    v_mad_u32_u24 v2, s3, v3, v2
; GFX8-NEXT:    flat_store_short v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-NODL-LABEL: udot4_acc16_vecMul:
; GFX9-NODL:       ; %bb.0: ; %entry
; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, 0xffff
; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    s_lshr_b32 s5, s2, 16
; GFX9-NODL-NEXT:    s_lshr_b32 s7, s3, 16
; GFX9-NODL-NEXT:    s_lshr_b32 s4, s2, 24
; GFX9-NODL-NEXT:    v_and_b32_sdwa v5, v3, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NODL-NEXT:    s_lshr_b32 s6, s3, 24
; GFX9-NODL-NEXT:    v_and_b32_sdwa v4, v3, s7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NODL-NEXT:    v_lshl_or_b32 v4, s6, 16, v4
; GFX9-NODL-NEXT:    v_lshl_or_b32 v5, s4, 16, v5
; GFX9-NODL-NEXT:    v_pk_mul_lo_u16 v4, v5, v4
; GFX9-NODL-NEXT:    v_and_b32_sdwa v5, v3, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NODL-NEXT:    v_lshrrev_b16_e64 v2, 8, s3
; GFX9-NODL-NEXT:    v_lshrrev_b16_e64 v1, 8, s2
; GFX9-NODL-NEXT:    v_and_b32_sdwa v3, v3, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NODL-NEXT:    v_lshl_or_b32 v2, v2, 16, v5
; GFX9-NODL-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
; GFX9-NODL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
; GFX9-NODL-NEXT:    global_load_ushort v2, v0, s[0:1]
; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT:    v_add_u32_e32 v2, v1, v2
; GFX9-NODL-NEXT:    v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NODL-NEXT:    v_add_u32_e32 v1, v1, v4
; GFX9-NODL-NEXT:    v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NODL-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NODL-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: udot4_acc16_vecMul:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT:    v_mov_b32_e32 v3, 0xffff
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    s_lshr_b32 s5, s2, 16
; GFX9-DL-NEXT:    s_lshr_b32 s7, s3, 16
; GFX9-DL-NEXT:    s_lshr_b32 s4, s2, 24
; GFX9-DL-NEXT:    v_and_b32_sdwa v5, v3, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-DL-NEXT:    s_lshr_b32 s6, s3, 24
; GFX9-DL-NEXT:    v_and_b32_sdwa v4, v3, s7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-DL-NEXT:    v_lshl_or_b32 v4, s6, 16, v4
; GFX9-DL-NEXT:    v_lshl_or_b32 v5, s4, 16, v5
; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v4, v5, v4
; GFX9-DL-NEXT:    v_and_b32_sdwa v5, v3, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-DL-NEXT:    v_lshrrev_b16_e64 v2, 8, s3
; GFX9-DL-NEXT:    v_lshrrev_b16_e64 v1, 8, s2
; GFX9-DL-NEXT:    v_and_b32_sdwa v3, v3, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-DL-NEXT:    v_lshl_or_b32 v2, v2, 16, v5
; GFX9-DL-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
; GFX9-DL-NEXT:    global_load_ushort v2, v0, s[0:1]
; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-DL-NEXT:    v_add_u32_e32 v2, v1, v2
; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-DL-NEXT:    v_add_u32_e32 v1, v1, v4
; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-DL-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-LABEL: udot4_acc16_vecMul:
; GFX10-DL:       ; %bb.0: ; %entry
; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0xffff
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v3, 8, s0
; GFX10-DL-NEXT:    v_and_b32_sdwa v6, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v4, 8, s1
; GFX10-DL-NEXT:    v_and_b32_sdwa v5, v2, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-DL-NEXT:    s_lshr_b32 s2, s1, 16
; GFX10-DL-NEXT:    s_lshr_b32 s3, s0, 16
; GFX10-DL-NEXT:    v_lshl_or_b32 v3, v3, 16, v6
; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 24
; GFX10-DL-NEXT:    v_lshl_or_b32 v4, v4, 16, v5
; GFX10-DL-NEXT:    v_and_b32_sdwa v5, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-DL-NEXT:    v_and_b32_sdwa v2, v2, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 24
; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v3, v3, v4
; GFX10-DL-NEXT:    v_lshl_or_b32 v4, s1, 16, v5
; GFX10-DL-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v2, v2, v4
; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v3, v1
; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v2
; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-DL-NEXT:    global_store_short v0, v1, s[4:5]
; GFX10-DL-NEXT:    s_endpgm
                                              <4 x i8> addrspace(1)* %src2,
                                              i16 addrspace(1)* nocapture %dst) {
entry:
  ; Load both packed 4 x i8 operands.
  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2

  ; Widen every lane to i16 with zero extension (unsigned interpretation).
  %cvec1 = zext <4 x i8> %vec1 to <4 x i16>
  %cvec2 = zext <4 x i8> %vec2 to <4 x i16>

  ; One vector multiply, then the individual lane products are pulled out.
  %mul = mul <4 x i16> %cvec1, %cvec2
  %mul0 = extractelement <4 x i16> %mul, i64 0
  %mul1 = extractelement <4 x i16> %mul, i64 1
  %mul2 = extractelement <4 x i16> %mul, i64 2
  %mul3 = extractelement <4 x i16> %mul, i64 3

  ; Accumulate in lane order on top of the value currently stored at %dst.
  ; The i16 access is annotated with 4-byte alignment.
  %acc = load i16, i16 addrspace(1)* %dst, align 4
  %add1 = add i16 %mul0, %acc
  %add2 = add i16 %add1, %mul1
  %add3 = add i16 %add2, %mul2
  %add4 = add i16 %add3, %mul3

  ; Write the i16 sum back over the accumulator.
  store i16 %add4, i16 addrspace(1)* %dst, align 4
  ret void
}

; TODO: Support this pattern.
; udot4_acc8_vecMul: 4 x i8 multiply-accumulate carried out entirely in i8.
; The two <4 x i8> inputs are multiplied directly as vectors (no extension),
; each product lane is extracted, and the lanes are added one at a time onto
; the i8 accumulator loaded from %dst. The i8 result is stored back to %dst.
;
; The expected instruction sequences listed below use separate multiply and
; add instructions for every configuration that is checked. These check lines
; are autogenerated (see the note on the first line of this file); regenerate
; them with utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1,
; GFX7-LABEL: udot4_acc8_vecMul:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_movk_i32 s8, 0xff
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_bfe_u32 s6, s4, 0x80008
; GFX7-NEXT:    s_bfe_u32 s10, s5, 0x80008
; GFX7-NEXT:    s_lshr_b32 s11, s5, 16
; GFX7-NEXT:    s_lshr_b32 s12, s5, 24
; GFX7-NEXT:    v_mov_b32_e32 v3, s10
; GFX7-NEXT:    s_lshr_b32 s7, s4, 16
; GFX7-NEXT:    v_mov_b32_e32 v2, s11
; GFX7-NEXT:    s_lshr_b32 s9, s4, 24
; GFX7-NEXT:    v_mov_b32_e32 v1, s12
; GFX7-NEXT:    s_mul_i32 s4, s4, s5
; GFX7-NEXT:    v_mul_u32_u24_e32 v1, s9, v1
; GFX7-NEXT:    v_mul_u32_u24_e32 v2, s7, v2
; GFX7-NEXT:    v_mul_u32_u24_e32 v3, s6, v3
; GFX7-NEXT:    s_and_b32 s5, s4, s8
; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
; GFX7-NEXT:    v_and_b32_e32 v2, s8, v2
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
; GFX7-NEXT:    v_or_b32_e32 v2, s5, v3
; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: udot4_acc8_vecMul:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
; GFX8-NEXT:    s_movk_i32 s0, 0xff
; GFX8-NEXT:    v_mov_b32_e32 v3, s0
; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_lshr_b32 s2, s0, 24
; GFX8-NEXT:    s_lshr_b32 s4, s1, 24
; GFX8-NEXT:    s_lshr_b32 s3, s0, 16
; GFX8-NEXT:    v_mov_b32_e32 v4, s0
; GFX8-NEXT:    v_mov_b32_e32 v5, s1
; GFX8-NEXT:    s_mul_i32 s0, s0, s1
; GFX8-NEXT:    s_lshr_b32 s5, s1, 16
; GFX8-NEXT:    v_mul_u32_u24_sdwa v4, v4, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX8-NEXT:    v_mov_b32_e32 v5, s5
; GFX8-NEXT:    v_and_b32_e32 v3, s0, v3
; GFX8-NEXT:    v_mov_b32_e32 v6, s4
; GFX8-NEXT:    v_mov_b32_e32 v7, s2
; GFX8-NEXT:    v_or_b32_e32 v3, v3, v4
; GFX8-NEXT:    v_mul_u32_u24_e32 v5, s3, v5
; GFX8-NEXT:    v_mul_u32_u24_sdwa v6, v7, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff, v3
; GFX8-NEXT:    v_or_b32_sdwa v4, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT:    v_or_b32_e32 v4, v3, v4
; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
; GFX8-NEXT:    v_add_u32_sdwa v2, vcc, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT:    v_add_u32_sdwa v2, vcc, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX8-NEXT:    flat_store_byte v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-NODL-LABEL: udot4_acc8_vecMul:
; GFX9-NODL:       ; %bb.0: ; %entry
; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
; GFX9-NODL-NEXT:    global_load_ubyte v4, v0, s[0:1]
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    s_lshr_b32 s4, s2, 16
; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s3
; GFX9-NODL-NEXT:    s_lshr_b32 s6, s3, 16
; GFX9-NODL-NEXT:    s_lshr_b32 s7, s3, 24
; GFX9-NODL-NEXT:    v_mul_lo_u16_e32 v1, s2, v1
; GFX9-NODL-NEXT:    v_mul_lo_u16_sdwa v2, s2, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX9-NODL-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s6
; GFX9-NODL-NEXT:    s_lshr_b32 s5, s2, 24
; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s7
; GFX9-NODL-NEXT:    v_mul_lo_u16_sdwa v2, s5, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NODL-NEXT:    v_mul_lo_u16_e32 v3, s4, v3
; GFX9-NODL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX9-NODL-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NODL-NEXT:    v_or_b32_e32 v2, v1, v2
; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT:    v_add_u32_e32 v1, v1, v4
; GFX9-NODL-NEXT:    v_add_u32_e32 v1, v1, v3
; GFX9-NODL-NEXT:    v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NODL-NEXT:    v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX9-NODL-NEXT:    global_store_byte v0, v1, s[0:1]
; GFX9-NODL-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: udot4_acc8_vecMul:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT:    global_load_ubyte v4, v0, s[0:1]
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    s_lshr_b32 s4, s2, 16
; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
; GFX9-DL-NEXT:    s_lshr_b32 s6, s3, 16
; GFX9-DL-NEXT:    s_lshr_b32 s7, s3, 24
; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v1, s2, v1
; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v2, s2, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX9-DL-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s6
; GFX9-DL-NEXT:    s_lshr_b32 s5, s2, 24
; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s7
; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v2, s5, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v3, s4, v3
; GFX9-DL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX9-DL-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-DL-NEXT:    v_or_b32_e32 v2, v1, v2
; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-DL-NEXT:    v_add_u32_e32 v1, v1, v4
; GFX9-DL-NEXT:    v_add_u32_e32 v1, v1, v3
; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-DL-NEXT:    v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX9-DL-NEXT:    global_store_byte v0, v1, s[0:1]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-LABEL: udot4_acc8_vecMul:
; GFX10-DL:       ; %bb.0: ; %entry
; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    global_load_ubyte v1, v0, s[4:5]
; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v2, 8, s0
; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v3, 8, s1
; GFX10-DL-NEXT:    s_lshr_b32 s2, s0, 24
; GFX10-DL-NEXT:    s_lshr_b32 s3, s1, 24
; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v4, s2, s3
; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v2, v2, v3
; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v3, s0, s1
; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 16
; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 16
; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v2, 8, v2
; GFX10-DL-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 8, v4
; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v4, s0, s1
; GFX10-DL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; GFX10-DL-NEXT:    v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-DL-NEXT:    v_or_b32_e32 v3, v2, v3
; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 8, v3
; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v2, v1
; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v4
; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX10-DL-NEXT:    global_store_byte v0, v1, s[4:5]
; GFX10-DL-NEXT:    s_endpgm
                                             <4 x i8> addrspace(1)* %src2,
                                             i8 addrspace(1)* nocapture %dst) {
entry:
  ; Load both packed 4 x i8 operands.
  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2

  ; Multiply in i8 without widening, then pull out each lane's product.
  %mul = mul <4 x i8> %vec1, %vec2
  %mul0 = extractelement <4 x i8> %mul, i64 0
  %mul1 = extractelement <4 x i8> %mul, i64 1
  %mul2 = extractelement <4 x i8> %mul, i64 2
  %mul3 = extractelement <4 x i8> %mul, i64 3

  ; Accumulate in lane order on top of the byte currently stored at %dst.
  ; The i8 access is annotated with 4-byte alignment.
  %acc = load i8, i8 addrspace(1)* %dst, align 4
  %add1 = add i8 %mul0, %acc
  %add2 = add i8 %add1, %mul1
  %add3 = add i8 %add2, %mul2
  %add4 = add i8 %add3, %mul3

  ; Write the i8 sum back over the accumulator.
  store i8 %add4, i8 addrspace(1)* %dst, align 4
  ret void
}