; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-NODL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s

; @idot4_acc32: dot product of two <4 x i8> vectors with an i32 accumulator.
; Every lane of %src1 and %src2 is sign-extended to i32 and multiplied, and the
; four products are added to the i32 loaded from %dst, which is then stored back.
; The gfx906, gfx1011 and gfx1012 assertions expect a single v_dot4_i32_i8; the
; gfx700, gfx803 and gfx900 assertions expect a chain of four v_mad_i32_i24.
define amdgpu_kernel void @idot4_acc32(<4 x i8> addrspace(1)* %src1,
; GFX7-LABEL: idot4_acc32:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT:    s_load_dword s12, s[0:1], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_sext_i32_i8 s6, s4
; GFX7-NEXT:    s_sext_i32_i8 s7, s5
; GFX7-NEXT:    s_bfe_i32 s9, s5, 0x80008
; GFX7-NEXT:    v_mov_b32_e32 v0, s7
; GFX7-NEXT:    v_mov_b32_e32 v1, s12
; GFX7-NEXT:    s_bfe_i32 s11, s5, 0x80010
; GFX7-NEXT:    v_mad_i32_i24 v0, s6, v0, v1
; GFX7-NEXT:    s_bfe_i32 s8, s4, 0x80008
; GFX7-NEXT:    v_mov_b32_e32 v1, s9
; GFX7-NEXT:    s_bfe_i32 s10, s4, 0x80010
; GFX7-NEXT:    v_mad_i32_i24 v0, s8, v1, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, s11
; GFX7-NEXT:    s_ashr_i32 s5, s5, 24
; GFX7-NEXT:    v_mad_i32_i24 v0, s10, v1, v0
; GFX7-NEXT:    s_ashr_i32 s4, s4, 24
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    v_mad_i32_i24 v0, s4, v1, v0
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: idot4_acc32:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
; GFX8-NEXT:    s_load_dword s10, s[0:1], 0x0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_sext_i32_i8 s4, s2
; GFX8-NEXT:    s_sext_i32_i8 s5, s3
; GFX8-NEXT:    s_bfe_i32 s7, s3, 0x80008
; GFX8-NEXT:    v_mov_b32_e32 v0, s5
; GFX8-NEXT:    v_mov_b32_e32 v1, s10
; GFX8-NEXT:    s_bfe_i32 s9, s3, 0x80010
; GFX8-NEXT:    v_mad_i32_i24 v0, s4, v0, v1
; GFX8-NEXT:    s_bfe_i32 s6, s2, 0x80008
; GFX8-NEXT:    v_mov_b32_e32 v1, s7
; GFX8-NEXT:    s_bfe_i32 s8, s2, 0x80010
; GFX8-NEXT:    v_mad_i32_i24 v0, s6, v1, v0
; GFX8-NEXT:    v_mov_b32_e32 v1, s9
; GFX8-NEXT:    s_ashr_i32 s3, s3, 24
; GFX8-NEXT:    v_mad_i32_i24 v0, s8, v1, v0
; GFX8-NEXT:    s_ashr_i32 s2, s2, 24
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_mad_i32_i24 v2, s2, v1, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-NODL-LABEL: idot4_acc32:
; GFX9-NODL:       ; %bb.0: ; %entry
; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
; GFX9-NODL-NEXT:    s_load_dword s10, s[0:1], 0x0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    s_sext_i32_i8 s4, s2
; GFX9-NODL-NEXT:    s_sext_i32_i8 s5, s3
; GFX9-NODL-NEXT:    s_bfe_i32 s7, s3, 0x80008
; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s5
; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s10
; GFX9-NODL-NEXT:    s_bfe_i32 s9, s3, 0x80010
; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s4, v1, v2
; GFX9-NODL-NEXT:    s_bfe_i32 s6, s2, 0x80008
; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s7
; GFX9-NODL-NEXT:    s_bfe_i32 s8, s2, 0x80010
; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s6, v2, v1
; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s9
; GFX9-NODL-NEXT:    s_ashr_i32 s3, s3, 24
; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s8, v2, v1
; GFX9-NODL-NEXT:    s_ashr_i32 s2, s2, 24
; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s3
; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s2, v2, v1
; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: idot4_acc32:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
; GFX9-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, s4, v1, v2
; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-LABEL: idot4_acc32:
; GFX10-DL:       ; %bb.0: ; %entry
; GFX10-DL-NEXT:    s_clause 0x1
; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
; GFX10-DL-NEXT:    v_dot4_i32_i8 v0, s0, s1, v0
; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
; GFX10-DL-NEXT:    s_endpgm
                                       <4 x i8> addrspace(1)* %src2,
                                       i32 addrspace(1)* nocapture %dst) {
entry:
  ; Load both packed byte vectors.
  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2

  ; Lane 0: sign-extend both bytes to i32 and multiply.
  %v1e0 = extractelement <4 x i8> %vec1, i64 0
  %cv1e0 = sext i8 %v1e0 to i32
  %v2e0 = extractelement <4 x i8> %vec2, i64 0
  %cv2e0 = sext i8 %v2e0 to i32
  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0

  ; Lane 1.
  %v1e1 = extractelement <4 x i8> %vec1, i64 1
  %cv1e1 = sext i8 %v1e1 to i32
  %v2e1 = extractelement <4 x i8> %vec2, i64 1
  %cv2e1 = sext i8 %v2e1 to i32
  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1

  ; Lane 2.
  %v1e2 = extractelement <4 x i8> %vec1, i64 2
  %cv1e2 = sext i8 %v1e2 to i32
  %v2e2 = extractelement <4 x i8> %vec2, i64 2
  %cv2e2 = sext i8 %v2e2 to i32
  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2

  ; Lane 3.
  %v1e3 = extractelement <4 x i8> %vec1, i64 3
  %cv1e3 = sext i8 %v1e3 to i32
  %v2e3 = extractelement <4 x i8> %vec2, i64 3
  %cv2e3 = sext i8 %v2e3 to i32
  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3

  ; Accumulate: the running sum starts from the i32 currently stored at %dst,
  ; then adds the products in lane order 0, 1, 2, 3.
  %acc = load i32, i32 addrspace(1)* %dst, align 4
  %add1 = add i32 %mul1, %acc
  %add2 = add i32 %add1, %mul2
  %add3 = add i32 %add2, %mul3
  %add4 = add i32 %add3, %mul4
  store i32 %add4, i32 addrspace(1)* %dst, align 4
  ret void
}

; TODO: Currently, vector elements {0 and 3} get zero-extended from i16 to i32
; where they should be sign-extended directly to i32; this prevents the pattern
; recognizer from recognizing this pattern.
; NOTE(review): the gfx906/gfx1011/gfx1012 assertions for @idot4_acc16 below
; already expect v_dot4_i32_i8, so this TODO may be stale - confirm.
; @idot4_acc16: dot product of two <4 x i8> vectors with an i16 accumulator.
; Lanes are sign-extended to i16 before the multiply, and the four products are
; added to the i16 loaded from %dst, which is then stored back.
; The gfx906, gfx1011 and gfx1012 assertions expect v_dot4_i32_i8 followed by a
; 16-bit store; gfx700 expects v_mad_u32_u24 on operands masked with 0xffff, and
; gfx803/gfx900 expect v_mad_i32_i24.
define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1,
; GFX7-LABEL: idot4_acc16:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_mov_b32 s8, 0xffff
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_sext_i32_i8 s6, s4
; GFX7-NEXT:    s_sext_i32_i8 s7, s5
; GFX7-NEXT:    s_bfe_i32 s10, s5, 0x80008
; GFX7-NEXT:    s_and_b32 s7, s7, s8
; GFX7-NEXT:    s_bfe_i32 s12, s5, 0x80010
; GFX7-NEXT:    s_bfe_i32 s9, s4, 0x80008
; GFX7-NEXT:    s_and_b32 s10, s10, s8
; GFX7-NEXT:    s_and_b32 s6, s6, s8
; GFX7-NEXT:    v_mov_b32_e32 v1, s7
; GFX7-NEXT:    s_bfe_i32 s11, s4, 0x80010
; GFX7-NEXT:    s_ashr_i32 s5, s5, 24
; GFX7-NEXT:    s_and_b32 s12, s12, s8
; GFX7-NEXT:    s_and_b32 s9, s9, s8
; GFX7-NEXT:    v_mov_b32_e32 v2, s10
; GFX7-NEXT:    s_ashr_i32 s4, s4, 24
; GFX7-NEXT:    s_and_b32 s11, s11, s8
; GFX7-NEXT:    s_and_b32 s5, s5, s8
; GFX7-NEXT:    v_mov_b32_e32 v3, s12
; GFX7-NEXT:    s_and_b32 s4, s4, s8
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v2, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: idot4_acc16:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_sext_i32_i8 s2, s0
; GFX8-NEXT:    s_sext_i32_i8 s3, s1
; GFX8-NEXT:    s_bfe_i32 s5, s1, 0x80008
; GFX8-NEXT:    v_mov_b32_e32 v3, s3
; GFX8-NEXT:    s_bfe_i32 s7, s1, 0x80010
; GFX8-NEXT:    s_bfe_i32 s4, s0, 0x80008
; GFX8-NEXT:    v_mov_b32_e32 v4, s5
; GFX8-NEXT:    s_bfe_i32 s6, s0, 0x80010
; GFX8-NEXT:    s_ashr_i32 s1, s1, 24
; GFX8-NEXT:    v_mov_b32_e32 v5, s7
; GFX8-NEXT:    s_ashr_i32 s0, s0, 24
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mad_i32_i24 v2, s2, v3, v2
; GFX8-NEXT:    v_mad_i32_i24 v2, s4, v4, v2
; GFX8-NEXT:    v_mad_i32_i24 v2, s6, v5, v2
; GFX8-NEXT:    v_mov_b32_e32 v3, s1
; GFX8-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
; GFX8-NEXT:    flat_store_short v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-NODL-LABEL: idot4_acc16:
; GFX9-NODL:       ; %bb.0: ; %entry
; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    global_load_ushort v1, v0, s[0:1]
; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    s_sext_i32_i8 s4, s2
; GFX9-NODL-NEXT:    s_sext_i32_i8 s5, s3
; GFX9-NODL-NEXT:    s_bfe_i32 s7, s3, 0x80008
; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
; GFX9-NODL-NEXT:    s_bfe_i32 s9, s3, 0x80010
; GFX9-NODL-NEXT:    s_bfe_i32 s6, s2, 0x80008
; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s7
; GFX9-NODL-NEXT:    s_bfe_i32 s8, s2, 0x80010
; GFX9-NODL-NEXT:    s_ashr_i32 s3, s3, 24
; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s9
; GFX9-NODL-NEXT:    s_ashr_i32 s2, s2, 24
; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s4, v2, v1
; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s6, v3, v1
; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s8, v4, v1
; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s3
; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s2, v2, v1
; GFX9-NODL-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NODL-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: idot4_acc16:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT:    global_load_ushort v1, v0, s[0:1]
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, s2, v2, v1
; GFX9-DL-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-LABEL: idot4_acc16:
; GFX10-DL:       ; %bb.0: ; %entry
; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-DL-NEXT:    v_dot4_i32_i8 v1, s0, s1, v1
; GFX10-DL-NEXT:    global_store_short v0, v1, s[4:5]
; GFX10-DL-NEXT:    s_endpgm
                                       <4 x i8> addrspace(1)* %src2,
                                       i16 addrspace(1)* nocapture %dst) {
entry:
  ; Load both packed byte vectors.
  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2

  ; Lane 0: sign-extend both bytes to i16 (not i32) and multiply.
  %v1e0 = extractelement <4 x i8> %vec1, i64 0
  %cv1e0 = sext i8 %v1e0 to i16
  %v2e0 = extractelement <4 x i8> %vec2, i64 0
  %cv2e0 = sext i8 %v2e0 to i16
  %mul1 = mul nsw i16 %cv1e0, %cv2e0

  ; Lane 1.
  %v1e1 = extractelement <4 x i8> %vec1, i64 1
  %cv1e1 = sext i8 %v1e1 to i16
  %v2e1 = extractelement <4 x i8> %vec2, i64 1
  %cv2e1 = sext i8 %v2e1 to i16
  %mul2 = mul nsw i16 %cv1e1, %cv2e1

  ; Lane 2.
  %v1e2 = extractelement <4 x i8> %vec1, i64 2
  %cv1e2 = sext i8 %v1e2 to i16
  %v2e2 = extractelement <4 x i8> %vec2, i64 2
  %cv2e2 = sext i8 %v2e2 to i16
  %mul3 = mul nsw i16 %cv1e2, %cv2e2

  ; Lane 3.
  %v1e3 = extractelement <4 x i8> %vec1, i64 3
  %cv1e3 = sext i8 %v1e3 to i16
  %v2e3 = extractelement <4 x i8> %vec2, i64 3
  %cv2e3 = sext i8 %v2e3 to i16
  %mul4 = mul nsw i16 %cv1e3, %cv2e3

  ; Accumulate into the i16 currently stored at %dst, in lane order 0, 1, 2, 3.
  %acc = load i16, i16 addrspace(1)* %dst, align 2
  %add1 = add i16 %mul1, %acc
  %add2 = add i16 %add1, %mul2
  %add3 = add i16 %add2, %mul3
  %add4 = add i16 %add3, %mul4
  store i16 %add4, i16 addrspace(1)* %dst, align 2
  ret void
}

; @idot4_acc8: dot product of two <4 x i8> vectors with an i8 accumulator.
; Lanes are multiplied as i8 without any extension, and the four products are
; added to the i8 loaded from %dst, which is then stored back.
; The gfx906, gfx1011 and gfx1012 assertions expect the unsigned v_dot4_u32_u8;
; the gfx700, gfx803 and gfx900 assertions expect v_mad_u32_u24 on bytes
; extracted with s_and_b32 / s_bfe_u32 / s_lshr_b32.
define amdgpu_kernel void @idot4_acc8(<4 x i8> addrspace(1)* %src1,
; GFX7-LABEL: idot4_acc8:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_movk_i32 s8, 0xff
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_and_b32 s7, s4, s8
; GFX7-NEXT:    s_and_b32 s6, s5, s8
; GFX7-NEXT:    s_bfe_u32 s8, s5, 0x80008
; GFX7-NEXT:    v_mov_b32_e32 v1, s6
; GFX7-NEXT:    s_bfe_u32 s10, s5, 0x80010
; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x80008
; GFX7-NEXT:    v_mov_b32_e32 v2, s8
; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x80010
; GFX7-NEXT:    s_lshr_b32 s5, s5, 24
; GFX7-NEXT:    v_mov_b32_e32 v3, s10
; GFX7-NEXT:    s_lshr_b32 s4, s4, 24
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v1, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v2, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: idot4_acc8:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
; GFX8-NEXT:    s_load_dword s1, s[4:5], 0x0
; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT:    s_movk_i32 s0, 0xff
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_bfe_u32 s5, s1, 0x80008
; GFX8-NEXT:    s_and_b32 s3, s2, s0
; GFX8-NEXT:    s_bfe_u32 s4, s2, 0x80008
; GFX8-NEXT:    s_and_b32 s0, s1, s0
; GFX8-NEXT:    v_mov_b32_e32 v3, s3
; GFX8-NEXT:    s_bfe_u32 s6, s2, 0x80010
; GFX8-NEXT:    v_mov_b32_e32 v4, s4
; GFX8-NEXT:    s_bfe_u32 s7, s1, 0x80010
; GFX8-NEXT:    s_lshr_b32 s2, s2, 24
; GFX8-NEXT:    v_mov_b32_e32 v5, s6
; GFX8-NEXT:    s_lshr_b32 s1, s1, 24
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
; GFX8-NEXT:    v_mov_b32_e32 v3, s2
; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
; GFX8-NEXT:    flat_store_byte v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-NODL-LABEL: idot4_acc8:
; GFX9-NODL:       ; %bb.0: ; %entry
; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT:    s_movk_i32 s2, 0xff
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    global_load_ubyte v1, v0, s[0:1]
; GFX9-NODL-NEXT:    s_load_dword s3, s[4:5], 0x0
; GFX9-NODL-NEXT:    s_load_dword s4, s[6:7], 0x0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    s_bfe_u32 s7, s3, 0x80008
; GFX9-NODL-NEXT:    s_and_b32 s5, s4, s2
; GFX9-NODL-NEXT:    s_bfe_u32 s6, s4, 0x80008
; GFX9-NODL-NEXT:    s_and_b32 s2, s3, s2
; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s5
; GFX9-NODL-NEXT:    s_bfe_u32 s8, s4, 0x80010
; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s6
; GFX9-NODL-NEXT:    s_bfe_u32 s9, s3, 0x80010
; GFX9-NODL-NEXT:    s_lshr_b32 s4, s4, 24
; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s8
; GFX9-NODL-NEXT:    s_lshr_b32 s3, s3, 24
; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s2, v2, v1
; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s7, v3, v1
; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s9, v4, v1
; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s4
; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, s3, v2, v1
; GFX9-NODL-NEXT:    global_store_byte v0, v1, s[0:1]
; GFX9-NODL-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: idot4_acc8:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT:    global_load_ubyte v1, v0, s[0:1]
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, s2, v2, v1
; GFX9-DL-NEXT:    global_store_byte v0, v1, s[0:1]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-LABEL: idot4_acc8:
; GFX10-DL:       ; %bb.0: ; %entry
; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    global_load_ubyte v1, v0, s[4:5]
; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-DL-NEXT:    v_dot4_u32_u8 v1, s0, s1, v1
; GFX10-DL-NEXT:    global_store_byte v0, v1, s[4:5]
; GFX10-DL-NEXT:    s_endpgm
                                      <4 x i8> addrspace(1)* %src2,
                                      i8 addrspace(1)* nocapture %dst) {
entry:
  ; Load both packed byte vectors.
  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2

  ; Lanes 0..3 are multiplied directly as i8: there is no sext/zext here, and
  ; the multiplies carry no nuw/nsw flags.
  %v1e0 = extractelement <4 x i8> %vec1, i64 0
  %v2e0 = extractelement <4 x i8> %vec2, i64 0
  %mul1 = mul i8 %v1e0, %v2e0

  %v1e1 = extractelement <4 x i8> %vec1, i64 1
  %v2e1 = extractelement <4 x i8> %vec2, i64 1
  %mul2 = mul i8 %v1e1, %v2e1

  %v1e2 = extractelement <4 x i8> %vec1, i64 2
  %v2e2 = extractelement <4 x i8> %vec2, i64 2
  %mul3 = mul i8 %v1e2, %v2e2

  %v1e3 = extractelement <4 x i8> %vec1, i64 3
  %v2e3 = extractelement <4 x i8> %vec2, i64 3
  %mul4 = mul i8 %v1e3, %v2e3

  ; Accumulate into the i8 currently stored at %dst; only the last add is nsw.
  ; NOTE(review): align 2 on an i8 load/store looks carried over from the i16
  ; variant - confirm it is intentional before regenerating assertions.
  %acc = load i8, i8 addrspace(1)* %dst, align 2
  %add1 = add i8 %mul1, %acc
  %add2 = add i8 %add1, %mul2
  %add3 = add i8 %add2, %mul3
  %add4 = add nsw i8 %add3, %mul4
  store i8 %add4, i8 addrspace(1)* %dst, align 2
  ret void
}

; @idot4_multiuse_mul1: a 4-lane signed i8 multiply-accumulate with an i32
; accumulator in which the lane-0 product %mul1 is added twice (once in %add and
; again in %add2).
; All assertions, including those for gfx906, gfx1011 and gfx1012, expect a
; chain of five v_mad_i32_i24 and no v_dot4 instruction.
define amdgpu_kernel void @idot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
; GFX7-LABEL: idot4_multiuse_mul1:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT:    s_load_dword s12, s[0:1], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_sext_i32_i8 s6, s4
; GFX7-NEXT:    s_sext_i32_i8 s7, s5
; GFX7-NEXT:    s_bfe_i32 s9, s5, 0x80008
; GFX7-NEXT:    v_mov_b32_e32 v0, s7
; GFX7-NEXT:    v_mov_b32_e32 v1, s12
; GFX7-NEXT:    s_bfe_i32 s8, s4, 0x80008
; GFX7-NEXT:    v_mad_i32_i24 v1, s6, v0, v1
; GFX7-NEXT:    v_mov_b32_e32 v2, s9
; GFX7-NEXT:    s_bfe_i32 s11, s5, 0x80010
; GFX7-NEXT:    v_mad_i32_i24 v1, s8, v2, v1
; GFX7-NEXT:    s_bfe_i32 s10, s4, 0x80010
; GFX7-NEXT:    v_mad_i32_i24 v0, s6, v0, v1
; GFX7-NEXT:    v_mov_b32_e32 v1, s11
; GFX7-NEXT:    s_ashr_i32 s5, s5, 24
; GFX7-NEXT:    v_mad_i32_i24 v0, s10, v1, v0
; GFX7-NEXT:    s_ashr_i32 s4, s4, 24
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    v_mad_i32_i24 v0, s4, v1, v0
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: idot4_multiuse_mul1:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
; GFX8-NEXT:    s_load_dword s10, s[0:1], 0x0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_sext_i32_i8 s4, s2
; GFX8-NEXT:    s_sext_i32_i8 s5, s3
; GFX8-NEXT:    s_bfe_i32 s7, s3, 0x80008
; GFX8-NEXT:    v_mov_b32_e32 v0, s5
; GFX8-NEXT:    v_mov_b32_e32 v1, s10
; GFX8-NEXT:    s_bfe_i32 s6, s2, 0x80008
; GFX8-NEXT:    v_mad_i32_i24 v1, s4, v0, v1
; GFX8-NEXT:    v_mov_b32_e32 v2, s7
; GFX8-NEXT:    s_bfe_i32 s9, s3, 0x80010
; GFX8-NEXT:    v_mad_i32_i24 v1, s6, v2, v1
; GFX8-NEXT:    s_bfe_i32 s8, s2, 0x80010
; GFX8-NEXT:    v_mad_i32_i24 v0, s4, v0, v1
; GFX8-NEXT:    v_mov_b32_e32 v1, s9
; GFX8-NEXT:    s_ashr_i32 s3, s3, 24
; GFX8-NEXT:    v_mad_i32_i24 v0, s8, v1, v0
; GFX8-NEXT:    s_ashr_i32 s2, s2, 24
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_mad_i32_i24 v2, s2, v1, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-NODL-LABEL: idot4_multiuse_mul1:
; GFX9-NODL:       ; %bb.0: ; %entry
; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
; GFX9-NODL-NEXT:    s_load_dword s10, s[0:1], 0x0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    s_sext_i32_i8 s4, s2
; GFX9-NODL-NEXT:    s_sext_i32_i8 s5, s3
; GFX9-NODL-NEXT:    s_bfe_i32 s7, s3, 0x80008
; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s5
; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s10
; GFX9-NODL-NEXT:    s_bfe_i32 s6, s2, 0x80008
; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s4, v1, v2
; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s7
; GFX9-NODL-NEXT:    s_bfe_i32 s9, s3, 0x80010
; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s6, v3, v2
; GFX9-NODL-NEXT:    s_bfe_i32 s8, s2, 0x80010
; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s4, v1, v2
; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s9
; GFX9-NODL-NEXT:    s_ashr_i32 s3, s3, 24
; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s8, v2, v1
; GFX9-NODL-NEXT:    s_ashr_i32 s2, s2, 24
; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s3
; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s2, v2, v1
; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: idot4_multiuse_mul1:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT:    s_load_dword s10, s[0:1], 0x0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    s_sext_i32_i8 s4, s2
; GFX9-DL-NEXT:    s_sext_i32_i8 s5, s3
; GFX9-DL-NEXT:    s_bfe_i32 s7, s3, 0x80008
; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s5
; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s10
; GFX9-DL-NEXT:    s_bfe_i32 s6, s2, 0x80008
; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s4, v1, v2
; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s7
; GFX9-DL-NEXT:    s_bfe_i32 s9, s3, 0x80010
; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s6, v3, v2
; GFX9-DL-NEXT:    s_bfe_i32 s8, s2, 0x80010
; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s4, v1, v2
; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s9
; GFX9-DL-NEXT:    s_ashr_i32 s3, s3, 24
; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s8, v2, v1
; GFX9-DL-NEXT:    s_ashr_i32 s2, s2, 24
; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s2, v2, v1
; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-LABEL: idot4_multiuse_mul1:
; GFX10-DL:       ; %bb.0: ; %entry
; GFX10-DL-NEXT:    s_clause 0x1
; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
; GFX10-DL-NEXT:    s_sext_i32_i8 s2, s0
; GFX10-DL-NEXT:    s_sext_i32_i8 s3, s1
; GFX10-DL-NEXT:    s_bfe_i32 s6, s0, 0x80008
; GFX10-DL-NEXT:    s_bfe_i32 s7, s1, 0x80008
; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s2, s3, v0
; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s6, s7, v0
; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s2, s3, v0
; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x80010
; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x80010
; GFX10-DL-NEXT:    s_ashr_i32 s0, s0, 24
; GFX10-DL-NEXT:    s_ashr_i32 s1, s1, 24
; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s2, s3, v0
; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s0, s1, v0
; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
; GFX10-DL-NEXT:    s_endpgm
                                               <4 x i8> addrspace(1)* %src2,
                                               i32 addrspace(1)* nocapture %dst) {
entry:
  ; Load both packed byte vectors.
  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2

  ; Lane 0: sign-extend both bytes to i32 and multiply. %mul1 has two uses below.
  %v1e0 = extractelement <4 x i8> %vec1, i64 0
  %cv1e0 = sext i8 %v1e0 to i32
  %v2e0 = extractelement <4 x i8> %vec2, i64 0
  %cv2e0 = sext i8 %v2e0 to i32
  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0

  ; Lane 1.
  %v1e1 = extractelement <4 x i8> %vec1, i64 1
  %cv1e1 = sext i8 %v1e1 to i32
  %v2e1 = extractelement <4 x i8> %vec2, i64 1
  %cv2e1 = sext i8 %v2e1 to i32
  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1

  ; Lane 2.
  %v1e2 = extractelement <4 x i8> %vec1, i64 2
  %cv1e2 = sext i8 %v1e2 to i32
  %v2e2 = extractelement <4 x i8> %vec2, i64 2
  %cv2e2 = sext i8 %v2e2 to i32
  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2

  ; Lane 3.
  %v1e3 = extractelement <4 x i8> %vec1, i64 3
  %cv1e3 = sext i8 %v1e3 to i32
  %v2e3 = extractelement <4 x i8> %vec2, i64 3
  %cv2e3 = sext i8 %v2e3 to i32
  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3

  ; Accumulate: acc + mul1 + mul2 + mul1 + mul3 + mul4. The second use of
  ; %mul1 (in %add2) is what distinguishes this kernel from a plain dot product.
  %acc = load i32, i32 addrspace(1)* %dst, align 4
  %add = add i32 %mul1, %acc
  %add1 = add i32 %mul2, %add
  %add2 = add i32 %add1, %mul1
  %add3 = add i32 %add2, %mul3
  %add4 = add i32 %add3, %mul4

  store i32 %add4, i32 addrspace(1)* %dst, align 4
  ret void
}

; TODO: Support this pattern.
define amdgpu_kernel void @idot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
; GFX7-LABEL: idot4_acc32_vecMul:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT:    s_load_dword s12, s[0:1], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_ashr_i32 s6, s4, 24
; GFX7-NEXT:    s_ashr_i32 s9, s5, 24
; GFX7-NEXT:    s_bfe_i32 s10, s5, 0x80010
; GFX7-NEXT:    s_bfe_i32 s11, s5, 0x80008
; GFX7-NEXT:    s_sext_i32_i8 s5, s5
; GFX7-NEXT:    s_bfe_i32 s7, s4, 0x80010
; GFX7-NEXT:    s_bfe_i32 s8, s4, 0x80008
; GFX7-NEXT:    s_sext_i32_i8 s4, s4
; GFX7-NEXT:    v_mov_b32_e32 v0, s5
; GFX7-NEXT:    v_mov_b32_e32 v1, s12
; GFX7-NEXT:    v_mad_i32_i24 v0, s4, v0, v1
; GFX7-NEXT:    v_mov_b32_e32 v1, s11
; GFX7-NEXT:    v_mad_i32_i24 v0, s8, v1, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, s10
; GFX7-NEXT:    v_mad_i32_i24 v0, s7, v1, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, s9
; GFX7-NEXT:    v_mad_i32_i24 v0, s6, v1, v0
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: idot4_acc32_vecMul:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
; GFX8-NEXT:    s_load_dword s8, s[0:1], 0x0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_lshrrev_b16_e64 v0, 8, s2
; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 8, s3
; GFX8-NEXT:    s_ashr_i32 s6, s3, 24
; GFX8-NEXT:    s_bfe_i32 s7, s3, 0x80010
; GFX8-NEXT:    s_sext_i32_i8 s3, s3
; GFX8-NEXT:    s_ashr_i32 s4, s2, 24
; GFX8-NEXT:    s_bfe_i32 s5, s2, 0x80010
; GFX8-NEXT:    s_sext_i32_i8 s2, s2
; GFX8-NEXT:    v_mov_b32_e32 v2, s3
; GFX8-NEXT:    v_mov_b32_e32 v3, s8
; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 8
; GFX8-NEXT:    v_bfe_i32 v1, v1, 0, 8
; GFX8-NEXT:    v_mad_i32_i24 v2, s2, v2, v3
; GFX8-NEXT:    v_mad_i32_i24 v0, v0, v1, v2
; GFX8-NEXT:    v_mov_b32_e32 v1, s7
; GFX8-NEXT:    v_mad_i32_i24 v0, s5, v1, v0
; GFX8-NEXT:    v_mov_b32_e32 v1, s6
; GFX8-NEXT:    v_mad_i32_i24 v2, s4, v1, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-NODL-LABEL: idot4_acc32_vecMul:
; GFX9-NODL:       ; %bb.0: ; %entry
; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
; GFX9-NODL-NEXT:    s_load_dword s8, s[0:1], 0x0
; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT:    v_lshrrev_b16_e64 v1, 8, s2
; GFX9-NODL-NEXT:    v_lshrrev_b16_e64 v2, 8, s3
; GFX9-NODL-NEXT:    s_ashr_i32 s6, s3, 24
; GFX9-NODL-NEXT:    s_bfe_i32 s7, s3, 0x80010
; GFX9-NODL-NEXT:    s_sext_i32_i8 s3, s3
; GFX9-NODL-NEXT:    s_ashr_i32 s4, s2, 24
; GFX9-NODL-NEXT:    s_bfe_i32 s5, s2, 0x80010
; GFX9-NODL-NEXT:    s_sext_i32_i8 s2, s2
; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s8
; GFX9-NODL-NEXT:    v_bfe_i32 v1, v1, 0, 8
; GFX9-NODL-NEXT:    v_bfe_i32 v2, v2, 0, 8
; GFX9-NODL-NEXT:    v_mad_i32_i24 v3, s2, v3, v4
; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, v1, v2, v3
; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s7
; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s5, v2, v1
; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s6
; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, s4, v2, v1
; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: idot4_acc32_vecMul:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT:    s_load_dword s8, s[0:1], 0x0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    v_lshrrev_b16_e64 v1, 8, s2
; GFX9-DL-NEXT:    v_lshrrev_b16_e64 v2, 8, s3
; GFX9-DL-NEXT:    s_ashr_i32 s6, s3, 24
; GFX9-DL-NEXT:    s_bfe_i32 s7, s3, 0x80010
; GFX9-DL-NEXT:    s_sext_i32_i8 s3, s3
; GFX9-DL-NEXT:    s_ashr_i32 s4, s2, 24
; GFX9-DL-NEXT:    s_bfe_i32 s5, s2, 0x80010
; GFX9-DL-NEXT:    s_sext_i32_i8 s2, s2
; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s8
; GFX9-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
; GFX9-DL-NEXT:    v_bfe_i32 v2, v2, 0, 8
; GFX9-DL-NEXT:    v_mad_i32_i24 v3, s2, v3, v4
; GFX9-DL-NEXT:    v_mad_i32_i24 v1, v1, v2, v3
; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s7
; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s5, v2, v1
; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s6
; GFX9-DL-NEXT:    v_mad_i32_i24 v1, s4, v2, v1
; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-LABEL: idot4_acc32_vecMul:
; GFX10-DL:       ; %bb.0: ; %entry
; GFX10-DL-NEXT:    s_clause 0x1
; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v0, 8, s2
; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v1, 8, s3
; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s4
; GFX10-DL-NEXT:    s_sext_i32_i8 s4, s2
; GFX10-DL-NEXT:    s_sext_i32_i8 s5, s3
; GFX10-DL-NEXT:    v_bfe_i32 v0, v0, 0, 8
; GFX10-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s4, s5, v2
; GFX10-DL-NEXT:    s_bfe_i32 s4, s2, 0x80010
; GFX10-DL-NEXT:    s_bfe_i32 s5, s3, 0x80010
; GFX10-DL-NEXT:    s_ashr_i32 s2, s2, 24
; GFX10-DL-NEXT:    s_ashr_i32 s3, s3, 24
; GFX10-DL-NEXT:    v_mad_i32_i24 v0, v0, v1, v2
; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s4, s5, v0
; GFX10-DL-NEXT:    v_mad_i32_i24 v0, s2, s3, v0
; GFX10-DL-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-DL-NEXT:    s_endpgm
                                              <4 x i8> addrspace(1)* %src2,
                                              i32 addrspace(1)* nocapture %dst) {
entry:
  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2

  %cvec1 = sext <4 x i8> %vec1 to <4 x i32>
  %cvec2 = sext <4 x i8> %vec2 to <4 x i32>

  %mul = mul <4 x i32> %cvec1, %cvec2
  %mul0 = extractelement <4 x i32> %mul, i64 0
  %mul1 = extractelement <4 x i32> %mul, i64 1
  %mul2 = extractelement <4 x i32> %mul, i64 2
  %mul3 = extractelement <4 x i32> %mul, i64 3

  %acc = load i32, i32 addrspace(1)* %dst, align 4
  %add1 = add i32 %mul0, %acc
  %add2 = add i32 %add1, %mul1
  %add3 = add i32 %add2, %mul2
  %add4 = add i32 %add3, %mul3
store i32 %add4, i32 addrspace(1)* %dst, align 4
ret void
}

; idot4_acc16_vecMul: signed 4 x i8 dot product accumulated into an i16.
;
; Both <4 x i8> inputs are sign-extended to <4 x i16> and multiplied as whole
; vectors; the four i16 products are then extracted and added one at a time
; onto the i16 value loaded from %dst, and the sum is stored back to %dst.
;
; The assertions below pin the instruction sequence llc emits for each -mcpu
; named in the RUN lines. As recorded, gfx700 and gfx803 use a chain of four
; v_mad_i32_i24, while gfx900, gfx906 and gfx1011/gfx1012 use two
; v_pk_mul_lo_u16 followed by four adds; no v_dot4 instruction is expected
; for any of them.
;
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
; GFX7-LABEL: idot4_acc16_vecMul:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_ashr_i32 s6, s4, 24
; GFX7-NEXT: s_bfe_i32 s10, s5, 0x80010
; GFX7-NEXT: s_bfe_i32 s11, s5, 0x80008
; GFX7-NEXT: s_ashr_i32 s9, s5, 24
; GFX7-NEXT: s_sext_i32_i8 s5, s5
; GFX7-NEXT: s_bfe_i32 s7, s4, 0x80010
; GFX7-NEXT: s_bfe_i32 s8, s4, 0x80008
; GFX7-NEXT: s_sext_i32_i8 s4, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v2, s11
; GFX7-NEXT: v_mov_b32_e32 v3, s10
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mad_i32_i24 v0, s4, v1, v0
; GFX7-NEXT: v_mad_i32_i24 v0, s8, v2, v0
; GFX7-NEXT: v_mad_i32_i24 v0, s7, v3, v0
; GFX7-NEXT: v_mov_b32_e32 v1, s9
; GFX7-NEXT: v_mad_i32_i24 v0, s6, v1, v0
; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: idot4_acc16_vecMul:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ushort v2, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e64 v3, 8, s0
; GFX8-NEXT: v_lshrrev_b16_e64 v4, 8, s1
; GFX8-NEXT: s_bfe_i32 s5, s1, 0x80010
; GFX8-NEXT: s_ashr_i32 s4, s1, 24
; GFX8-NEXT: s_sext_i32_i8 s1, s1
; GFX8-NEXT: s_ashr_i32 s2, s0, 24
; GFX8-NEXT: s_bfe_i32 s3, s0, 0x80010
; GFX8-NEXT: s_sext_i32_i8 s0, s0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8
; GFX8-NEXT: v_bfe_i32 v4, v4, 0, 8
; GFX8-NEXT: v_mov_b32_e32 v6, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_i32_i24 v2, s0, v5, v2
; GFX8-NEXT: v_mad_i32_i24 v2, v3, v4, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s3, v6, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: v_mad_i32_i24 v2, s2, v3, v2
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: idot4_acc16_vecMul:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: v_mov_b32_e32 v5, 0xffff
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_lshr_b32 s4, s2, 16
; GFX9-NODL-NEXT: s_lshr_b32 s5, s3, 16
; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v4, 8, s5
; GFX9-NODL-NEXT: s_bfe_i32 s5, s5, 0x80000
; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v3, 8, s4
; GFX9-NODL-NEXT: v_and_b32_e32 v6, s5, v5
; GFX9-NODL-NEXT: s_bfe_i32 s4, s4, 0x80000
; GFX9-NODL-NEXT: v_lshl_or_b32 v4, v4, 16, v6
; GFX9-NODL-NEXT: v_and_b32_e32 v6, s4, v5
; GFX9-NODL-NEXT: v_lshl_or_b32 v3, v3, 16, v6
; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v2, 8, s3
; GFX9-NODL-NEXT: s_bfe_i32 s3, s3, 0x80000
; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v1, 8, s2
; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v3, v3, v4
; GFX9-NODL-NEXT: v_and_b32_e32 v4, s3, v5
; GFX9-NODL-NEXT: s_bfe_i32 s2, s2, 0x80000
; GFX9-NODL-NEXT: v_lshl_or_b32 v2, v2, 16, v4
; GFX9-NODL-NEXT: v_and_b32_e32 v4, s2, v5
; GFX9-NODL-NEXT: v_lshl_or_b32 v1, v1, 16, v4
; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v1, v1, v2
; GFX9-NODL-NEXT: global_load_ushort v2, v0, s[0:1]
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_add_u32_e32 v2, v1, v2
; GFX9-NODL-NEXT: v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NODL-NEXT: v_add_u32_e32 v1, v1, v3
; GFX9-NODL-NEXT: v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot4_acc16_vecMul:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: v_mov_b32_e32 v5, 0xffff
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 16
; GFX9-DL-NEXT: s_lshr_b32 s5, s3, 16
; GFX9-DL-NEXT: v_ashrrev_i16_e64 v4, 8, s5
; GFX9-DL-NEXT: s_bfe_i32 s5, s5, 0x80000
; GFX9-DL-NEXT: v_ashrrev_i16_e64 v3, 8, s4
; GFX9-DL-NEXT: v_and_b32_e32 v6, s5, v5
; GFX9-DL-NEXT: s_bfe_i32 s4, s4, 0x80000
; GFX9-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v6
; GFX9-DL-NEXT: v_and_b32_e32 v6, s4, v5
; GFX9-DL-NEXT: v_lshl_or_b32 v3, v3, 16, v6
; GFX9-DL-NEXT: v_ashrrev_i16_e64 v2, 8, s3
; GFX9-DL-NEXT: s_bfe_i32 s3, s3, 0x80000
; GFX9-DL-NEXT: v_ashrrev_i16_e64 v1, 8, s2
; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v4
; GFX9-DL-NEXT: v_and_b32_e32 v4, s3, v5
; GFX9-DL-NEXT: s_bfe_i32 s2, s2, 0x80000
; GFX9-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v4
; GFX9-DL-NEXT: v_and_b32_e32 v4, s2, v5
; GFX9-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v4
; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2
; GFX9-DL-NEXT: global_load_ushort v2, v0, s[0:1]
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_add_u32_e32 v2, v1, v2
; GFX9-DL-NEXT: v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v3
; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot4_acc16_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5]
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 16
; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 8, s0
; GFX10-DL-NEXT: s_bfe_i32 s0, s0, 0x80000
; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x80000
; GFX10-DL-NEXT: v_and_b32_e32 v6, s0, v2
; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 8, s1
; GFX10-DL-NEXT: v_and_b32_e32 v5, s3, v2
; GFX10-DL-NEXT: s_lshr_b32 s0, s1, 16
; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 8, s2
; GFX10-DL-NEXT: v_lshl_or_b32 v3, v3, 16, v6
; GFX10-DL-NEXT: s_bfe_i32 s1, s2, 0x80000
; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v5
; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x80000
; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 8, s0
; GFX10-DL-NEXT: v_and_b32_e32 v6, s2, v2
; GFX10-DL-NEXT: v_and_b32_e32 v2, s1, v2
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v4
; GFX10-DL-NEXT: v_lshl_or_b32 v4, v5, 16, v6
; GFX10-DL-NEXT: v_lshl_or_b32 v2, v7, 16, v2
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v4
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v3, v1
; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-DL-NEXT: s_endpgm
                                              <4 x i8> addrspace(1)* %src2,
                                              i16 addrspace(1)* nocapture %dst) {
entry:
  ; Load the two packed 4 x i8 operands.
  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2

  ; Sign-extend every lane to i16 so the multiply is done at 16 bits.
  %cvec1 = sext <4 x i8> %vec1 to <4 x i16>
  %cvec2 = sext <4 x i8> %vec2 to <4 x i16>

  ; One lane-wise vector multiply, then extract the four i16 products.
  %mul = mul <4 x i16> %cvec1, %cvec2
  %mul0 = extractelement <4 x i16> %mul, i64 0
  %mul1 = extractelement <4 x i16> %mul, i64 1
  %mul2 = extractelement <4 x i16> %mul, i64 2
  %mul3 = extractelement <4 x i16> %mul, i64 3

  ; Read the i16 accumulator from %dst and add the products in lane order 0..3.
  %acc = load i16, i16 addrspace(1)* %dst, align 4
  %add1 = add i16 %mul0, %acc
  %add2 = add i16 %add1, %mul1
  %add3 = add i16 %add2, %mul2
  %add4 = add i16 %add3, %mul3

  ; Write the accumulated sum back to the same location.
  store i16 %add4, i16 addrspace(1)* %dst, align 4
  ret void
}