; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX906 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s

; GlobalISel codegen expectations for the llvm.amdgcn.sdot2 intrinsic, which
; every test below expects to be emitted as a single v_dot2_i32_i16.
;
; Each llc invocation above is checked with exactly one FileCheck prefix. No
; prefix shared by all invocations is passed, because this file contains no
; directives for one and FileCheck rejects unused prefixes by default. The
; triple is given with -mtriple so that it does not pick up host components.

; Baseline: a, b and c arrive in v0, v1 and v2, and the trailing i1 immarg is
; false. The expected instruction has no modifiers.
define i32 @v_sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_sdot2:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 false)
  ret i32 %r
}

; Same operands as the baseline, but the trailing i1 immarg is true. The
; expected instruction carries the clamp modifier.
define i32 @v_sdot2_clamp(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_sdot2_clamp:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 clamp
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_clamp:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 clamp
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_clamp:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 clamp
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 true)
  ret i32 %r
}

; All three operands are inreg arguments of an amdgpu_ps function and arrive
; in s0, s1 and s2. The gfx906 and gfx908 expectations copy s1 and s2 into
; VGPRs and use only s0 directly; the gfx10 expectations use s0 and s1
; directly and copy only s2.
; NOTE(review): presumably this reflects a difference in how many scalar
; operands one VALU instruction may read on each target - confirm.
define amdgpu_ps float @v_sdot2_sgpr_sgpr_sgpr(<2 x i16> inreg %a, <2 x i16> inreg %b, i32 inreg %c) {
; GFX906-LABEL: v_sdot2_sgpr_sgpr_sgpr:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    v_mov_b32_e32 v0, s1
; GFX906-NEXT:    v_mov_b32_e32 v1, s2
; GFX906-NEXT:    v_dot2_i32_i16 v0, s0, v0, v1
; GFX906-NEXT:    ; return to shader part epilog
;
; GFX908-LABEL: v_sdot2_sgpr_sgpr_sgpr:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    v_mov_b32_e32 v0, s1
; GFX908-NEXT:    v_mov_b32_e32 v1, s2
; GFX908-NEXT:    v_dot2_i32_i16 v0, s0, v0, v1
; GFX908-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: v_sdot2_sgpr_sgpr_sgpr:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_dot2_i32_i16 v0, s0, s1, v0
; GFX10-NEXT:    ; return to shader part epilog
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 false)
  %cast = bitcast i32 %r to float
  ret float %cast
}

; Operand a is the constant splat <4, 4>. It is expected as the inline
; constant 4 with the op_sel_hi bit for that operand cleared.
define i32 @v_sdot2_inline_literal_a(<2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_sdot2_inline_literal_a:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_dot2_i32_i16 v0, 4, v0, v1 op_sel_hi:[0,1,1]
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_inline_literal_a:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_dot2_i32_i16 v0, 4, v0, v1 op_sel_hi:[0,1,1]
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_inline_literal_a:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_dot2_i32_i16 v0, 4, v0, v1 op_sel_hi:[0,1,1]
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> <i16 4, i16 4>, <2 x i16> %b, i32 %c, i1 false)
  ret i32 %r
}

; Operand b is the constant splat <4, 4>. It is expected as the inline
; constant 4 with the op_sel_hi bit for that operand cleared.
define i32 @v_sdot2_inline_literal_b(<2 x i16> %a, i32 %c) {
; GFX906-LABEL: v_sdot2_inline_literal_b:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, 4, v1 op_sel_hi:[1,0,1]
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_inline_literal_b:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, 4, v1 op_sel_hi:[1,0,1]
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_inline_literal_b:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, 4, v1 op_sel_hi:[1,0,1]
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> <i16 4, i16 4>, i32 %c, i1 false)
  ret i32 %r
}

; Operands a and b are the constant splats <8, 8> and <4, 4>. Both are
; expected as inline constants with their op_sel_hi bits cleared. The %a
; parameter is not used by the body.
define i32 @v_sdot2_inline_literal_a_b(<2 x i16> %a, i32 %c) {
; GFX906-LABEL: v_sdot2_inline_literal_a_b:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_dot2_i32_i16 v0, 8, 4, v1 op_sel_hi:[0,0,1]
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_inline_literal_a_b:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_dot2_i32_i16 v0, 8, 4, v1 op_sel_hi:[0,0,1]
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_inline_literal_a_b:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_dot2_i32_i16 v0, 8, 4, v1 op_sel_hi:[0,0,1]
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> <i16 8, i16 8>, <2 x i16> <i16 4, i16 4>, i32 %c, i1 false)
  ret i32 %r
}

; All three operands are constants: splats <8, 8> and <4, 4>, and the i32
; accumulator 8. All are expected as inline constants.
define i32 @v_sdot2_inline_literal_a_b_c() {
; GFX906-LABEL: v_sdot2_inline_literal_a_b_c:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_dot2_i32_i16 v0, 8, 4, 8 op_sel_hi:[0,0,1]
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_inline_literal_a_b_c:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_dot2_i32_i16 v0, 8, 4, 8 op_sel_hi:[0,0,1]
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_inline_literal_a_b_c:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_dot2_i32_i16 v0, 8, 4, 8 op_sel_hi:[0,0,1]
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> <i16 8, i16 8>, <2 x i16> <i16 4, i16 4>, i32 8, i1 false)
  ret i32 %r
}

; The i32 accumulator is the constant 7, expected as an inline constant in
; the third source position.
define i32 @v_sdot2_inline_literal_c(<2 x i16> %a, <2 x i16> %b) {
; GFX906-LABEL: v_sdot2_inline_literal_c:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, 7
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_inline_literal_c:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, 7
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_inline_literal_c:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, 7
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 7, i1 false)
  ret i32 %r
}

; Operand a is an fneg of a <2 x half> value bitcast to <2 x i16>. The
; recorded output folds the fneg into neg_lo/neg_hi bits for that operand.
; NOTE(review): source modifiers on an integer dot product look suspicious -
; confirm the hardware honors neg_lo/neg_hi for this instruction before
; relying on these expectations.
define i32 @v_sdot2_fneg_a(<2 x half> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_sdot2_fneg_a:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_fneg_a:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_fneg_a:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %neg.a = fneg <2 x half> %a
  %cast.neg.a = bitcast <2 x half> %neg.a to <2 x i16>
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %cast.neg.a, <2 x i16> %b, i32 %c, i1 false)
  ret i32 %r
}

; Operand b is an fneg of a <2 x half> value bitcast to <2 x i16>. The
; recorded output folds the fneg into neg_lo/neg_hi bits for that operand.
; NOTE(review): source modifiers on an integer dot product look suspicious -
; confirm the hardware honors neg_lo/neg_hi for this instruction before
; relying on these expectations.
define i32 @v_sdot2_fneg_b(<2 x i16> %a, <2 x half> %b, i32 %c) {
; GFX906-LABEL: v_sdot2_fneg_b:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_fneg_b:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_fneg_b:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %neg.b = fneg <2 x half> %b
  %cast.neg.b = bitcast <2 x half> %neg.b to <2 x i16>
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %cast.neg.b, i32 %c, i1 false)
  ret i32 %r
}

; The accumulator is an fneg of a float bitcast to i32. The negation is not
; folded into the dot instruction: an explicit v_xor_b32 with 0x80000000 is
; expected before it.
define i32 @v_sdot2_fnegf32_c(<2 x i16> %a, <2 x i16> %b, float %c) {
; GFX906-LABEL: v_sdot2_fnegf32_c:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_fnegf32_c:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_fnegf32_c:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %neg.c = fneg float %c
  %cast.neg.c = bitcast float %neg.c to i32
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 %cast.neg.c, i1 false)
  ret i32 %r
}

; The accumulator is an fneg of a <2 x half> value bitcast to i32. The
; negation is not folded into the dot instruction: an explicit v_xor_b32
; with 0x80008000 is expected before it.
define i32 @v_sdot2_fnegv2f16_c(<2 x i16> %a, <2 x i16> %b, <2 x half> %c) {
; GFX906-LABEL: v_sdot2_fnegv2f16_c:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_fnegv2f16_c:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_fnegv2f16_c:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %neg.c = fneg <2 x half> %c
  %cast.neg.c = bitcast <2 x half> %neg.c to i32
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 %cast.neg.c, i1 false)
  ret i32 %r
}

; Operand a has its two i16 lanes swapped by a shufflevector with mask
; <1, 0>. The swap is expected as a separate v_alignbit_b32 by 16 ahead of
; an unmodified dot instruction.
define i32 @v_sdot2_shuffle10_a(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_sdot2_shuffle10_a:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_alignbit_b32 v0, v0, v0, 16
; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_shuffle10_a:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_alignbit_b32 v0, v0, v0, 16
; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_shuffle10_a:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_alignbit_b32 v0, v0, v0, 16
; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %shuf.a = shufflevector <2 x i16> %a, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %shuf.a, <2 x i16> %b, i32 %c, i1 false)
  ret i32 %r
}

; Operand b has its two i16 lanes swapped by a shufflevector with mask
; <1, 0>. The swap is expected as a separate v_alignbit_b32 by 16 ahead of
; an unmodified dot instruction.
define i32 @v_sdot2_shuffle10_b(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_sdot2_shuffle10_b:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_alignbit_b32 v1, v1, v1, 16
; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_shuffle10_b:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_alignbit_b32 v1, v1, v1, 16
; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_shuffle10_b:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_alignbit_b32 v1, v1, v1, 16
; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %shuf.b = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %shuf.b, i32 %c, i1 false)
  ret i32 %r
}

; Intrinsic under test: two <2 x i16> sources, an i32 accumulator, and an
; immediate i1 that selects the clamp modifier in the expectations above.
declare i32 @llvm.amdgcn.sdot2(<2 x i16>, <2 x i16>, i32, i1 immarg) #0

attributes #0 = { nounwind readnone speculatable }