; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-- -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s

declare i1 @llvm.amdgcn.wqm.vote(i1)
declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32 immarg)
declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg)

; Show what the atomic optimization pass will do for raw buffers.

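; Rough sketch of the rewrite performed for @add_i32_constant below, where the
; atomic operand is wave-uniform (not matched by FileCheck; names are
; illustrative only and assume a 64-wide wave):
;
;   %lane  = mbcnt.hi(exec_hi, mbcnt.lo(exec_lo, 0)) ; this lane's index among the active lanes
;   %count = bcnt1(exec)                             ; number of active lanes
;   if (%lane == 0)
;     %old = buffer.atomic.add(%count * 5, %inout)   ; a single atomic covers the whole wave
;   %base = readfirstlane(%old)                      ; broadcast the value returned to the first lane
;   %ret  = %base + %lane * 5                        ; what this lane's own atomic would have returned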
define amdgpu_ps void @add_i32_constant(<4 x i32> inreg %out, <4 x i32> inreg %inout) {
; GFX7-LABEL: add_i32_constant:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_mov_b64 s[10:11], exec
; GFX7-NEXT:    ; implicit-def: $vgpr0
; GFX7-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
; GFX7-NEXT:    s_cbranch_execz BB0_4
; GFX7-NEXT:  ; %bb.1:
; GFX7-NEXT:    s_mov_b64 s[12:13], exec
; GFX7-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s12, 0
; GFX7-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s13, v0
; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7-NEXT:    ; implicit-def: $vgpr1
; GFX7-NEXT:    s_and_saveexec_b64 s[10:11], vcc
; GFX7-NEXT:    s_cbranch_execz BB0_3
; GFX7-NEXT:  ; %bb.2:
; GFX7-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
; GFX7-NEXT:    v_mul_u32_u24_e64 v1, s12, 5
; GFX7-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
; GFX7-NEXT:  BB0_3:
; GFX7-NEXT:    s_or_b64 exec, exec, s[10:11]
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_readfirstlane_b32 s4, v1
; GFX7-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
; GFX7-NEXT:  BB0_4: ; %Flow
; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT:    s_wqm_b64 s[4:5], -1
; GFX7-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
; GFX7-NEXT:    s_cbranch_vccnz BB0_6
; GFX7-NEXT:  ; %bb.5: ; %if
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:  BB0_6: ; %UnifiedReturnBlock
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: add_i32_constant:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_mov_b64 s[10:11], exec
; GFX8-NEXT:    ; implicit-def: $vgpr0
; GFX8-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
; GFX8-NEXT:    s_cbranch_execz BB0_4
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    s_mov_b64 s[12:13], exec
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s12, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s13, v0
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT:    ; implicit-def: $vgpr1
; GFX8-NEXT:    s_and_saveexec_b64 s[10:11], vcc
; GFX8-NEXT:    s_cbranch_execz BB0_3
; GFX8-NEXT:  ; %bb.2:
; GFX8-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
; GFX8-NEXT:    v_mul_u32_u24_e64 v1, s12, 5
; GFX8-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
; GFX8-NEXT:  BB0_3:
; GFX8-NEXT:    s_or_b64 exec, exec, s[10:11]
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_readfirstlane_b32 s4, v1
; GFX8-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
; GFX8-NEXT:  BB0_4: ; %Flow
; GFX8-NEXT:    s_or_b64 exec, exec, s[8:9]
; GFX8-NEXT:    s_wqm_b64 s[4:5], -1
; GFX8-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
; GFX8-NEXT:    s_cbranch_vccnz BB0_6
; GFX8-NEXT:  ; %bb.5: ; %if
; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT:  BB0_6: ; %UnifiedReturnBlock
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: add_i32_constant:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_mov_b64 s[10:11], exec
; GFX9-NEXT:    ; implicit-def: $vgpr0
; GFX9-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
; GFX9-NEXT:    s_cbranch_execz BB0_4
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    s_mov_b64 s[12:13], exec
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s12, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s13, v0
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT:    ; implicit-def: $vgpr1
; GFX9-NEXT:    s_and_saveexec_b64 s[10:11], vcc
; GFX9-NEXT:    s_cbranch_execz BB0_3
; GFX9-NEXT:  ; %bb.2:
; GFX9-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
; GFX9-NEXT:    v_mul_u32_u24_e64 v1, s12, 5
; GFX9-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
; GFX9-NEXT:  BB0_3:
; GFX9-NEXT:    s_or_b64 exec, exec, s[10:11]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
; GFX9-NEXT:  BB0_4: ; %Flow
; GFX9-NEXT:    s_or_b64 exec, exec, s[8:9]
; GFX9-NEXT:    s_wqm_b64 s[4:5], -1
; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
; GFX9-NEXT:    s_cbranch_vccnz BB0_6
; GFX9-NEXT:  ; %bb.5: ; %if
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:  BB0_6: ; %UnifiedReturnBlock
; GFX9-NEXT:    s_endpgm
;
; GFX1064-LABEL: add_i32_constant:
; GFX1064:       ; %bb.0: ; %entry
; GFX1064-NEXT:    s_mov_b64 s[10:11], exec
; GFX1064-NEXT:    ; implicit-def: $vgpr0
; GFX1064-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
; GFX1064-NEXT:    s_cbranch_execz BB0_4
; GFX1064-NEXT:  ; %bb.1:
; GFX1064-NEXT:    s_mov_b64 s[12:13], exec
; GFX1064-NEXT:    ; implicit-def: $vgpr1
; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s12, 0
; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s13, v0
; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT:    s_and_saveexec_b64 s[28:29], vcc
; GFX1064-NEXT:    s_cbranch_execz BB0_3
; GFX1064-NEXT:  ; %bb.2:
; GFX1064-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
; GFX1064-NEXT:    v_mul_u32_u24_e64 v1, s12, 5
; GFX1064-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
; GFX1064-NEXT:  BB0_3:
; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1064-NEXT:    s_or_b64 exec, exec, s[28:29]
; GFX1064-NEXT:    s_waitcnt vmcnt(0)
; GFX1064-NEXT:    v_readfirstlane_b32 s4, v1
; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
; GFX1064-NEXT:  BB0_4: ; %Flow
; GFX1064-NEXT:    s_or_b64 exec, exec, s[8:9]
; GFX1064-NEXT:    s_wqm_b64 s[4:5], -1
; GFX1064-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
; GFX1064-NEXT:    s_cbranch_vccnz BB0_6
; GFX1064-NEXT:  ; %bb.5: ; %if
; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT:  BB0_6: ; %UnifiedReturnBlock
; GFX1064-NEXT:    s_endpgm
;
; GFX1032-LABEL: add_i32_constant:
; GFX1032:       ; %bb.0: ; %entry
; GFX1032-NEXT:    s_mov_b32 s9, exec_lo
; GFX1032-NEXT:    ; implicit-def: $vgpr0
; GFX1032-NEXT:    s_and_saveexec_b32 s8, s9
; GFX1032-NEXT:    s_cbranch_execz BB0_4
; GFX1032-NEXT:  ; %bb.1:
; GFX1032-NEXT:    s_mov_b32 s10, exec_lo
; GFX1032-NEXT:    ; implicit-def: $vgpr1
; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s10, 0
; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT:    s_and_saveexec_b32 s9, vcc_lo
; GFX1032-NEXT:    s_cbranch_execz BB0_3
; GFX1032-NEXT:  ; %bb.2:
; GFX1032-NEXT:    s_bcnt1_i32_b32 s10, s10
; GFX1032-NEXT:    v_mul_u32_u24_e64 v1, s10, 5
; GFX1032-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
; GFX1032-NEXT:  BB0_3:
; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s9
; GFX1032-NEXT:    s_waitcnt vmcnt(0)
; GFX1032-NEXT:    v_readfirstlane_b32 s4, v1
; GFX1032-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
; GFX1032-NEXT:  BB0_4: ; %Flow
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s8
; GFX1032-NEXT:    s_wqm_b32 s4, -1
; GFX1032-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s4
; GFX1032-NEXT:    s_cbranch_vccnz BB0_6
; GFX1032-NEXT:  ; %bb.5: ; %if
; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT:  BB0_6: ; %UnifiedReturnBlock
; GFX1032-NEXT:    s_endpgm
entry:
  %cond1 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0)
  %cond2 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
  %cond = and i1 %cond1, %cond2
  br i1 %cond, label %if, label %else
if:
  %bitcast = bitcast i32 %old to float
  call void @llvm.amdgcn.raw.buffer.store.f32(float %bitcast, <4 x i32> %out, i32 0, i32 0, i32 0)
  ret void
else:
  ret void
}

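; For @add_i32_varying the operand is divergent, so the pass instead builds a
; wavefront scan. Rough sketch of the idea (not matched by FileCheck; the exact
; DPP/permlane sequence differs between the targets checked below):
;
;   %incl  = inclusive add-scan of %val across the wave (DPP row_shr:1/2/4/8, then row_bcast or permlanex16)
;   %excl  = %incl shifted down by one lane              ; this lane's exclusive prefix sum
;   %total = %incl read back from the last active lane
;   if (first active lane)
;     %old = buffer.atomic.add(%total, %inout)           ; one atomic carries the whole wave's sum
;   %ret = readfirstlane(%old) + %excl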
define amdgpu_ps void @add_i32_varying(<4 x i32> inreg %out, <4 x i32> inreg %inout, i32 %val) {
; GFX7-LABEL: add_i32_varying:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_wqm_b64 s[8:9], -1
; GFX7-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
; GFX7-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
; GFX7-NEXT:    s_cbranch_vccnz BB1_2
; GFX7-NEXT:  ; %bb.1: ; %if
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:  BB1_2: ; %else
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: add_i32_varying:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_mov_b64 s[10:11], exec
; GFX8-NEXT:    v_mov_b32_e32 v2, v0
; GFX8-NEXT:    ; implicit-def: $vgpr0
; GFX8-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
; GFX8-NEXT:    s_cbranch_execz BB1_4
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    s_or_saveexec_b64 s[10:11], -1
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    s_mov_b64 exec, s[10:11]
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    v_mov_b32_e32 v2, 0
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    s_or_saveexec_b64 s[10:11], -1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX8-NEXT:    v_readlane_b32 s12, v2, 63
; GFX8-NEXT:    s_nop 0
; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_mov_b64 exec, s[10:11]
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT:    ; implicit-def: $vgpr0
; GFX8-NEXT:    s_and_saveexec_b64 s[10:11], vcc
; GFX8-NEXT:    s_cbranch_execz BB1_3
; GFX8-NEXT:  ; %bb.2:
; GFX8-NEXT:    v_mov_b32_e32 v0, s12
; GFX8-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
; GFX8-NEXT:  BB1_3:
; GFX8-NEXT:    s_or_b64 exec, exec, s[10:11]
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, v1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
; GFX8-NEXT:  BB1_4: ; %Flow
; GFX8-NEXT:    s_or_b64 exec, exec, s[8:9]
; GFX8-NEXT:    s_wqm_b64 s[4:5], -1
; GFX8-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
; GFX8-NEXT:    s_cbranch_vccnz BB1_6
; GFX8-NEXT:  ; %bb.5: ; %if
; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT:  BB1_6: ; %UnifiedReturnBlock
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: add_i32_varying:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_mov_b64 s[10:11], exec
; GFX9-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-NEXT:    ; implicit-def: $vgpr0
; GFX9-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
; GFX9-NEXT:    s_cbranch_execz BB1_4
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    s_or_saveexec_b64 s[10:11], -1
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_mov_b64 exec, s[10:11]
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT:    s_not_b64 exec, exec
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_not_b64 exec, exec
; GFX9-NEXT:    s_or_saveexec_b64 s[10:11], -1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-NEXT:    v_readlane_b32 s12, v2, 63
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_mov_b64 exec, s[10:11]
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT:    ; implicit-def: $vgpr0
; GFX9-NEXT:    s_and_saveexec_b64 s[10:11], vcc
; GFX9-NEXT:    s_cbranch_execz BB1_3
; GFX9-NEXT:  ; %bb.2:
; GFX9-NEXT:    v_mov_b32_e32 v0, s12
; GFX9-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
; GFX9-NEXT:  BB1_3:
; GFX9-NEXT:    s_or_b64 exec, exec, s[10:11]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
; GFX9-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-NEXT:    v_add_u32_e32 v0, s4, v0
; GFX9-NEXT:  BB1_4: ; %Flow
; GFX9-NEXT:    s_or_b64 exec, exec, s[8:9]
; GFX9-NEXT:    s_wqm_b64 s[4:5], -1
; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
; GFX9-NEXT:    s_cbranch_vccnz BB1_6
; GFX9-NEXT:  ; %bb.5: ; %if
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:  BB1_6: ; %UnifiedReturnBlock
; GFX9-NEXT:    s_endpgm
;
; GFX1064-LABEL: add_i32_varying:
; GFX1064:       ; %bb.0: ; %entry
; GFX1064-NEXT:    s_mov_b64 s[10:11], exec
; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
; GFX1064-NEXT:    ; implicit-def: $vgpr0
; GFX1064-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
; GFX1064-NEXT:    s_cbranch_execz BB1_4
; GFX1064-NEXT:  ; %bb.1:
; GFX1064-NEXT:    s_not_b64 exec, exec
; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
; GFX1064-NEXT:    s_not_b64 exec, exec
; GFX1064-NEXT:    s_or_saveexec_b64 s[10:11], -1
; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT:    v_readlane_b32 s12, v1, 31
; GFX1064-NEXT:    v_mov_b32_e32 v2, s12
; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT:    v_readlane_b32 s12, v1, 15
; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT:    v_readlane_b32 s13, v1, 31
; GFX1064-NEXT:    v_writelane_b32 v3, s12, 16
; GFX1064-NEXT:    s_mov_b64 exec, s[10:11]
; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1064-NEXT:    s_or_saveexec_b64 s[10:11], -1
; GFX1064-NEXT:    v_readlane_b32 s12, v1, 63
; GFX1064-NEXT:    v_readlane_b32 s14, v1, 47
; GFX1064-NEXT:    v_writelane_b32 v3, s13, 32
; GFX1064-NEXT:    s_mov_b64 exec, s[10:11]
; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
; GFX1064-NEXT:    s_or_saveexec_b64 s[10:11], -1
; GFX1064-NEXT:    v_writelane_b32 v3, s14, 48
; GFX1064-NEXT:    s_mov_b64 exec, s[10:11]
; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT:    ; implicit-def: $vgpr0
; GFX1064-NEXT:    s_and_saveexec_b64 s[28:29], vcc
; GFX1064-NEXT:    s_cbranch_execz BB1_3
; GFX1064-NEXT:  ; %bb.2:
; GFX1064-NEXT:    v_mov_b32_e32 v0, s12
; GFX1064-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
; GFX1064-NEXT:  BB1_3:
; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1064-NEXT:    s_or_b64 exec, exec, s[28:29]
; GFX1064-NEXT:    s_waitcnt vmcnt(0)
; GFX1064-NEXT:    v_readfirstlane_b32 s4, v0
; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s4, v0
; GFX1064-NEXT:  BB1_4: ; %Flow
; GFX1064-NEXT:    s_or_b64 exec, exec, s[8:9]
; GFX1064-NEXT:    s_wqm_b64 s[4:5], -1
; GFX1064-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
; GFX1064-NEXT:    s_cbranch_vccnz BB1_6
; GFX1064-NEXT:  ; %bb.5: ; %if
; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT:  BB1_6: ; %UnifiedReturnBlock
; GFX1064-NEXT:    s_endpgm
;
; GFX1032-LABEL: add_i32_varying:
; GFX1032:       ; %bb.0: ; %entry
; GFX1032-NEXT:    s_mov_b32 s9, exec_lo
; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
; GFX1032-NEXT:    ; implicit-def: $vgpr0
; GFX1032-NEXT:    s_and_saveexec_b32 s8, s9
; GFX1032-NEXT:    s_cbranch_execz BB1_4
; GFX1032-NEXT:  ; %bb.1:
; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT:    s_or_saveexec_b32 s9, -1
; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT:    v_readlane_b32 s11, v1, 31
; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT:    v_readlane_b32 s10, v1, 15
; GFX1032-NEXT:    s_mov_b32 exec_lo, s9
; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1032-NEXT:    s_or_saveexec_b32 s9, -1
; GFX1032-NEXT:    v_writelane_b32 v3, s10, 16
; GFX1032-NEXT:    s_mov_b32 exec_lo, s9
; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT:    ; implicit-def: $vgpr0
; GFX1032-NEXT:    s_and_saveexec_b32 s9, vcc_lo
; GFX1032-NEXT:    s_cbranch_execz BB1_3
; GFX1032-NEXT:  ; %bb.2:
; GFX1032-NEXT:    v_mov_b32_e32 v0, s11
; GFX1032-NEXT:    s_mov_b32 s10, s11
; GFX1032-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
; GFX1032-NEXT:  BB1_3:
; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s9
; GFX1032-NEXT:    s_waitcnt vmcnt(0)
; GFX1032-NEXT:    v_readfirstlane_b32 s4, v0
; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s4, v0
; GFX1032-NEXT:  BB1_4: ; %Flow
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s8
; GFX1032-NEXT:    s_wqm_b32 s4, -1
; GFX1032-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s4
; GFX1032-NEXT:    s_cbranch_vccnz BB1_6
; GFX1032-NEXT:  ; %bb.5: ; %if
; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT:  BB1_6: ; %UnifiedReturnBlock
; GFX1032-NEXT:    s_endpgm
entry:
  %cond1 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %val, <4 x i32> %inout, i32 0, i32 0, i32 0)
  %cond2 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
  %cond = and i1 %cond1, %cond2
  br i1 %cond, label %if, label %else
if:
  %bitcast = bitcast i32 %old to float
  call void @llvm.amdgcn.raw.buffer.store.f32(float %bitcast, <4 x i32> %out, i32 0, i32 0, i32 0)
  ret void
else:
  ret void
}