; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX7LESS %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX89,DPPCOMB %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX89,DPPCOMB %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX10 %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN32,GFX8MORE,GFX8MORE32,GFX10 %s

declare i32 @llvm.amdgcn.workitem.id.x()

; Show what the atomic optimization pass will do for global pointers.
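; For a uniform operand, the pass reduces the atomic to a single lane:
; v_mbcnt computes each lane's rank among the active lanes, the first
; active lane is elected with v_cmp_eq + s_and_saveexec, s_bcnt1 counts
; the active lanes, and the elected lane issues one atomic carrying the
; combined value (active-lane count * operand), as the checks below
; verify.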

; GCN-LABEL: add_i32_constant:
; GCN32: s_mov_b32 s[[exec_lo:[0-9]+]], exec_lo
; GCN64: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc
; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
; GCN: {{flat|buffer|global}}_atomic_add v[[value]]
define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
entry:
  %old = atomicrmw add i32 addrspace(1)* %inout, i32 5 acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: add_i32_uniform:
; GCN32: s_mov_b32 s[[exec_lo:[0-9]+]], exec_lo
; GCN64: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc
; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GCN: {{flat|buffer|global}}_atomic_add v[[value]]
define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %additive) {
entry:
  %old = atomicrmw add i32 addrspace(1)* %inout, i32 %additive acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: add_i32_varying:
; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
; GFX7LESS-NOT: s_bcnt1_i32_b64
; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
; DPPCOMB: v_add_u32_dpp
; DPPCOMB: v_add_u32_dpp
; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX89: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX10: s_mov_b32 s[[copy_value:[0-9]+]], s[[scalar_value]]
; GFX10: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[copy_value]]
; GFX8MORE: buffer_atomic_add v[[value]]
define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw add i32 addrspace(1)* %inout, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: add_i64_constant:
; GCN32: s_mov_b32 s[[exec_lo:[0-9]+]], exec_lo
; GCN64: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc
; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5
; GCN: {{flat|buffer|global}}_atomic_add_x2 v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
entry:
  %old = atomicrmw add i64 addrspace(1)* %inout, i64 5 acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: add_i64_uniform:
; GCN32: s_mov_b32 s[[exec_lo:[0-9]+]], exec_lo
; GCN64: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc
; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: {{flat|buffer|global}}_atomic_add_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %additive) {
entry:
  %old = atomicrmw add i64 addrspace(1)* %inout, i64 %additive acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: add_i64_varying:
; GCN-NOT: v_mbcnt_lo_u32_b32
; GCN-NOT: v_mbcnt_hi_u32_b32
; GCN-NOT: s_bcnt1_i32_b64
; GCN: {{flat|buffer|global}}_atomic_add_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %zext = zext i32 %lane to i64
  %old = atomicrmw add i64 addrspace(1)* %inout, i64 %zext acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}
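
; No wavefront reduction is expected for the divergent i64 cases
; (add_i64_varying above, sub_i64_varying below); a plain 64-bit atomic
; is emitted instead.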

; GCN-LABEL: sub_i32_constant:
; GCN32: s_mov_b32 s[[exec_lo:[0-9]+]], exec_lo
; GCN64: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc
; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
; GCN: {{flat|buffer|global}}_atomic_sub v[[value]]
define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
entry:
  %old = atomicrmw sub i32 addrspace(1)* %inout, i32 5 acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i32_uniform:
; GCN32: s_mov_b32 s[[exec_lo:[0-9]+]], exec_lo
; GCN64: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc
; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GCN: {{flat|buffer|global}}_atomic_sub v[[value]]
define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %subitive) {
entry:
  %old = atomicrmw sub i32 addrspace(1)* %inout, i32 %subitive acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i32_varying:
; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
; GFX7LESS-NOT: s_bcnt1_i32_b64
; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
; DPPCOMB: v_add_u32_dpp
; DPPCOMB: v_add_u32_dpp
; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX89: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX10: s_mov_b32 s[[copy_value:[0-9]+]], s[[scalar_value]]
; GFX10: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[copy_value]]
; GFX8MORE: buffer_atomic_sub v[[value]]
define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw sub i32 addrspace(1)* %inout, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}
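
; As in add_i64_constant, the combined 64-bit operand (popcount * 5) is
; formed with the v_mul_u32_u24 / v_mul_hi_u32_u24 pair.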

; GCN-LABEL: sub_i64_constant:
; GCN32: s_mov_b32 s[[exec_lo:[0-9]+]], exec_lo
; GCN64: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc
; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5
; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5
; GCN: {{flat|buffer|global}}_atomic_sub_x2 v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
entry:
  %old = atomicrmw sub i64 addrspace(1)* %inout, i64 5 acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i64_uniform:
; GCN32: s_mov_b32 s[[exec_lo:[0-9]+]], exec_lo
; GCN64: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc
; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
; GCN: {{flat|buffer|global}}_atomic_sub_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %subitive) {
entry:
  %old = atomicrmw sub i64 addrspace(1)* %inout, i64 %subitive acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: sub_i64_varying:
; GCN-NOT: v_mbcnt_lo_u32_b32
; GCN-NOT: v_mbcnt_hi_u32_b32
; GCN-NOT: s_bcnt1_i32_b64
; GCN: {{flat|buffer|global}}_atomic_sub_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %zext = zext i32 %lane to i64
  %old = atomicrmw sub i64 addrspace(1)* %inout, i64 %zext acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}