; LLVM IR lit test: codegen of loads/stores through LDS (local, addrspace(3))
; pointers for i32 and i32 vectors on AMDGPU (GCN + R600), including
; zext/sext widening to i64 and ds_read/write_b128 selection.
;
; NOTE(fix): the gfx900 run line previously used -check-prefixes=GCN,VI,FUNC.
; No check in this file uses a bare VI prefix, and the GFX9-NOT checks were
; dead because no run enabled the GFX9 prefix; gfx900 must check GCN,GFX9,FUNC.
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s

; Testing for ds_read/write_128
; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+enable-ds128 < %s | FileCheck -check-prefixes=SI,FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+enable-ds128 < %s | FileCheck -check-prefixes=CIVI,FUNC %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+enable-ds128 < %s | FileCheck -check-prefixes=CIVI,FUNC %s

; Scalar i32 load/store through LDS. SI/CI/VI need m0 initialized for DS
; instructions; gfx900 does not (hence GFX9-NOT: m0).
; FUNC-LABEL: {{^}}local_load_i32:
; GCN-NOT: s_wqm_b64
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0
; GCN: ds_read_b32

; EG: LDS_READ_RET
define amdgpu_kernel void @local_load_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
entry:
  %ld = load i32, i32 addrspace(3)* %in
  store i32 %ld, i32 addrspace(3)* %out
  ret void
}

; FUNC-LABEL: {{^}}local_load_v2i32:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

; GCN: ds_read_b64
define amdgpu_kernel void @local_load_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
entry:
  %ld = load <2 x i32>, <2 x i32> addrspace(3)* %in
  store <2 x i32> %ld, <2 x i32> addrspace(3)* %out
  ret void
}

; A v3i32 load splits into a b64 + b32 DS read.
; FUNC-LABEL: {{^}}local_load_v3i32:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

; GCN-DAG: ds_read_b64
; GCN-DAG: ds_read_b32
define amdgpu_kernel void @local_load_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> addrspace(3)* %in) #0 {
entry:
  %ld = load <3 x i32>, <3 x i32> addrspace(3)* %in
  store <3 x i32> %ld, <3 x i32> addrspace(3)* %out
  ret void
}

; FUNC-LABEL: {{^}}local_load_v4i32:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}

define amdgpu_kernel void @local_load_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
entry:
  %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in
  store <4 x i32> %ld, <4 x i32> addrspace(3)* %out
  ret void
}

; FUNC-LABEL: {{^}}local_load_v8i32:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
define amdgpu_kernel void @local_load_v8i32(<8 x i32> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
entry:
  %ld = load <8 x i32>, <8 x i32> addrspace(3)* %in
  store <8 x i32> %ld, <8 x i32> addrspace(3)* %out
  ret void
}

; FUNC-LABEL: {{^}}local_load_v16i32:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1
define amdgpu_kernel void @local_load_v16i32(<16 x i32> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
entry:
  %ld = load <16 x i32>, <16 x i32> addrspace(3)* %in
  store <16 x i32> %ld, <16 x i32> addrspace(3)* %out
  ret void
}

; FUNC-LABEL: {{^}}local_zextload_i32_to_i64:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_zextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
  %ld = load i32, i32 addrspace(3)* %in
  %ext = zext i32 %ld to i64
  store i64 %ext, i64 addrspace(3)* %out
  ret void
}

; FUNC-LABEL: {{^}}local_sextload_i32_to_i64:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_sextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
  %ld = load i32, i32 addrspace(3)* %in
  %ext = sext i32 %ld to i64
  store i64 %ext, i64 addrspace(3)* %out
  ret void
}

; FUNC-LABEL: {{^}}local_zextload_v1i32_to_v1i64:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_zextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 {
  %ld = load <1 x i32>, <1 x i32> addrspace(3)* %in
  %ext = zext <1 x i32> %ld to <1 x i64>
  store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
  ret void
}

; FUNC-LABEL: {{^}}local_sextload_v1i32_to_v1i64:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_sextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 {
  %ld = load <1 x i32>, <1 x i32> addrspace(3)* %in
  %ext = sext <1 x i32> %ld to <1 x i64>
  store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
  ret void
}

; FUNC-LABEL: {{^}}local_zextload_v2i32_to_v2i64:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_zextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
  %ld = load <2 x i32>, <2 x i32> addrspace(3)* %in
  %ext = zext <2 x i32> %ld to <2 x i64>
  store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
  ret void
}

; FUNC-LABEL: {{^}}local_sextload_v2i32_to_v2i64:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_sextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
  %ld = load <2 x i32>, <2 x i32> addrspace(3)* %in
  %ext = sext <2 x i32> %ld to <2 x i64>
  store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
  ret void
}

; FUNC-LABEL: {{^}}local_zextload_v4i32_to_v4i64:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_zextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
  %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in
  %ext = zext <4 x i32> %ld to <4 x i64>
  store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
  ret void
}

; FUNC-LABEL: {{^}}local_sextload_v4i32_to_v4i64:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_sextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
  %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in
  %ext = sext <4 x i32> %ld to <4 x i64>
  store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
  ret void
}

; Tests if ds_read/write_b128 gets generated for the 16 byte aligned load.
; FUNC-LABEL: {{^}}local_v4i32_to_128:

; SI-NOT: ds_read_b128
; SI-NOT: ds_write_b128

; CIVI: ds_read_b128
; CIVI: ds_write_b128

; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
define amdgpu_kernel void @local_v4i32_to_128(<4 x i32> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) {
  %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 16
  store <4 x i32> %ld, <4 x i32> addrspace(3)* %out, align 16
  ret void
}

; FUNC-LABEL: {{^}}local_zextload_v8i32_to_v8i64:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_zextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
  %ld = load <8 x i32>, <8 x i32> addrspace(3)* %in
  %ext = zext <8 x i32> %ld to <8 x i64>
  store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
  ret void
}

; FUNC-LABEL: {{^}}local_sextload_v8i32_to_v8i64:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_sextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
  %ld = load <8 x i32>, <8 x i32> addrspace(3)* %in
  %ext = sext <8 x i32> %ld to <8 x i64>
  store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
  ret void
}

; FUNC-LABEL: {{^}}local_sextload_v16i32_to_v16i64:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_sextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
  %ld = load <16 x i32>, <16 x i32> addrspace(3)* %in
  %ext = sext <16 x i32> %ld to <16 x i64>
  store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
  ret void
}

; FUNC-LABEL: {{^}}local_zextload_v16i32_to_v16i64
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_zextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
  %ld = load <16 x i32>, <16 x i32> addrspace(3)* %in
  %ext = zext <16 x i32> %ld to <16 x i64>
  store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
  ret void
}

; FUNC-LABEL: {{^}}local_sextload_v32i32_to_v32i64:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_sextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 {
  %ld = load <32 x i32>, <32 x i32> addrspace(3)* %in
  %ext = sext <32 x i32> %ld to <32 x i64>
  store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
  ret void
}

; FUNC-LABEL: {{^}}local_zextload_v32i32_to_v32i64:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_zextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 {
  %ld = load <32 x i32>, <32 x i32> addrspace(3)* %in
  %ext = zext <32 x i32> %ld to <32 x i64>
  store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
  ret void
}

attributes #0 = { nounwind }