; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SICIVI %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs< %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,SICIVI %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SICIVI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s

; Checks the instructions selected for loads and stores through addrspace(3)
; pointers, with and without a constant getelementptr offset.
;
; In every test the three runs that share the SICIVI prefix expect a write to
; m0, and the gfx900 run expects m0 not to be mentioned at all.

; Load of element 7 of an i32 array; the read is expected to carry offset:28.
; GCN-LABEL: {{^}}local_i32_load:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} offset:28
; GCN: buffer_store_dword [[REG]],
define amdgpu_kernel void @local_i32_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind {
  %gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
  %val = load i32, i32 addrspace(3)* %gep, align 4
  store i32 %val, i32 addrspace(1)* %out, align 4
  ret void
}

; Same load taken directly from the incoming pointer (no getelementptr).
; GCN-LABEL: {{^}}local_i32_load_0_offset:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}}
; GCN: buffer_store_dword [[REG]],
define amdgpu_kernel void @local_i32_load_0_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind {
  %val = load i32, i32 addrspace(3)* %in, align 4
  store i32 %val, i32 addrspace(1)* %out, align 4
  ret void
}

; Byte offset 65535 is expected on the read itself, with no add before it.
; GCN-LABEL: {{^}}local_i8_load_i16_max_offset:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN-NOT: add
; GCN: ds_read_u8 [[REG:v[0-9]+]], {{v[0-9]+}} offset:65535
; GCN: buffer_store_byte [[REG]],
define amdgpu_kernel void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind {
  %gep = getelementptr i8, i8 addrspace(3)* %in, i32 65535
  %val = load i8, i8 addrspace(3)* %gep, align 4
  store i8 %val, i8 addrspace(1)* %out, align 4
  ret void
}

; Byte offset 65536 is expected to be combined with the base in a scalar
; register (0x10000 operand), copied to a vector register, and used as the
; address of the read.
; GCN-LABEL: {{^}}local_i8_load_over_i16_max_offset:
; SICIVI-DAG: s_mov_b32 m0
; GFX9-NOT: m0

; The LDS offset will be 65536 bytes, which is larger than the size of LDS on
; SI, which is why it is being OR'd with the base pointer.
; SI-DAG: s_or_b32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
; CI-DAG: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
; VI-DAG: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
; GFX9-DAG: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000

; GCN-DAG: v_mov_b32_e32 [[VREGADDR:v[0-9]+]], [[ADDR]]
; GCN: ds_read_u8 [[REG:v[0-9]+]], [[VREGADDR]]
; GCN: buffer_store_byte [[REG]],
define amdgpu_kernel void @local_i8_load_over_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind {
  %gep = getelementptr i8, i8 addrspace(3)* %in, i32 65536
  %val = load i8, i8 addrspace(3)* %gep, align 4
  store i8 %val, i8 addrspace(1)* %out, align 4
  ret void
}

; Load of element 7 of an i64 array; a single 64-bit read with offset:56 and
; no add is expected.
; GCN-LABEL: {{^}}local_i64_load:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN-NOT: add
; GCN: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset:56
; GCN: buffer_store_dwordx2 [[REG]],
define amdgpu_kernel void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind {
  %gep = getelementptr i64, i64 addrspace(3)* %in, i32 7
  %val = load i64, i64 addrspace(3)* %gep, align 8
  store i64 %val, i64 addrspace(1)* %out, align 8
  ret void
}

; i64 load taken directly from the incoming pointer.
; GCN-LABEL: {{^}}local_i64_load_0_offset:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
; GCN: buffer_store_dwordx2 [[REG]],
define amdgpu_kernel void @local_i64_load_0_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind {
  %val = load i64, i64 addrspace(3)* %in, align 8
  store i64 %val, i64 addrspace(1)* %out, align 8
  ret void
}

; Load of element 7 of a double array; same expectations as the i64 case.
; GCN-LABEL: {{^}}local_f64_load:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN-NOT: add
; GCN: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset:56
; GCN: buffer_store_dwordx2 [[REG]],
define amdgpu_kernel void @local_f64_load(double addrspace(1)* %out, double addrspace(3)* %in) nounwind {
  %gep = getelementptr double, double addrspace(3)* %in, i32 7
  %val = load double, double addrspace(3)* %gep, align 8
  store double %val, double addrspace(1)* %out, align 8
  ret void
}

; double load taken directly from the incoming pointer.
; GCN-LABEL: {{^}}local_f64_load_0_offset:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
; GCN: buffer_store_dwordx2 [[REG]],
define amdgpu_kernel void @local_f64_load_0_offset(double addrspace(1)* %out, double addrspace(3)* %in) nounwind {
  %val = load double, double addrspace(3)* %in, align 8
  store double %val, double addrspace(1)* %out, align 8
  ret void
}

; Store of a constant to element 7 of an i64 array; a single 64-bit write with
; offset:56 and no add is expected.
; GCN-LABEL: {{^}}local_i64_store:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN-NOT: add
; GCN: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56
define amdgpu_kernel void @local_i64_store(i64 addrspace(3)* %out) nounwind {
  %gep = getelementptr i64, i64 addrspace(3)* %out, i32 7
  store i64 5678, i64 addrspace(3)* %gep, align 8
  ret void
}

; i64 constant store directly to the incoming pointer.
; GCN-LABEL: {{^}}local_i64_store_0_offset:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN-NOT: add
; GCN: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @local_i64_store_0_offset(i64 addrspace(3)* %out) nounwind {
  store i64 1234, i64 addrspace(3)* %out, align 8
  ret void
}

; Store of a constant to element 7 of a double array; same expectations as the
; i64 case.
; GCN-LABEL: {{^}}local_f64_store:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN-NOT: add
; GCN: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56
define amdgpu_kernel void @local_f64_store(double addrspace(3)* %out) nounwind {
  %gep = getelementptr double, double addrspace(3)* %out, i32 7
  store double 16.0, double addrspace(3)* %gep, align 8
  ret void
}

; double constant store directly to the incoming pointer.
; GCN-LABEL: {{^}}local_f64_store_0_offset:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind {
  store double 20.0, double addrspace(3)* %out, align 8
  ret void
}

; Store to element 7 of a <2 x i64> array; one paired 64-bit write with
; offset0:14 offset1:15 and no add is expected.
; GCN-LABEL: {{^}}local_v2i64_store:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN-NOT: add
; GCN: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15
; GCN: s_endpgm
define amdgpu_kernel void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind {
  %gep = getelementptr <2 x i64>, <2 x i64> addrspace(3)* %out, i32 7
  store <2 x i64> <i64 5678, i64 5678>, <2 x i64> addrspace(3)* %gep, align 16
  ret void
}

; <2 x i64> store directly to the incoming pointer; one paired write with
; offset1:1 is expected.
; GCN-LABEL: {{^}}local_v2i64_store_0_offset:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN-NOT: add
; GCN: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:1
; GCN: s_endpgm
define amdgpu_kernel void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind {
  store <2 x i64> <i64 1234, i64 1234>, <2 x i64> addrspace(3)* %out, align 16
  ret void
}

; Store to element 7 of a <4 x i64> array; two paired 64-bit writes are
; expected (offsets 28/29 and 30/31), accepted in either order.
; GCN-LABEL: {{^}}local_v4i64_store:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN-NOT: add
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:30 offset1:31
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:28 offset1:29
; GCN: s_endpgm
define amdgpu_kernel void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind {
  %gep = getelementptr <4 x i64>, <4 x i64> addrspace(3)* %out, i32 7
  store <4 x i64> <i64 5678, i64 5678, i64 5678, i64 5678>, <4 x i64> addrspace(3)* %gep, align 16
  ret void
}

; <4 x i64> store directly to the incoming pointer; two paired writes are
; expected (offsets 2/3 and 0/1), accepted in either order.
; GCN-LABEL: {{^}}local_v4i64_store_0_offset:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN-NOT: add
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:1
; GCN: s_endpgm
define amdgpu_kernel void @local_v4i64_store_0_offset(<4 x i64> addrspace(3)* %out) nounwind {
  store <4 x i64> <i64 1234, i64 1234, i64 1234, i64 1234>, <4 x i64> addrspace(3)* %out, align 16
  ret void
}