1; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SICIVI %s
2; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs< %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,SICIVI %s
3; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SICIVI %s
4; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
5
; Loads an i32 through a local (addrspace(3)) pointer at element index 7 and
; stores it through the addrspace(1) pointer %out. The checks expect the GEP to
; be folded into the DS read as an immediate byte offset of 28 (7 * 4), an
; s_mov_b32 to m0 on the SI/CI/VI runs, and no mention of m0 on the GFX9 run.
; The label pattern ends with ':' so it cannot also match the longer symbol
; local_i32_load_0_offset.
6; GCN-LABEL: {{^}}local_i32_load:
7; SICIVI: s_mov_b32 m0
8; GFX9-NOT: m0
9
10; GCN: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} offset:28
11; GCN: buffer_store_dword [[REG]],
12define amdgpu_kernel void @local_i32_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind {
13  %gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
14  %val = load i32, i32 addrspace(3)* %gep, align 4
15  store i32 %val, i32 addrspace(1)* %out, align 4
16  ret void
17}
18
; Loads an i32 directly through the local (addrspace(3)) pointer %in, with no
; GEP, and stores it through %out. The checks expect a ds_read_b32 whose result
; register is the one written by buffer_store_dword.
; The label pattern ends with ':' to match the complete symbol, consistent with
; the other labels in this file.
; NOTE(review): the ds_read_b32 pattern is not anchored at end of line, so on
; its own it does not reject a trailing offset operand.
19; GCN-LABEL: {{^}}local_i32_load_0_offset:
20; SICIVI: s_mov_b32 m0
21; GFX9-NOT: m0
22
23; GCN: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}}
24; GCN: buffer_store_dword [[REG]],
25define amdgpu_kernel void @local_i32_load_0_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind {
26  %val = load i32, i32 addrspace(3)* %in, align 4
27  store i32 %val, i32 addrspace(1)* %out, align 4
28  ret void
29}
30
; Loads an i8 through a local (addrspace(3)) pointer at byte offset 65535
; (0xffff, the largest unsigned 16-bit value) and stores it through %out.
; The checks expect the whole offset to appear as the ds_read_u8 immediate,
; with no "add" emitted before the read.
31; GCN-LABEL: {{^}}local_i8_load_i16_max_offset:
32; SICIVI: s_mov_b32 m0
33; GFX9-NOT: m0
34
35; GCN-NOT: add
36; GCN: ds_read_u8 [[REG:v[0-9]+]], {{v[0-9]+}} offset:65535
37; GCN: buffer_store_byte [[REG]],
38define amdgpu_kernel void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind {
39  %gep = getelementptr i8, i8 addrspace(3)* %in, i32 65535
40  %val = load i8, i8 addrspace(3)* %gep, align 4
41  store i8 %val, i8 addrspace(1)* %out, align 4
42  ret void
43}
44
; Loads an i8 through a local (addrspace(3)) pointer at byte offset 65536
; (0x10000), one past the offset used by local_i8_load_i16_max_offset.
; The checks expect the offset to be applied with a scalar instruction
; (s_or_b32 on the SI run, s_add_i32 on the CI, VI and GFX9 runs), the result
; copied into a VGPR, and that VGPR used as the ds_read_u8 address with no
; offset operand matched.
45; GCN-LABEL: {{^}}local_i8_load_over_i16_max_offset:
46; SICIVI-DAG: s_mov_b32 m0
47; GFX9-NOT: m0
48
49; The LDS offset will be 65536 bytes, which is larger than the size of LDS on
50; SI, which is why it is being OR'd with the base pointer.
51; SI-DAG: s_or_b32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
52; CI-DAG: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
53; VI-DAG: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
54; GFX9-DAG: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
55
56; GCN-DAG: v_mov_b32_e32 [[VREGADDR:v[0-9]+]], [[ADDR]]
57; GCN: ds_read_u8 [[REG:v[0-9]+]], [[VREGADDR]]
58; GCN: buffer_store_byte [[REG]],
59define amdgpu_kernel void @local_i8_load_over_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind {
60  %gep = getelementptr i8, i8 addrspace(3)* %in, i32 65536
61  %val = load i8, i8 addrspace(3)* %gep, align 4
62  store i8 %val, i8 addrspace(1)* %out, align 4
63  ret void
64}
65
; Loads an i64 through a local (addrspace(3)) pointer at element index 7 and
; stores it through %out. The checks expect a single ds_read_b64 with an
; immediate byte offset of 56 (7 * 8), no "add" before it, and the same
; register pair consumed by buffer_store_dwordx2.
; The register-pair capture escapes its literal '[' and ']' so the pattern
; matches exactly v[lo:hi], as the other 64-bit checks in this file do.
66; GCN-LABEL: {{^}}local_i64_load:
67; SICIVI: s_mov_b32 m0
68; GFX9-NOT: m0
69
70; GCN-NOT: add
71; GCN: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset:56
72; GCN: buffer_store_dwordx2 [[REG]],
73define amdgpu_kernel void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind {
74  %gep = getelementptr i64, i64 addrspace(3)* %in, i32 7
75  %val = load i64, i64 addrspace(3)* %gep, align 8
76  store i64 %val, i64 addrspace(1)* %out, align 8
77  ret void
78}
79
; Loads an i64 directly through the local (addrspace(3)) pointer %in, with no
; GEP, and stores it through %out. The checks expect a ds_read_b64 whose
; register pair is the one written by buffer_store_dwordx2.
; The label pattern ends with ':' to match the complete symbol, consistent with
; the other labels in this file.
80; GCN-LABEL: {{^}}local_i64_load_0_offset:
81; SICIVI: s_mov_b32 m0
82; GFX9-NOT: m0
83
84; GCN: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
85; GCN: buffer_store_dwordx2 [[REG]],
86define amdgpu_kernel void @local_i64_load_0_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind {
87  %val = load i64, i64 addrspace(3)* %in, align 8
88  store i64 %val, i64 addrspace(1)* %out, align 8
89  ret void
90}
91
; Loads a double through a local (addrspace(3)) pointer at element index 7 and
; stores it through %out. The checks expect a single ds_read_b64 with an
; immediate byte offset of 56 (7 * 8), no "add" before it, and the same
; register pair consumed by buffer_store_dwordx2.
; The register-pair capture escapes its literal '[' and ']' so the pattern
; matches exactly v[lo:hi], as the other 64-bit checks in this file do.
92; GCN-LABEL: {{^}}local_f64_load:
93; SICIVI: s_mov_b32 m0
94; GFX9-NOT: m0
95
96; GCN-NOT: add
97; GCN: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset:56
98; GCN: buffer_store_dwordx2 [[REG]],
99define amdgpu_kernel void @local_f64_load(double addrspace(1)* %out, double addrspace(3)* %in) nounwind {
100  %gep = getelementptr double, double addrspace(3)* %in, i32 7
101  %val = load double, double addrspace(3)* %gep, align 8
102  store double %val, double addrspace(1)* %out, align 8
103  ret void
104}
105
; Loads a double directly through the local (addrspace(3)) pointer %in, with
; no GEP, and stores it through %out. The checks expect a ds_read_b64 whose
; register pair is the one written by buffer_store_dwordx2.
; The label pattern ends with ':' to match the complete symbol, consistent with
; the other labels in this file.
106; GCN-LABEL: {{^}}local_f64_load_0_offset:
107; SICIVI: s_mov_b32 m0
108; GFX9-NOT: m0
109
110; GCN: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
111; GCN: buffer_store_dwordx2 [[REG]],
112define amdgpu_kernel void @local_f64_load_0_offset(double addrspace(1)* %out, double addrspace(3)* %in) nounwind {
113  %val = load double, double addrspace(3)* %in, align 8
114  store double %val, double addrspace(1)* %out, align 8
115  ret void
116}
117
; Stores the i64 constant 5678 through a local (addrspace(3)) pointer at
; element index 7. The checks expect a single ds_write_b64 carrying an
; immediate byte offset of 56 (7 * 8), with no "add" emitted before it.
118; GCN-LABEL: {{^}}local_i64_store:
119; SICIVI: s_mov_b32 m0
120; GFX9-NOT: m0
121
122; GCN-NOT: add
123; GCN: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56
124define amdgpu_kernel void @local_i64_store(i64 addrspace(3)* %out) nounwind {
125  %gep = getelementptr i64, i64 addrspace(3)* %out, i32 7
126  store i64 5678, i64 addrspace(3)* %gep, align 8
127  ret void
128}
129
; Stores the i64 constant 1234 directly through the local (addrspace(3))
; pointer %out, with no GEP. The checks expect a ds_write_b64 of a VGPR pair
; with no "add" emitted before it.
130; GCN-LABEL: {{^}}local_i64_store_0_offset:
131; SICIVI: s_mov_b32 m0
132; GFX9-NOT: m0
133
134; GCN-NOT: add
135; GCN: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
136define amdgpu_kernel void @local_i64_store_0_offset(i64 addrspace(3)* %out) nounwind {
137  store i64 1234, i64 addrspace(3)* %out, align 8
138  ret void
139}
140
; Stores the double constant 16.0 through a local (addrspace(3)) pointer at
; element index 7. The checks expect a single ds_write_b64 carrying an
; immediate byte offset of 56 (7 * 8), with no "add" emitted before it.
141; GCN-LABEL: {{^}}local_f64_store:
142; SICIVI: s_mov_b32 m0
143; GFX9-NOT: m0
144
145; GCN-NOT: add
146; GCN: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56
147define amdgpu_kernel void @local_f64_store(double addrspace(3)* %out) nounwind {
148  %gep = getelementptr double, double addrspace(3)* %out, i32 7
149  store double 16.0, double addrspace(3)* %gep, align 8
150  ret void
151}
152
; Stores the double constant 20.0 directly through the local (addrspace(3))
; pointer %out, with no GEP. The checks expect a ds_write_b64 of a VGPR pair.
; The label pattern ends with ':' to match the complete symbol, consistent with
; the other labels in this file.
153; GCN-LABEL: {{^}}local_f64_store_0_offset:
154; SICIVI: s_mov_b32 m0
155; GFX9-NOT: m0
156
157; GCN: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
158define amdgpu_kernel void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind {
159  store double 20.0, double addrspace(3)* %out, align 8
160  ret void
161}
162
; Stores the constant vector <5678, 5678> (<2 x i64>) through a local
; (addrspace(3)) pointer at element index 7, i.e. 7 * 16 = 112 bytes past %out.
; The checks expect one ds_write2_b64 with offset0:14 offset1:15 (consistent
; with offsets counted in 8-byte units, 14 * 8 = 112), no "add" before it, and
; the program end after it.
163; GCN-LABEL: {{^}}local_v2i64_store:
164; SICIVI: s_mov_b32 m0
165; GFX9-NOT: m0
166
167; GCN-NOT: add
168; GCN: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15
169; GCN: s_endpgm
170define amdgpu_kernel void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind {
171  %gep = getelementptr <2 x i64>, <2 x i64> addrspace(3)* %out, i32 7
172  store <2 x i64> <i64 5678, i64 5678>, <2 x i64> addrspace(3)* %gep, align 16
173  ret void
174}
175
; Stores the constant vector <1234, 1234> (<2 x i64>) directly through the
; local (addrspace(3)) pointer %out, with no GEP. The checks expect one
; ds_write2_b64 whose only printed offset is offset1:1, no "add" before it,
; and the program end after it.
176; GCN-LABEL: {{^}}local_v2i64_store_0_offset:
177; SICIVI: s_mov_b32 m0
178; GFX9-NOT: m0
179
180; GCN-NOT: add
181; GCN: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:1
182; GCN: s_endpgm
183define amdgpu_kernel void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind {
184  store <2 x i64> <i64 1234, i64 1234>, <2 x i64> addrspace(3)* %out, align 16
185  ret void
186}
187
; Stores a <4 x i64> vector of four 5678 constants through a local
; (addrspace(3)) pointer at element index 7, i.e. 7 * 32 = 224 bytes past %out.
; The checks expect two ds_write2_b64 instructions, in either order, covering
; offsets 28/29 and 30/31 (consistent with offsets counted in 8-byte units,
; 28 * 8 = 224), no "add" before them, and the program end after them.
188; GCN-LABEL: {{^}}local_v4i64_store:
189; SICIVI: s_mov_b32 m0
190; GFX9-NOT: m0
191
192; GCN-NOT: add
193; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:30 offset1:31
194; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:28 offset1:29
195; GCN: s_endpgm
196define amdgpu_kernel void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind {
197  %gep = getelementptr <4 x i64>, <4 x i64> addrspace(3)* %out, i32 7
198  store <4 x i64> <i64 5678, i64 5678, i64 5678, i64 5678>, <4 x i64> addrspace(3)* %gep, align 16
199  ret void
200}
201
; Stores a <4 x i64> vector of four 1234 constants directly through the local
; (addrspace(3)) pointer %out, with no GEP. The checks expect two
; ds_write2_b64 instructions, in either order: one printed with
; offset0:2 offset1:3 and one printed with only offset1:1. No "add" is
; expected before them, and the program end is expected after them.
202; GCN-LABEL: {{^}}local_v4i64_store_0_offset:
203; SICIVI: s_mov_b32 m0
204; GFX9-NOT: m0
205
206; GCN-NOT: add
207; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
208; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:1
209; GCN: s_endpgm
210define amdgpu_kernel void @local_v4i64_store_0_offset(<4 x i64> addrspace(3)* %out) nounwind {
211  store <4 x i64> <i64 1234, i64 1234, i64 1234, i64 1234>, <4 x i64> addrspace(3)* %out, align 16
212  ret void
213}
214