; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s

; Testing for ds_read/write_128
; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+enable-ds128 < %s | FileCheck -check-prefixes=SI,FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+enable-ds128 < %s | FileCheck -check-prefixes=CIVI,FUNC %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+enable-ds128 < %s | FileCheck -check-prefixes=CIVI,FUNC %s

; FUNC-LABEL: {{^}}local_load_i32:
; GCN-NOT: s_wqm_b64
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0
; GCN: ds_read_b32

; EG: LDS_READ_RET
define amdgpu_kernel void @local_load_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
entry:
  %ld = load i32, i32 addrspace(3)* %in
  store i32 %ld, i32 addrspace(3)* %out
  ret void
}

; FUNC-LABEL: {{^}}local_load_v2i32:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

; GCN: ds_read_b64
define amdgpu_kernel void @local_load_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
entry:
  %ld = load <2 x i32>, <2 x i32> addrspace(3)* %in
  store <2 x i32> %ld, <2 x i32> addrspace(3)* %out
  ret void
}

; FUNC-LABEL: {{^}}local_load_v3i32:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

; GCN-DAG: ds_read_b64
; GCN-DAG: ds_read_b32
define amdgpu_kernel void @local_load_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> addrspace(3)* %in) #0 {
entry:
  %ld = load <3 x i32>, <3 x i32> addrspace(3)* %in
  store <3 x i32> %ld, <3 x i32> addrspace(3)* %out
  ret void
}

; FUNC-LABEL: {{^}}local_load_v4i32:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}

define amdgpu_kernel void @local_load_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
entry:
  %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in
  store <4 x i32> %ld, <4 x i32> addrspace(3)* %out
  ret void
}

; FUNC-LABEL: {{^}}local_load_v8i32:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
define amdgpu_kernel void @local_load_v8i32(<8 x i32> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
entry:
  %ld = load <8 x i32>, <8 x i32> addrspace(3)* %in
  store <8 x i32> %ld, <8 x i32> addrspace(3)* %out
  ret void
}

; FUNC-LABEL: {{^}}local_load_v16i32:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1
define amdgpu_kernel void @local_load_v16i32(<16 x i32> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
entry:
  %ld = load <16 x i32>, <16 x i32> addrspace(3)* %in
  store <16 x i32> %ld, <16 x i32> addrspace(3)* %out
  ret void
}

; FUNC-LABEL: {{^}}local_zextload_i32_to_i64:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_zextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
  %ld = load i32, i32 addrspace(3)* %in
  %ext = zext i32 %ld to i64
  store i64 %ext, i64 addrspace(3)* %out
  ret void
}

; FUNC-LABEL: {{^}}local_sextload_i32_to_i64:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_sextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
  %ld = load i32, i32 addrspace(3)* %in
  %ext = sext i32 %ld to i64
  store i64 %ext, i64 addrspace(3)* %out
  ret void
}

; FUNC-LABEL: {{^}}local_zextload_v1i32_to_v1i64:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_zextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 {
  %ld = load <1 x i32>, <1 x i32> addrspace(3)* %in
  %ext = zext <1 x i32> %ld to <1 x i64>
  store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
  ret void
}

; FUNC-LABEL: {{^}}local_sextload_v1i32_to_v1i64:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_sextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 {
  %ld = load <1 x i32>, <1 x i32> addrspace(3)* %in
  %ext = sext <1 x i32> %ld to <1 x i64>
  store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
  ret void
}

; FUNC-LABEL: {{^}}local_zextload_v2i32_to_v2i64:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_zextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
  %ld = load <2 x i32>, <2 x i32> addrspace(3)* %in
  %ext = zext <2 x i32> %ld to <2 x i64>
  store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
  ret void
}

; FUNC-LABEL: {{^}}local_sextload_v2i32_to_v2i64:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_sextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
  %ld = load <2 x i32>, <2 x i32> addrspace(3)* %in
  %ext = sext <2 x i32> %ld to <2 x i64>
  store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
  ret void
}

; FUNC-LABEL: {{^}}local_zextload_v4i32_to_v4i64:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_zextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
  %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in
  %ext = zext <4 x i32> %ld to <4 x i64>
  store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
  ret void
}

; FUNC-LABEL: {{^}}local_sextload_v4i32_to_v4i64:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_sextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
  %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in
  %ext = sext <4 x i32> %ld to <4 x i64>
  store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
  ret void
}

; Tests if ds_read/write_b128 gets generated for the 16 byte aligned load.
; FUNC-LABEL: {{^}}local_v4i32_to_128:

; SI-NOT: ds_read_b128
; SI-NOT: ds_write_b128

; CIVI: ds_read_b128
; CIVI: ds_write_b128

; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
define amdgpu_kernel void @local_v4i32_to_128(<4 x i32> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) {
  %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 16
  store <4 x i32> %ld, <4 x i32> addrspace(3)* %out, align 16
  ret void
}

; FUNC-LABEL: {{^}}local_zextload_v8i32_to_v8i64:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_zextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
  %ld = load <8 x i32>, <8 x i32> addrspace(3)* %in
  %ext = zext <8 x i32> %ld to <8 x i64>
  store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
  ret void
}

; FUNC-LABEL: {{^}}local_sextload_v8i32_to_v8i64:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_sextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
  %ld = load <8 x i32>, <8 x i32> addrspace(3)* %in
  %ext = sext <8 x i32> %ld to <8 x i64>
  store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
  ret void
}

; FUNC-LABEL: {{^}}local_sextload_v16i32_to_v16i64:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_sextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
  %ld = load <16 x i32>, <16 x i32> addrspace(3)* %in
  %ext = sext <16 x i32> %ld to <16 x i64>
  store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
  ret void
}

; FUNC-LABEL: {{^}}local_zextload_v16i32_to_v16i64:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_zextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
  %ld = load <16 x i32>, <16 x i32> addrspace(3)* %in
  %ext = zext <16 x i32> %ld to <16 x i64>
  store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
  ret void
}

; FUNC-LABEL: {{^}}local_sextload_v32i32_to_v32i64:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_sextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 {
  %ld = load <32 x i32>, <32 x i32> addrspace(3)* %in
  %ext = sext <32 x i32> %ld to <32 x i64>
  store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
  ret void
}

; FUNC-LABEL: {{^}}local_zextload_v32i32_to_v32i64:
; SICIVI: s_mov_b32 m0, -1
; GFX9-NOT: m0

define amdgpu_kernel void @local_zextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 {
  %ld = load <32 x i32>, <32 x i32> addrspace(3)* %in
  %ext = zext <32 x i32> %ld to <32 x i64>
  store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
  ret void
}
attributes #0 = { nounwind }