1; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
2; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
3
; Copies one i16 between two addrspace(3) pointers with both the load and the
; store marked align 1. The checks below expect each access to be emitted as
; two byte-sized DS operations.
; SI-LABEL: {{^}}unaligned_load_store_i16_local:
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: s_endpgm
define void @unaligned_load_store_i16_local(i16 addrspace(3)* %p, i16 addrspace(3)* %r) nounwind {
  %halfword = load i16, i16 addrspace(3)* %p, align 1
  store i16 %halfword, i16 addrspace(3)* %r, align 1
  ret void
}
15
; Copies one i16 between two addrspace(1) pointers with both the load and the
; store marked align 1. The checks below expect two byte loads followed by two
; byte stores through the buffer instructions.
; SI-LABEL: {{^}}unaligned_load_store_i16_global:
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: s_endpgm
define void @unaligned_load_store_i16_global(i16 addrspace(1)* %p, i16 addrspace(1)* %r) nounwind {
  %halfword = load i16, i16 addrspace(1)* %p, align 1
  store i16 %halfword, i16 addrspace(1)* %r, align 1
  ret void
}
27
; Copies one i32 between two addrspace(3) pointers with both accesses marked
; align 1. The checks below expect four byte reads followed by four byte
; writes.
; SI-LABEL: {{^}}unaligned_load_store_i32_local:
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: s_endpgm
define void @unaligned_load_store_i32_local(i32 addrspace(3)* %p, i32 addrspace(3)* %r) nounwind {
  %word = load i32, i32 addrspace(3)* %p, align 1
  store i32 %word, i32 addrspace(3)* %r, align 1
  ret void
}
43
; Copies one i32 between two addrspace(1) pointers with both accesses marked
; align 1. The checks below expect four byte loads followed by four byte
; stores. The final s_endpgm check bounds the store checks to this kernel, as
; the sibling tests in this file do.
; SI-LABEL: {{^}}unaligned_load_store_i32_global:
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: s_endpgm
define void @unaligned_load_store_i32_global(i32 addrspace(1)* %p, i32 addrspace(1)* %r) nounwind {
  %v = load i32, i32 addrspace(1)* %p, align 1
  store i32 %v, i32 addrspace(1)* %r, align 1
  ret void
}
58
; Copies one i64 between two addrspace(3) pointers with both accesses marked
; align 1. The checks below expect eight byte reads followed by eight byte
; writes.
; SI-LABEL: {{^}}unaligned_load_store_i64_local:
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: s_endpgm
define void @unaligned_load_store_i64_local(i64 addrspace(3)* %p, i64 addrspace(3)* %r) {
  %quad = load i64, i64 addrspace(3)* %p, align 1
  store i64 %quad, i64 addrspace(3)* %r, align 1
  ret void
}
82
; Copies one i64 between two addrspace(1) pointers with both accesses marked
; align 1. The checks below expect eight byte loads followed by eight byte
; stores. The final s_endpgm check bounds the store checks to this kernel, as
; the sibling tests in this file do.
; SI-LABEL: {{^}}unaligned_load_store_i64_global:
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: s_endpgm
define void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) {
  %v = load i64, i64 addrspace(1)* %p, align 1
  store i64 %v, i64 addrspace(1)* %r, align 1
  ret void
}
105
; Copies a <4 x i32> between two addrspace(3) pointers with both accesses
; marked align 1. The checks below expect sixteen byte reads followed by
; sixteen byte writes.
; SI-LABEL: {{^}}unaligned_load_store_v4i32_local:
; Sixteen byte-sized reads.
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; Sixteen byte-sized writes.
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: s_endpgm
define void @unaligned_load_store_v4i32_local(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) nounwind {
  %vec = load <4 x i32>, <4 x i32> addrspace(3)* %p, align 1
  store <4 x i32> %vec, <4 x i32> addrspace(3)* %r, align 1
  ret void
}
152
; Copies a <4 x i32> between two addrspace(1) pointers with both accesses
; marked align 1.
;
; FIXME: We mark v4i32 as custom, so misaligned loads are never expanded.
; The expectations below carry a FIXME- prefix, so they are not matched by the
; SI prefix used on the RUN lines; this function currently has no active
; output checks. Drop the FIXME- prefix once the expansion is implemented.
; FIXME-SI-LABEL: {{^}}unaligned_load_store_v4i32_global:
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
; FIXME-SI: buffer_load_ubyte
define void @unaligned_load_store_v4i32_global(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) nounwind {
  %v = load <4 x i32>, <4 x i32> addrspace(1)* %p, align 1
  store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 1
  ret void
}
176
; Loads an i64 from addrspace(3) with align 4 and stores it to addrspace(1)
; with align 8. The check below expects the load to be emitted as a single
; ds_read2_b32.
; SI-LABEL: {{^}}load_lds_i64_align_4:
; SI: ds_read2_b32
; SI: s_endpgm
define void @load_lds_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
  %wide = load i64, i64 addrspace(3)* %in, align 4
  store i64 %wide, i64 addrspace(1)* %out, align 8
  ret void
}
185
; Loads an align-4 i64 from addrspace(3) at element index 4 past %in (a
; 32-byte displacement) and stores it to addrspace(1). The check below expects
; the displacement to be folded into the ds_read2_b32 offsets as the dword
; pair 8 and 9.
; SI-LABEL: {{^}}load_lds_i64_align_4_with_offset:
; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:8 offset1:9
; SI: s_endpgm
define void @load_lds_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
  %ptr = getelementptr i64, i64 addrspace(3)* %in, i32 4
  %val = load i64, i64 addrspace(3)* %ptr, align 4
  store i64 %val, i64 addrspace(1)* %out, align 8
  ret void
}
195
; Loads an align-4 i64 from addrspace(3) starting 255 dwords past %in and
; stores it to addrspace(1).
; SI-LABEL: {{^}}load_lds_i64_align_4_with_split_offset:
; This tests the case where the lo offset (255) fits in 8 bits, but the hi
; offset (256) needs 9 bits. The check below expects the instruction to carry
; only offset1:1.
; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset1:1
; SI: s_endpgm
define void @load_lds_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
  %ptr = bitcast i64 addrspace(3)* %in to i32 addrspace(3)*
  %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255
  %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
  %val = load i64, i64 addrspace(3)* %ptri64, align 4
  store i64 %val, i64 addrspace(1)* %out, align 8
  ret void
}
208
; Loads an i64 from addrspace(3) with align 1 and stores it to addrspace(1)
; with align 8. The checks below expect eight byte reads for the load and one
; buffer_store_dwordx2 for the store.
; SI-LABEL: {{^}}load_lds_i64_align_1:
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: buffer_store_dwordx2
; SI: s_endpgm
define void @load_lds_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
  %wide = load i64, i64 addrspace(3)* %in, align 1
  store i64 %wide, i64 addrspace(1)* %out, align 8
  ret void
}
226
; Stores the i64 argument %val to the addrspace(3) pointer %out with align 4.
; The check below expects the store to be emitted as a single ds_write2_b32.
; SI-LABEL: {{^}}store_lds_i64_align_4:
; SI: ds_write2_b32
; SI: s_endpgm
define void @store_lds_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 {
  store i64 %val, i64 addrspace(3)* %out, align 4
  ret void
}
234
; Stores an align-4 i64 zero to addrspace(3) at element index 4 past %out (a
; 32-byte displacement). The check below expects the displacement to be folded
; into the ds_write2_b32 offsets as the dword pair 8 and 9.
; SI-LABEL: {{^}}store_lds_i64_align_4_with_offset:
; SI: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:8 offset1:9
; SI: s_endpgm
define void @store_lds_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 {
  %ptr = getelementptr i64, i64 addrspace(3)* %out, i32 4
  store i64 0, i64 addrspace(3)* %ptr, align 4
  ret void
}
243
; Stores an align-4 i64 zero to addrspace(3) starting 255 dwords past %out.
; SI-LABEL: {{^}}store_lds_i64_align_4_with_split_offset:
; This tests the case where the lo offset (255) fits in 8 bits, but the hi
; offset (256) needs 9 bits. The check below expects the instruction to carry
; only offset1:1.
; SI: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1
; SI: s_endpgm
define void @store_lds_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #0 {
  %ptr = bitcast i64 addrspace(3)* %out to i32 addrspace(3)*
  %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255
  %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
  ; Store through the displaced pointer: storing to %out would leave the
  ; pointer chain above dead and never exercise the split-offset case.
  store i64 0, i64 addrspace(3)* %ptri64, align 4
  ret void
}
255