; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s

; Kernel under test: an i32 load tagged !nontemporal from an addrspace(3)
; pointer passed as a kernel argument, whose result is stored to an
; addrspace(1) pointer. The autogenerated checks below expect the load to be
; emitted as a ds_read_b32 with no extra modifiers for every run configuration.
define amdgpu_kernel void @local_nontemporal_load_0(
; GFX6-LABEL: local_nontemporal_load_0:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s4
; GFX6-NEXT:    ds_read_b32 v0, v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_nontemporal_load_0:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s2
; GFX7-NEXT:    ds_read_b32 v2, v0
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_nontemporal_load_0:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_nontemporal_load_0:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT:    ds_read_b32 v0, v0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_nontemporal_load_0:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT:    s_endpgm
    i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
entry:
  %val = load i32, i32 addrspace(3)* %in, align 4, !nontemporal !0
  store i32 %val, i32 addrspace(1)* %out
  ret void
}

; Kernel under test: same shape as a plain !nontemporal addrspace(3) load, but
; the address is %in indexed by the workitem id in x (via getelementptr), so
; the address is computed per lane. The loaded i32 is stored to the
; addrspace(1) pointer %out. The autogenerated checks below expect a
; ds_read_b32 with no extra modifiers for every run configuration.
define amdgpu_kernel void @local_nontemporal_load_1(
; GFX6-LABEL: local_nontemporal_load_1:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT:    ds_read_b32 v0, v0
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_nontemporal_load_1:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
; GFX7-NEXT:    ds_read_b32 v2, v0
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_nontemporal_load_1:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_nontemporal_load_1:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
; GFX10-CU-NEXT:    ds_read_b32 v0, v0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_nontemporal_load_1:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT:    s_endpgm
    i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %val.gep = getelementptr inbounds i32, i32 addrspace(3)* %in, i32 %tid
  %val = load i32, i32 addrspace(3)* %val.gep, align 4, !nontemporal !0
  store i32 %val, i32 addrspace(1)* %out
  ret void
}

; Kernel under test: an i32 is loaded from the addrspace(1) pointer %in and
; stored, tagged !nontemporal, to the addrspace(3) pointer %out passed as a
; kernel argument. The autogenerated checks below expect the store to be
; emitted as a ds_write_b32 with no extra modifiers for every run
; configuration.
define amdgpu_kernel void @local_nontemporal_store_0(
; GFX6-LABEL: local_nontemporal_store_0:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_load_dword s1, s[2:3], 0x0
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    ds_write_b32 v0, v1
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_nontemporal_store_0:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX7-NEXT:    v_mov_b32_e32 v0, s2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v1, s0
; GFX7-NEXT:    ds_write_b32 v0, v1
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_nontemporal_store_0:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_nontemporal_store_0:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-CU-NEXT:    ds_write_b32 v0, v1
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_nontemporal_store_0:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT:    s_endpgm
    i32 addrspace(1)* %in, i32 addrspace(3)* %out) {
entry:
  %val = load i32, i32 addrspace(1)* %in, align 4
  store i32 %val, i32 addrspace(3)* %out, !nontemporal !0
  ret void
}

; Kernel under test: an i32 is loaded from the addrspace(1) pointer %in and
; stored, tagged !nontemporal, to the addrspace(3) pointer %out indexed by the
; workitem id in x (via getelementptr), so the store address is computed per
; lane. The autogenerated checks below expect a ds_write_b32 with no extra
; modifiers for every run configuration.
define amdgpu_kernel void @local_nontemporal_store_1(
; GFX6-LABEL: local_nontemporal_store_1:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_load_dword s1, s[2:3], 0x0
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    ds_write_b32 v0, v1
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: local_nontemporal_store_1:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v1, s0
; GFX7-NEXT:    ds_write_b32 v0, v1
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: local_nontemporal_store_1:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: local_nontemporal_store_1:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-CU-NEXT:    ds_write_b32 v0, v1
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_nontemporal_store_1:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
; SKIP-CACHE-INV-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT:    s_endpgm
    i32 addrspace(1)* %in, i32 addrspace(3)* %out) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %val = load i32, i32 addrspace(1)* %in, align 4
  %out.gep = getelementptr inbounds i32, i32 addrspace(3)* %out, i32 %tid
  store i32 %val, i32 addrspace(3)* %out.gep, !nontemporal !0
  ret void
}

; Metadata node referenced by the !nontemporal attachments on the loads and
; stores in this file.
!0 = !{i32 1}
; Intrinsic called by the *_1 kernels to obtain the workitem id in x.
declare i32 @llvm.amdgcn.workitem.id.x()