1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
4; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
5; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
6; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
7
; Nontemporal load through an unindexed global pointer: reads an i32 from %in
; with !nontemporal !0 attached and writes it to %out with an ordinary store.
; In the expected output of every run line the load appears as s_load_dword
; and neither the load nor the store carries a glc/slc modifier.
; NOTE(review): the scalar load is presumably selected because the address is
; the same for every work-item - confirm against the AMDGPU backend.
8define amdgpu_kernel void @global_nontemporal_load_0(
9; GFX6-LABEL: global_nontemporal_load_0:
10; GFX6:       ; %bb.0: ; %entry
11; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
12; GFX6-NEXT:    s_mov_b32 s7, 0xf000
13; GFX6-NEXT:    s_mov_b32 s6, -1
14; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
15; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
16; GFX6-NEXT:    s_mov_b32 s4, s2
17; GFX6-NEXT:    s_mov_b32 s5, s3
18; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
19; GFX6-NEXT:    v_mov_b32_e32 v0, s0
20; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
21; GFX6-NEXT:    s_endpgm
22;
23; GFX7-LABEL: global_nontemporal_load_0:
24; GFX7:       ; %bb.0: ; %entry
25; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
26; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
27; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
28; GFX7-NEXT:    v_mov_b32_e32 v0, s2
29; GFX7-NEXT:    v_mov_b32_e32 v1, s3
30; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
31; GFX7-NEXT:    v_mov_b32_e32 v2, s0
32; GFX7-NEXT:    flat_store_dword v[0:1], v2
33; GFX7-NEXT:    s_endpgm
34;
35; GFX10-WGP-LABEL: global_nontemporal_load_0:
36; GFX10-WGP:       ; %bb.0: ; %entry
37; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
38; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
39; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
40; GFX10-WGP-NEXT:    s_load_dword s0, s[0:1], 0x0
41; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
42; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
43; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
44; GFX10-WGP-NEXT:    s_endpgm
45;
46; GFX10-CU-LABEL: global_nontemporal_load_0:
47; GFX10-CU:       ; %bb.0: ; %entry
48; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
49; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
50; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
51; GFX10-CU-NEXT:    s_load_dword s0, s[0:1], 0x0
52; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
53; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
54; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
55; GFX10-CU-NEXT:    s_endpgm
56;
57; SKIP-CACHE-INV-LABEL: global_nontemporal_load_0:
58; SKIP-CACHE-INV:       ; %bb.0: ; %entry
59; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
60; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
61; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
62; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
63; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
64; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
65; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
66; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
67; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
68; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
69; SKIP-CACHE-INV-NEXT:    s_endpgm
70    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
71entry:
  ; Access under test: the load carries the !nontemporal hint, the store does not.
72  %val = load i32, i32 addrspace(1)* %in, align 4, !nontemporal !0
73  store i32 %val, i32 addrspace(1)* %out
74  ret void
75}
76
; Nontemporal load through a per-work-item address: %in is indexed by the
; result of llvm.amdgcn.workitem.id.x, the i32 is loaded with !nontemporal !0
; attached, and the value is written to %out with an ordinary store.
; Expected load modifiers in the assertions below: "glc slc" on the
; gfx600/gfx700 buffer and flat loads, and "slc" alone on the gfx1010 global
; loads (with and without +cumode). The stores carry no modifier.
77define amdgpu_kernel void @global_nontemporal_load_1(
78; GFX6-LABEL: global_nontemporal_load_1:
79; GFX6:       ; %bb.0: ; %entry
80; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
81; GFX6-NEXT:    s_mov_b32 s3, 0xf000
82; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
83; GFX6-NEXT:    v_mov_b32_e32 v1, 0
84; GFX6-NEXT:    s_mov_b32 s2, -1
85; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
86; GFX6-NEXT:    s_mov_b32 s0, s6
87; GFX6-NEXT:    s_mov_b32 s1, s7
88; GFX6-NEXT:    s_mov_b32 s6, 0
89; GFX6-NEXT:    s_mov_b32 s7, s3
90; GFX6-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 glc slc
91; GFX6-NEXT:    s_waitcnt vmcnt(0)
92; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
93; GFX6-NEXT:    s_endpgm
94;
95; GFX7-LABEL: global_nontemporal_load_1:
96; GFX7:       ; %bb.0: ; %entry
97; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
98; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
99; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
100; GFX7-NEXT:    v_mov_b32_e32 v3, s1
101; GFX7-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
102; GFX7-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
103; GFX7-NEXT:    flat_load_dword v2, v[2:3] glc slc
104; GFX7-NEXT:    v_mov_b32_e32 v0, s2
105; GFX7-NEXT:    v_mov_b32_e32 v1, s3
106; GFX7-NEXT:    s_waitcnt vmcnt(0)
107; GFX7-NEXT:    flat_store_dword v[0:1], v2
108; GFX7-NEXT:    s_endpgm
109;
110; GFX10-WGP-LABEL: global_nontemporal_load_1:
111; GFX10-WGP:       ; %bb.0: ; %entry
112; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
113; GFX10-WGP-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
114; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, 0
115; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
116; GFX10-WGP-NEXT:    global_load_dword v0, v0, s[0:1] slc
117; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
118; GFX10-WGP-NEXT:    global_store_dword v1, v0, s[2:3]
119; GFX10-WGP-NEXT:    s_endpgm
120;
121; GFX10-CU-LABEL: global_nontemporal_load_1:
122; GFX10-CU:       ; %bb.0: ; %entry
123; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
124; GFX10-CU-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
125; GFX10-CU-NEXT:    v_mov_b32_e32 v1, 0
126; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
127; GFX10-CU-NEXT:    global_load_dword v0, v0, s[0:1] slc
128; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
129; GFX10-CU-NEXT:    global_store_dword v1, v0, s[2:3]
130; GFX10-CU-NEXT:    s_endpgm
131;
132; SKIP-CACHE-INV-LABEL: global_nontemporal_load_1:
133; SKIP-CACHE-INV:       ; %bb.0: ; %entry
134; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
135; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
136; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
137; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, 0
138; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
139; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
140; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s6
141; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
142; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0
143; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
144; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 glc slc
145; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
146; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
147; SKIP-CACHE-INV-NEXT:    s_endpgm
148    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
149entry:
  ; Index %in by the work-item id so each work-item loads from its own element.
150  %tid = call i32 @llvm.amdgcn.workitem.id.x()
151  %val.gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid
  ; Access under test: the load carries the !nontemporal hint, the store does not.
152  %val = load i32, i32 addrspace(1)* %val.gep, align 4, !nontemporal !0
153  store i32 %val, i32 addrspace(1)* %out
154  ret void
155}
156
; Nontemporal store through an unindexed global pointer: reads an i32 from %in
; with an ordinary load and writes it to %out with !nontemporal !0 attached.
; Expected store modifiers in the assertions below: "glc slc" on the
; gfx600/gfx700 buffer and flat stores, and "slc" alone on the gfx1010 global
; stores (with and without +cumode). The load appears as s_load_dword with no
; modifier in every configuration.
157define amdgpu_kernel void @global_nontemporal_store_0(
158; GFX6-LABEL: global_nontemporal_store_0:
159; GFX6:       ; %bb.0: ; %entry
160; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
161; GFX6-NEXT:    s_mov_b32 s7, 0xf000
162; GFX6-NEXT:    s_mov_b32 s6, -1
163; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
164; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
165; GFX6-NEXT:    s_mov_b32 s4, s2
166; GFX6-NEXT:    s_mov_b32 s5, s3
167; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
168; GFX6-NEXT:    v_mov_b32_e32 v0, s0
169; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0 glc slc
170; GFX6-NEXT:    s_endpgm
171;
172; GFX7-LABEL: global_nontemporal_store_0:
173; GFX7:       ; %bb.0: ; %entry
174; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
175; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
176; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
177; GFX7-NEXT:    v_mov_b32_e32 v0, s2
178; GFX7-NEXT:    v_mov_b32_e32 v1, s3
179; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
180; GFX7-NEXT:    v_mov_b32_e32 v2, s0
181; GFX7-NEXT:    flat_store_dword v[0:1], v2 glc slc
182; GFX7-NEXT:    s_endpgm
183;
184; GFX10-WGP-LABEL: global_nontemporal_store_0:
185; GFX10-WGP:       ; %bb.0: ; %entry
186; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
187; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
188; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
189; GFX10-WGP-NEXT:    s_load_dword s0, s[0:1], 0x0
190; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
191; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
192; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3] slc
193; GFX10-WGP-NEXT:    s_endpgm
194;
195; GFX10-CU-LABEL: global_nontemporal_store_0:
196; GFX10-CU:       ; %bb.0: ; %entry
197; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
198; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
199; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
200; GFX10-CU-NEXT:    s_load_dword s0, s[0:1], 0x0
201; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
202; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
203; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3] slc
204; GFX10-CU-NEXT:    s_endpgm
205;
206; SKIP-CACHE-INV-LABEL: global_nontemporal_store_0:
207; SKIP-CACHE-INV:       ; %bb.0: ; %entry
208; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
209; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
210; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
211; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
212; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
213; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
214; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
215; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
216; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
217; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0 glc slc
218; SKIP-CACHE-INV-NEXT:    s_endpgm
219    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
220entry:
221  %val = load i32, i32 addrspace(1)* %in, align 4
  ; Access under test: the store carries the !nontemporal hint, the load does not.
222  store i32 %val, i32 addrspace(1)* %out, !nontemporal !0
223  ret void
224}
225
; Nontemporal store through a per-work-item address: reads an i32 from %in
; with an ordinary load, indexes %out by the result of
; llvm.amdgcn.workitem.id.x, and stores the value there with !nontemporal !0
; attached.
; Expected store modifiers in the assertions below: "glc slc" on the
; gfx600/gfx700 buffer and flat stores, and "slc" alone on the gfx1010 global
; stores (with and without +cumode). The load appears as s_load_dword with no
; modifier in every configuration.
226define amdgpu_kernel void @global_nontemporal_store_1(
227; GFX6-LABEL: global_nontemporal_store_1:
228; GFX6:       ; %bb.0: ; %entry
229; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
230; GFX6-NEXT:    s_mov_b32 s7, 0xf000
231; GFX6-NEXT:    s_mov_b32 s6, 0
232; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
233; GFX6-NEXT:    v_mov_b32_e32 v1, 0
234; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
235; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
236; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
237; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
238; GFX6-NEXT:    v_mov_b32_e32 v2, s0
239; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 glc slc
240; GFX6-NEXT:    s_endpgm
241;
242; GFX7-LABEL: global_nontemporal_store_1:
243; GFX7:       ; %bb.0: ; %entry
244; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
245; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
246; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
247; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
248; GFX7-NEXT:    v_mov_b32_e32 v1, s3
249; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
250; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
251; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
252; GFX7-NEXT:    v_mov_b32_e32 v2, s0
253; GFX7-NEXT:    flat_store_dword v[0:1], v2 glc slc
254; GFX7-NEXT:    s_endpgm
255;
256; GFX10-WGP-LABEL: global_nontemporal_store_1:
257; GFX10-WGP:       ; %bb.0: ; %entry
258; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
259; GFX10-WGP-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
260; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
261; GFX10-WGP-NEXT:    s_load_dword s0, s[0:1], 0x0
262; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
263; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
264; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3] slc
265; GFX10-WGP-NEXT:    s_endpgm
266;
267; GFX10-CU-LABEL: global_nontemporal_store_1:
268; GFX10-CU:       ; %bb.0: ; %entry
269; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
270; GFX10-CU-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
271; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
272; GFX10-CU-NEXT:    s_load_dword s0, s[0:1], 0x0
273; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
274; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
275; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3] slc
276; GFX10-CU-NEXT:    s_endpgm
277;
278; SKIP-CACHE-INV-LABEL: global_nontemporal_store_1:
279; SKIP-CACHE-INV:       ; %bb.0: ; %entry
280; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
281; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
282; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0
283; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
284; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, 0
285; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
286; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
287; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
288; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
289; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
290; SKIP-CACHE-INV-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 glc slc
291; SKIP-CACHE-INV-NEXT:    s_endpgm
292    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
293entry:
294  %tid = call i32 @llvm.amdgcn.workitem.id.x()
295  %val = load i32, i32 addrspace(1)* %in, align 4
  ; Index %out by the work-item id so each work-item stores to its own element.
296  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
  ; Access under test: the store carries the !nontemporal hint, the load does not.
297  store i32 %val, i32 addrspace(1)* %out.gep, !nontemporal !0
298  ret void
299}
300
; Metadata node referenced as "!nontemporal !0" by the loads and stores under
; test in the kernels of this file.
301!0 = !{i32 1}
; Intrinsic called by the *_1 kernels to obtain the %tid used as the element
; index into %in / %out.
302declare i32 @llvm.amdgcn.workitem.id.x()
303