; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s

; Kernel under test: a !nontemporal i32 load from the private (addrspace 5)
; pointer %in, whose result is stored (without !nontemporal) to the global
; (addrspace 1) pointer %out.
; The assertions below expect the scratch buffer_load_dword to carry "glc slc"
; in the GFX6, GFX7 and SKIP-CACHE-INV runs and only "slc" in both GFX10 runs.
; They are autogenerated; regenerate them with the script named at the top of
; the file instead of editing them by hand.
define amdgpu_kernel void @private_nontemporal_load_0(
; GFX6-LABEL: private_nontemporal_load_0:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX6-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; GFX6-NEXT:    s_mov_b32 s10, -1
; GFX6-NEXT:    s_mov_b32 s11, 0xe8f000
; GFX6-NEXT:    s_add_u32 s8, s8, s3
; GFX6-NEXT:    s_addc_u32 s9, s9, 0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s4
; GFX6-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: private_nontemporal_load_0:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_mov_b64 s[10:11], s[2:3]
; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
; GFX7-NEXT:    s_add_u32 s8, s8, s7
; GFX7-NEXT:    s_addc_u32 s9, s9, 0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s2
; GFX7-NEXT:    buffer_load_dword v2, v0, s[8:11], 0 offen glc slc
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: private_nontemporal_load_0:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], s[2:3]
; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[0:1]
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_add_u32 s8, s8, s7
; GFX10-WGP-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen slc
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: private_nontemporal_load_0:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_mov_b64 s[10:11], s[2:3]
; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[0:1]
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-CU-NEXT:    s_add_u32 s8, s8, s7
; GFX10-CU-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen slc
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
; GFX10-CU-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: private_nontemporal_load_0:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[8:9]
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s0
; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[8:11], s[8:9], 0x0
; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s8, s8, s3
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s9, s9, 0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT:    s_endpgm
    i32 addrspace(5)* %in, i32 addrspace(1)* %out) {
entry:
  %val = load i32, i32 addrspace(5)* %in, align 4, !nontemporal !0
  store i32 %val, i32 addrspace(1)* %out
  ret void
}

; Kernel under test: like private_nontemporal_load_0, but the private
; (addrspace 5) pointer %in is first indexed by the workitem id in x, so the
; !nontemporal i32 load uses a per-workitem address. The loaded value is stored
; (without !nontemporal) to the global (addrspace 1) pointer %out.
; The assertions below expect the scratch buffer_load_dword to carry "glc slc"
; in the GFX6, GFX7 and SKIP-CACHE-INV runs and only "slc" in both GFX10 runs.
; They are autogenerated; regenerate them with the script named at the top of
; the file instead of editing them by hand.
define amdgpu_kernel void @private_nontemporal_load_1(
; GFX6-LABEL: private_nontemporal_load_1:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX6-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; GFX6-NEXT:    s_mov_b32 s10, -1
; GFX6-NEXT:    s_mov_b32 s11, 0xe8f000
; GFX6-NEXT:    s_add_u32 s8, s8, s3
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT:    s_addc_u32 s9, s9, 0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: private_nontemporal_load_1:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_mov_b64 s[10:11], s[2:3]
; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
; GFX7-NEXT:    s_add_u32 s8, s8, s7
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    s_addc_u32 s9, s9, 0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
; GFX7-NEXT:    buffer_load_dword v2, v0, s[8:11], 0 offen glc slc
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: private_nontemporal_load_1:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], s[2:3]
; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[0:1]
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_add_u32 s8, s8, s7
; GFX10-WGP-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen slc
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: private_nontemporal_load_1:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_mov_b64 s[10:11], s[2:3]
; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[0:1]
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-CU-NEXT:    s_add_u32 s8, s8, s7
; GFX10-CU-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen slc
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
; GFX10-CU-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: private_nontemporal_load_1:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[8:9]
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s0
; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[8:11], s[8:9], 0x0
; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s8, s8, s3
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s9, s9, 0
; SKIP-CACHE-INV-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT:    s_endpgm
    i32 addrspace(5)* %in, i32 addrspace(1)* %out) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %val.gep = getelementptr inbounds i32, i32 addrspace(5)* %in, i32 %tid
  %val = load i32, i32 addrspace(5)* %val.gep, align 4, !nontemporal !0
  store i32 %val, i32 addrspace(1)* %out
  ret void
}

; Kernel under test: an ordinary i32 load from the global (addrspace 1)
; pointer %in followed by a !nontemporal store of that value to the private
; (addrspace 5) pointer %out.
; The assertions below expect the scratch buffer_store_dword to carry "glc slc"
; in the GFX6, GFX7 and SKIP-CACHE-INV runs and only "slc" in both GFX10 runs.
; They are autogenerated; regenerate them with the script named at the top of
; the file instead of editing them by hand.
define amdgpu_kernel void @private_nontemporal_store_0(
; GFX6-LABEL: private_nontemporal_store_0:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_mov_b32 s4, SCRATCH_RSRC_DWORD0
; GFX6-NEXT:    s_mov_b32 s5, SCRATCH_RSRC_DWORD1
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_mov_b32 s7, 0xe8f000
; GFX6-NEXT:    s_add_u32 s4, s4, s3
; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
; GFX6-NEXT:    s_addc_u32 s5, s5, 0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_load_dword s1, s[2:3], 0x0
; GFX6-NEXT:    v_mov_b32_e32 v1, s0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s1
; GFX6-NEXT:    buffer_store_dword v0, v1, s[4:7], 0 offen glc slc
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: private_nontemporal_store_0:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_mov_b64 s[10:11], s[2:3]
; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
; GFX7-NEXT:    s_add_u32 s8, s8, s7
; GFX7-NEXT:    s_addc_u32 s9, s9, 0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX7-NEXT:    v_mov_b32_e32 v1, s2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: private_nontemporal_store_0:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], s[2:3]
; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[0:1]
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX10-WGP-NEXT:    s_add_u32 s8, s8, s7
; GFX10-WGP-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[8:11], 0 offen slc
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: private_nontemporal_store_0:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_mov_b64 s[10:11], s[2:3]
; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[0:1]
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX10-CU-NEXT:    s_add_u32 s8, s8, s7
; GFX10-CU-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[8:11], 0 offen slc
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: private_nontemporal_store_0:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[4:5]
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s4, s3
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s5, 0
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[4:7], 0 offen glc slc
; SKIP-CACHE-INV-NEXT:    s_endpgm
    i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
entry:
  %val = load i32, i32 addrspace(1)* %in, align 4
  store i32 %val, i32 addrspace(5)* %out, !nontemporal !0
  ret void
}

; Kernel under test: like private_nontemporal_store_0, but the private
; (addrspace 5) pointer %out is first indexed by the workitem id in x, so the
; !nontemporal i32 store uses a per-workitem address. The stored value comes
; from an ordinary load of the global (addrspace 1) pointer %in.
; The assertions below expect the scratch buffer_store_dword to carry "glc slc"
; in the GFX6, GFX7 and SKIP-CACHE-INV runs and only "slc" in both GFX10 runs.
; They are autogenerated; regenerate them with the script named at the top of
; the file instead of editing them by hand.
define amdgpu_kernel void @private_nontemporal_store_1(
; GFX6-LABEL: private_nontemporal_store_1:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_mov_b32 s4, SCRATCH_RSRC_DWORD0
; GFX6-NEXT:    s_mov_b32 s5, SCRATCH_RSRC_DWORD1
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_mov_b32 s7, 0xe8f000
; GFX6-NEXT:    s_add_u32 s4, s4, s3
; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT:    s_addc_u32 s5, s5, 0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_load_dword s1, s[2:3], 0x0
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    buffer_store_dword v1, v0, s[4:7], 0 offen glc slc
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: private_nontemporal_store_1:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_mov_b64 s[10:11], s[2:3]
; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
; GFX7-NEXT:    s_add_u32 s8, s8, s7
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    s_addc_u32 s9, s9, 0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v1, s0
; GFX7-NEXT:    buffer_store_dword v1, v0, s[8:11], 0 offen glc slc
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: private_nontemporal_store_1:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], s[2:3]
; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[0:1]
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX10-WGP-NEXT:    s_add_u32 s8, s8, s7
; GFX10-WGP-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-WGP-NEXT:    buffer_store_dword v1, v0, s[8:11], 0 offen slc
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: private_nontemporal_store_1:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_mov_b64 s[10:11], s[2:3]
; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[0:1]
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX10-CU-NEXT:    s_add_u32 s8, s8, s7
; GFX10-CU-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-CU-NEXT:    buffer_store_dword v1, v0, s[8:11], 0 offen slc
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: private_nontemporal_store_1:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[4:5]
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s4, s3
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s5, 0
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
; SKIP-CACHE-INV-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    buffer_store_dword v1, v0, s[4:7], 0 offen glc slc
; SKIP-CACHE-INV-NEXT:    s_endpgm
    i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %val = load i32, i32 addrspace(1)* %in, align 4
  %out.gep = getelementptr inbounds i32, i32 addrspace(5)* %out, i32 %tid
  store i32 %val, i32 addrspace(5)* %out.gep, !nontemporal !0
  ret void
}

; Metadata node referenced by every !nontemporal attachment in this file.
!0 = !{i32 1}
; Intrinsic used by the *_1 kernels to obtain the workitem id in x, which they
; use as the index into the private pointer.
declare i32 @llvm.amdgcn.workitem.id.x()