; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,SI
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX89,VI
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX89,GFX9

; Pack two scalar (SGPR) float kernel arguments into <2 x half> via cvt_pkrtz
; and store the packed dword to global memory.
define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float %x, float %y) #0 {
; SI-LABEL: s_cvt_pkrtz_v2f16_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s3
; SI-NEXT:    v_cvt_pkrtz_f16_f32_e32 v0, s2, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_cvt_pkrtz_v2f16_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    v_cvt_pkrtz_f16_f32 v2, s0, v0
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: s_cvt_pkrtz_v2f16_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v1, s0, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-NEXT:    s_endpgm
  %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y)
  store <2 x half> %result, <2 x half> addrspace(1)* %out
  ret void
}

; Same SGPR feeds both cvt_pkrtz operands; no extra VGPR copy should be needed.
define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(<2 x half> addrspace(1)* %out, float %x) #0 {
; SI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_pkrtz_f16_f32_e64 v0, s2, s2
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_cvt_pkrtz_f16_f32 v2, s0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: s_cvt_pkrtz_samereg_v2f16_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x2c
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v1, s0, s0
; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-NEXT:    s_endpgm
  %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %x)
  store <2 x half> %result, <2 x half> addrspace(1)* %out
  ret void
}

; With both operands undef the whole computation (including the store) folds
; away; only s_endpgm remains on every target.
define amdgpu_kernel void @s_cvt_pkrtz_undef_undef(<2 x half> addrspace(1)* %out) #0 {
; GCN-LABEL: s_cvt_pkrtz_undef_undef:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_endpgm
  %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float undef)
  store <2 x half> %result, <2 x half> addrspace(1)* %out
  ret void
}

; Per-lane (VGPR) operands: both inputs are loaded from global memory indexed
; by workitem id, packed, and stored back per lane.
define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
; SI-LABEL: v_cvt_pkrtz_v2f16_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[0:1], s[10:11]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b64 s[10:11], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_pkrtz_f16_f32_e32 v2, v2, v3
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cvt_pkrtz_v2f16_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    flat_load_dword v1, v[2:3]
; VI-NEXT:    v_mov_b32_e32 v5, s5
; VI-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_pkrtz_f16_f32 v0, v0, v1
; VI-NEXT:    flat_store_dword v[4:5], v0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
; GFX9-NEXT:    global_load_dword v2, v0, s[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v1, v1, v2
; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX9-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %b)
  store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
  ret void
}

; Register lo operand with inline-constant 1.0 as the hi operand.
define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
; SI-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_pkrtz_f16_f32_e64 v2, v2, 1.0
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_pkrtz_f16_f32 v0, v0, 1.0
; VI-NEXT:    flat_store_dword v[2:3], v0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v1, v1, 1.0
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float 1.0)
  store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
  ret void
}

; Inline-constant 1.0 as the lo operand with a register hi operand.
define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
; SI-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_pkrtz_f16_f32_e32 v2, 1.0, v2
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_pkrtz_f16_f32 v0, 1.0, v0
; VI-NEXT:    flat_store_dword v[2:3], v0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v1, 1.0, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 1.0, float %a)
  store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
  ret void
}

; fneg on the lo operand is folded into the instruction's source modifier (-v).
define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[0:1], s[10:11]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b64 s[10:11], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_pkrtz_f16_f32_e64 v2, -v2, v3
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    flat_load_dword v1, v[2:3]
; VI-NEXT:    v_mov_b32_e32 v5, s5
; VI-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_pkrtz_f16_f32 v0, -v0, v1
; VI-NEXT:    flat_store_dword v[4:5], v0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
; GFX9-NEXT:    global_load_dword v2, v0, s[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v1, -v1, v2
; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX9-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %neg.a = fsub float -0.0, %a
  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.a, float %b)
  store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
  ret void
}

; fneg on the hi operand is folded into the instruction's source modifier (-v).
define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[0:1], s[10:11]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b64 s[10:11], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_pkrtz_f16_f32_e64 v2, v2, -v3
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    flat_load_dword v1, v[2:3]
; VI-NEXT:    v_mov_b32_e32 v5, s5
; VI-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_pkrtz_f16_f32 v0, v0, -v1
; VI-NEXT:    flat_store_dword v[4:5], v0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
; GFX9-NEXT:    global_load_dword v2, v0, s[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v1, v1, -v2
; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX9-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %neg.b = fsub float -0.0, %b
  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %neg.b)
  store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
  ret void
}

; fneg on both operands is folded into source modifiers on both sources.
define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[0:1], s[10:11]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b64 s[10:11], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_pkrtz_f16_f32_e64 v2, -v2, -v3
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    flat_load_dword v1, v[2:3]
; VI-NEXT:    v_mov_b32_e32 v5, s5
; VI-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_pkrtz_f16_f32 v0, -v0, -v1
; VI-NEXT:    flat_store_dword v[4:5], v0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
; GFX9-NEXT:    global_load_dword v2, v0, s[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v1, -v1, -v2
; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX9-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %neg.a = fsub float -0.0, %a
  %neg.b = fsub float -0.0, %b
  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.a, float %neg.b)
  store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
  ret void
}

; fneg(fabs(a)) on lo and fneg(b) on hi fold into -| | and - source modifiers.
define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[0:1], s[10:11]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b64 s[10:11], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_pkrtz_f16_f32_e64 v2, -|v2|, -v3
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    flat_load_dword v1, v[2:3]
; VI-NEXT:    v_mov_b32_e32 v5, s5
; VI-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_pkrtz_f16_f32 v0, -|v0|, -v1
; VI-NEXT:    flat_store_dword v[4:5], v0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
; GFX9-NEXT:    global_load_dword v2, v0, s[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v1, -|v1|, -v2
; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX9-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fabs.a = call float @llvm.fabs.f32(float %a)
  %neg.fabs.a = fsub float -0.0, %fabs.a
  %neg.b = fsub float -0.0, %b
  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.fabs.a, float %neg.b)
  store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
  ret void
}

; Intrinsic declarations and function attributes used by the tests above.
declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
declare float @llvm.fabs.f32(float) #1
declare i32 @llvm.amdgcn.workitem.id.x() #1


attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
