; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s

; half args should be promoted to float

; Stores a half argument through an addrspace(1) pointer. The checks expect
; the argument to be fetched with a 32-bit scalar load, converted with
; v_cvt_f16_f32 and written with a 16-bit store.
; GCN-LABEL: {{^}}load_f16_arg:
; GCN: s_load_dword [[ARG:s[0-9]+]]
; GCN: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[ARG]]
; GCN: buffer_store_short [[CVT]]
define void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 {
  store half %arg, half addrspace(1)* %out
  ret void
}

; Stores a <2 x half> argument. The checks expect two 16-bit loads (offsets 44
; and 46) that are packed with a shift-by-16 and an or into one dword store.
; GCN-LABEL: {{^}}load_v2f16_arg:
; GCN-DAG: buffer_load_ushort [[V0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
; GCN-DAG: buffer_load_ushort [[V1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46
; GCN: v_lshlrev_b32_e32 [[HI:v[0-9]+]], 16, [[V1]]
; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[V0]], [[HI]]
; GCN: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN: s_endpgm
define void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
  store <2 x half> %arg, <2 x half> addrspace(1)* %out
  ret void
}

; Stores a <3 x half> argument. The checks expect three 16-bit loads and no
; further buffer loads, then one dword store and one short store (either
; order) with no further buffer stores before the end of the program.
; GCN-LABEL: {{^}}load_v3f16_arg:
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN-NOT: buffer_load
; GCN-DAG: buffer_store_dword
; GCN-DAG: buffer_store_short
; GCN-NOT: buffer_store
; GCN: s_endpgm
define void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 {
  store <3 x half> %arg, <3 x half> addrspace(1)* %out
  ret void
}

; Stores a <4 x half> argument. The checks expect four 16-bit loads followed
; by a single 64-bit store.
; GCN-LABEL: {{^}}load_v4f16_arg:
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_store_dwordx2
; GCN: s_endpgm
define void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 {
  store <4 x half> %arg, <4 x half> addrspace(1)* %out
  ret void
}

; Stores a <8 x half> argument. Only the function label is checked; no
; instruction patterns are pinned for this case.
; GCN-LABEL: {{^}}load_v8f16_arg:
define void @load_v8f16_arg(<8 x half> addrspace(1)* %out, <8 x half> %arg) #0 {
  store <8 x half> %arg, <8 x half> addrspace(1)* %out
  ret void
}

; Extends a <2 x half> argument to <2 x float> and stores it. Only the
; function label is checked.
; GCN-LABEL: {{^}}extload_v2f16_arg:
define void @extload_v2f16_arg(<2 x float> addrspace(1)* %out, <2 x half> %in) #0 {
  %fpext = fpext <2 x half> %in to <2 x float>
  store <2 x float> %fpext, <2 x float> addrspace(1)* %out
  ret void
}

; Extends a scalar half argument to float and stores it. Only the function
; label is checked.
; GCN-LABEL: {{^}}extload_f16_to_f32_arg:
define void @extload_f16_to_f32_arg(float addrspace(1)* %out, half %arg) #0 {
  %ext = fpext half %arg to float
  store float %ext, float addrspace(1)* %out
  ret void
}

; Extends a <2 x half> argument to <2 x float> and stores it. Only the
; function label is checked.
; GCN-LABEL: {{^}}extload_v2f16_to_v2f32_arg:
define void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x half> %arg) #0 {
  %ext = fpext <2 x half> %arg to <2 x float>
  store <2 x float> %ext, <2 x float> addrspace(1)* %out
  ret void
}

; Extends a <3 x half> argument to <3 x float> and stores it. The checks
; expect exactly three 16-bit loads and exactly three half-to-float
; conversions, then a dword store and a dwordx2 store in either order.
; GCN-LABEL: {{^}}extload_v3f16_to_v3f32_arg:
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN-NOT: buffer_load
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN-NOT: v_cvt_f32_f16
; GCN-DAG: buffer_store_dword
; GCN-DAG: buffer_store_dwordx2
; GCN: s_endpgm
define void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 {
  %ext = fpext <3 x half> %arg to <3 x float>
  store <3 x float> %ext, <3 x float> addrspace(1)* %out
  ret void
}

; Extends a <4 x half> argument to <4 x float> and stores it. Only the
; function label is checked.
; GCN-LABEL: {{^}}extload_v4f16_to_v4f32_arg:
define void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x half> %arg) #0 {
  %ext = fpext <4 x half> %arg to <4 x float>
  store <4 x float> %ext, <4 x float> addrspace(1)* %out
  ret void
}

; Extends a <8 x half> argument to <8 x float> and stores it. The checks
; expect eight 16-bit loads, eight half-to-float conversions and two
; 128-bit stores, in that order.
; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg:
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort

; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32

; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
define void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 {
  %ext = fpext <8 x half> %arg to <8 x float>
  store <8 x float> %ext, <8 x float> addrspace(1)* %out
  ret void
}

; Extends a scalar half argument to double and stores it. The checks expect a
; scalar dword load (offset 0xb in the default run, 0x2c in the tonga run)
; fed directly to v_cvt_f64_f32, then a 64-bit store of the result.
; GCN-LABEL: {{^}}extload_f16_to_f64_arg:
; SI: s_load_dword [[ARG:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb{{$}}
; VI: s_load_dword [[ARG:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c{{$}}
; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[ARG]]
; GCN: buffer_store_dwordx2 [[RESULT]]
define void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 {
  %ext = fpext half %arg to double
  store double %ext, double addrspace(1)* %out
  ret void
}

; Extends a <2 x half> argument to <2 x double> and stores it. The checks
; expect, in any order, two 16-bit loads, two half-to-float conversions and
; two float-to-double conversions.
; GCN-LABEL: {{^}}extload_v2f16_to_v2f64_arg:
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN: s_endpgm
define void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 {
  %ext = fpext <2 x half> %arg to <2 x double>
  store <2 x double> %ext, <2 x double> addrspace(1)* %out
  ret void
}

; Extends a <3 x half> argument to <3 x double> and stores it. The checks
; expect, in any order, three 16-bit loads, three half-to-float conversions
; and three float-to-double conversions.
; GCN-LABEL: {{^}}extload_v3f16_to_v3f64_arg:
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN: s_endpgm
define void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 {
  %ext = fpext <3 x half> %arg to <3 x double>
  store <3 x double> %ext, <3 x double> addrspace(1)* %out
  ret void
}

; Extends a <4 x half> argument to <4 x double> and stores it. The checks
; expect, in any order, four 16-bit loads, four half-to-float conversions
; and four float-to-double conversions.
; GCN-LABEL: {{^}}extload_v4f16_to_v4f64_arg:
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN: s_endpgm
define void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 {
  %ext = fpext <4 x half> %arg to <4 x double>
  store <4 x double> %ext, <4 x double> addrspace(1)* %out
  ret void
}

; Extends a <8 x half> argument to <8 x double> and stores it. The checks
; expect, in any order, eight 16-bit loads, eight half-to-float conversions
; and eight float-to-double conversions.
; GCN-LABEL: {{^}}extload_v8f16_to_v8f64_arg:
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v

; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v

; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32

; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32

; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32

; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32

; GCN: s_endpgm
define void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 {
  %ext = fpext <8 x half> %arg to <8 x double>
  store <8 x double> %ext, <8 x double> addrspace(1)* %out
  ret void
}

; Copies one half from %in to %out. The checks expect a 16-bit load whose
; result register is stored back unchanged with a 16-bit store.
; GCN-LABEL: {{^}}global_load_store_f16:
; GCN: buffer_load_ushort [[TMP:v[0-9]+]]
; GCN: buffer_store_short [[TMP]]
define void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %val = load half, half addrspace(1)* %in
  store half %val, half addrspace(1)* %out
  ret void
}

; Copies one <2 x half> from %in to %out. The checks expect a single dword
; load whose result register is stored back unchanged.
; GCN-LABEL: {{^}}global_load_store_v2f16:
; GCN: buffer_load_dword [[TMP:v[0-9]+]]
; GCN: buffer_store_dword [[TMP]]
define void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
  %val = load <2 x half>, <2 x half> addrspace(1)* %in
  store <2 x half> %val, <2 x half> addrspace(1)* %out
  ret void
}

; Copies one <4 x half> from %in to %out. The checks expect a single 64-bit
; load whose result register pair is stored back unchanged.
; Note that this function declares %in before %out, unlike its siblings.
; GCN-LABEL: {{^}}global_load_store_v4f16:
; GCN: buffer_load_dwordx2 [[TMP:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx2 [[TMP]]
define void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> addrspace(1)* %out) #0 {
  %val = load <4 x half>, <4 x half> addrspace(1)* %in
  store <4 x half> %val, <4 x half> addrspace(1)* %out
  ret void
}

; Copies one <8 x half> from %in to %out. The checks expect a single 128-bit
; load whose result register range is stored back unchanged; the store check
; reuses the captured TMP so it is tied to the loaded registers.
; GCN-LABEL: {{^}}global_load_store_v8f16:
; GCN: buffer_load_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[TMP]]
; GCN: s_endpgm
define void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
  %val = load <8 x half>, <8 x half> addrspace(1)* %in
  store <8 x half> %val, <8 x half> addrspace(1)* %out
  ret void
}

; Loads a half, extends it to float and stores it. The checks expect a 16-bit
; load, a v_cvt_f32_f16 of the loaded register and a dword store of the
; converted value.
; GCN-LABEL: {{^}}global_extload_f16_to_f32:
; GCN: buffer_load_ushort [[LOAD:v[0-9]+]]
; GCN: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[LOAD]]
; GCN: buffer_store_dword [[CVT]]
define void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %val = load half, half addrspace(1)* %in
  %cvt = fpext half %val to float
  store float %cvt, float addrspace(1)* %out
  ret void
}

; Loads a <2 x half>, extends it to <2 x float> and stores it. The checks
; expect one dword load, a conversion of the low half, a right shift by 16 to
; isolate the high half, its conversion, and a 64-bit store of both results.
; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f32:
; GCN: buffer_load_dword [[LOAD:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
; GCN: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
; GCN: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]]
; GCN: buffer_store_dwordx2 v{{\[}}[[CVT0]]:[[CVT1]]{{\]}}
; GCN: s_endpgm
define void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
  %val = load <2 x half>, <2 x half> addrspace(1)* %in
  %cvt = fpext <2 x half> %val to <2 x float>
  store <2 x float> %cvt, <2 x float> addrspace(1)* %out
  ret void
}

; Loads a <3 x half>, extends it to <3 x float> and stores it. Only the
; function label is checked.
; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f32:
define void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
  %val = load <3 x half>, <3 x half> addrspace(1)* %in
  %cvt = fpext <3 x half> %val to <3 x float>
  store <3 x float> %cvt, <3 x float> addrspace(1)* %out
  ret void
}

; Loads a <4 x half>, extends it to <4 x float> and stores it. Only the
; function label is checked.
; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f32:
define void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
  %val = load <4 x half>, <4 x half> addrspace(1)* %in
  %cvt = fpext <4 x half> %val to <4 x float>
  store <4 x float> %cvt, <4 x float> addrspace(1)* %out
  ret void
}

; Loads a <8 x half>, extends it to <8 x float> and stores it. Only the
; function label is checked.
; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f32:
define void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
  %val = load <8 x half>, <8 x half> addrspace(1)* %in
  %cvt = fpext <8 x half> %val to <8 x float>
  store <8 x float> %cvt, <8 x float> addrspace(1)* %out
  ret void
}

; Loads a <16 x half>, extends it to <16 x float> and stores it. The checks
; expect two 128-bit loads, sixteen half-to-float conversions and four
; 128-bit stores, in that order.
; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f32:
; GCN: buffer_load_dwordx4
; GCN: buffer_load_dwordx4

; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32

; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4

; GCN: s_endpgm
define void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
  %val = load <16 x half>, <16 x half> addrspace(1)* %in
  %cvt = fpext <16 x half> %val to <16 x float>
  store <16 x float> %cvt, <16 x float> addrspace(1)* %out
  ret void
}

; Loads a half, extends it to double and stores it. The checks expect the
; extension to go through float: a 16-bit load, v_cvt_f32_f16, then
; v_cvt_f64_f32, and a 64-bit store of the final value.
; GCN-LABEL: {{^}}global_extload_f16_to_f64:
; GCN: buffer_load_ushort [[LOAD:v[0-9]+]]
; GCN: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[LOAD]]
; GCN: v_cvt_f64_f32_e32 [[CVT1:v\[[0-9]+:[0-9]+\]]], [[CVT0]]
; GCN: buffer_store_dwordx2 [[CVT1]]
define void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %val = load half, half addrspace(1)* %in
  %cvt = fpext half %val to double
  store double %cvt, double addrspace(1)* %out
  ret void
}

; Loads a <2 x half>, extends it to <2 x double> and stores it. The checks
; expect one dword load, a shift by 16 for the high half, two half-to-float
; and two float-to-double conversions, and one 128-bit store spanning from
; the first result's low register to the second result's high register.
; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f64:
; GCN-DAG: buffer_load_dword [[LOAD:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]]
; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT2_LO:[0-9]+]]:[[CVT2_HI:[0-9]+]]{{\]}}, v[[CVT0]]
; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT3_LO:[0-9]+]]:[[CVT3_HI:[0-9]+]]{{\]}}, v[[CVT1]]
; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[CVT2_LO]]:[[CVT3_HI]]{{\]}}
; GCN: s_endpgm
define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
  %val = load <2 x half>, <2 x half> addrspace(1)* %in
  %cvt = fpext <2 x half> %val to <2 x double>
  store <2 x double> %cvt, <2 x double> addrspace(1)* %out
  ret void
}

; Loads a <3 x half>, extends it to <3 x double> and stores it. The checks
; expect one 64-bit load, three half-to-float conversions (plus a shift by
; 16), exactly three float-to-double conversions, and a 128-bit store at
; offset 0 together with a 64-bit store at offset 16.
; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f64:

; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}}
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32

; GCN: v_cvt_f64_f32_e32
; GCN: v_cvt_f64_f32_e32
; GCN: v_cvt_f64_f32_e32
; GCN-NOT: v_cvt_f64_f32_e32

; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16
; GCN: s_endpgm
define void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
  %val = load <3 x half>, <3 x half> addrspace(1)* %in
  %cvt = fpext <3 x half> %val to <3 x double>
  store <3 x double> %cvt, <3 x double> addrspace(1)* %out
  ret void
}

; Loads a <4 x half>, extends it to <4 x double> and stores it. Only the
; function label is checked.
; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f64:
define void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
  %val = load <4 x half>, <4 x half> addrspace(1)* %in
  %cvt = fpext <4 x half> %val to <4 x double>
  store <4 x double> %cvt, <4 x double> addrspace(1)* %out
  ret void
}

; Loads a <8 x half>, extends it to <8 x double> and stores it. Only the
; function label is checked.
; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f64:
define void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
  %val = load <8 x half>, <8 x half> addrspace(1)* %in
  %cvt = fpext <8 x half> %val to <8 x double>
  store <8 x double> %cvt, <8 x double> addrspace(1)* %out
  ret void
}

; Loads a <16 x half>, extends it to <16 x double> and stores it. Only the
; function label is checked.
; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f64:
define void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
  %val = load <16 x half>, <16 x half> addrspace(1)* %in
  %cvt = fpext <16 x half> %val to <16 x double>
  store <16 x double> %cvt, <16 x double> addrspace(1)* %out
  ret void
}

; Loads a float, truncates it to half and stores it. The checks expect a
; dword load, a v_cvt_f16_f32 of the loaded register and a 16-bit store of
; the converted value.
; GCN-LABEL: {{^}}global_truncstore_f32_to_f16:
; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
; GCN: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[LOAD]]
; GCN: buffer_store_short [[CVT]]
define void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %val = load float, float addrspace(1)* %in
  %cvt = fptrunc float %val to half
  store half %cvt, half addrspace(1)* %out
  ret void
}

; Loads a <2 x float>, truncates it to <2 x half> and stores it. The checks
; expect a 64-bit load, one float-to-half conversion per element, the high
; result shifted left by 16 and or-ed with the low one, and a dword store of
; the packed value.
; GCN-LABEL: {{^}}global_truncstore_v2f32_to_v2f16:
; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
; GCN-DAG: v_cvt_f16_f32_e32 [[CVT0:v[0-9]+]], v[[LO]]
; GCN-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]]
; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[CVT1]]
; GCN-DAG: v_or_b32_e32 [[PACKED:v[0-9]+]], [[CVT0]], [[SHL]]
; GCN-DAG: buffer_store_dword [[PACKED]]
; GCN: s_endpgm
define void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
  %val = load <2 x float>, <2 x float> addrspace(1)* %in
  %cvt = fptrunc <2 x float> %val to <2 x half>
  store <2 x half> %cvt, <2 x half> addrspace(1)* %out
  ret void
}

; Loads a <3 x float>, truncates it to <3 x half> and stores it. The checks
; expect a 128-bit load, exactly three float-to-half conversions, then a
; 16-bit store followed by a dword store.
; GCN-LABEL: {{^}}global_truncstore_v3f32_to_v3f16:
; GCN: buffer_load_dwordx4
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN-NOT: v_cvt_f16_f32_e32
; GCN: buffer_store_short
; GCN: buffer_store_dword
; GCN: s_endpgm
define void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
  %val = load <3 x float>, <3 x float> addrspace(1)* %in
  %cvt = fptrunc <3 x float> %val to <3 x half>
  store <3 x half> %cvt, <3 x half> addrspace(1)* %out
  ret void
}

; Loads a <4 x float>, truncates it to <4 x half> and stores it. The checks
; expect a 128-bit load, four float-to-half conversions and a 64-bit store.
; GCN-LABEL: {{^}}global_truncstore_v4f32_to_v4f16:
; GCN: buffer_load_dwordx4
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: buffer_store_dwordx2
; GCN: s_endpgm
define void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
  %val = load <4 x float>, <4 x float> addrspace(1)* %in
  %cvt = fptrunc <4 x float> %val to <4 x half>
  store <4 x half> %cvt, <4 x half> addrspace(1)* %out
  ret void
}

; Loads a <8 x float>, truncates it to <8 x half> and stores it. The checks
; expect two 128-bit loads, eight float-to-half conversions and one 128-bit
; store.
; GCN-LABEL: {{^}}global_truncstore_v8f32_to_v8f16:
; GCN: buffer_load_dwordx4
; GCN: buffer_load_dwordx4
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: buffer_store_dwordx4
; GCN: s_endpgm
define void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 {
  %val = load <8 x float>, <8 x float> addrspace(1)* %in
  %cvt = fptrunc <8 x float> %val to <8 x half>
  store <8 x half> %cvt, <8 x half> addrspace(1)* %out
  ret void
}

; Loads a <16 x float>, truncates it to <16 x half> and stores it. The checks
; expect four 128-bit loads, then (in any order) sixteen float-to-half
; conversions and two 128-bit stores.
; GCN-LABEL: {{^}}global_truncstore_v16f32_to_v16f16:
; GCN: buffer_load_dwordx4
; GCN: buffer_load_dwordx4
; GCN: buffer_load_dwordx4
; GCN: buffer_load_dwordx4
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: buffer_store_dwordx4
; GCN-DAG: buffer_store_dwordx4
; GCN: s_endpgm
define void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 {
  %val = load <16 x float>, <16 x float> addrspace(1)* %in
  %cvt = fptrunc <16 x float> %val to <16 x half>
  store <16 x half> %cvt, <16 x half> addrspace(1)* %out
  ret void
}

; Adds two half arguments and stores the sum. Instruction patterns are pinned
; only for the default-target run, which expects half-to-float conversions
; followed by a float add; both runs must reach s_endpgm.
; FIXME: Unsafe math should fold conversions away
; GCN-LABEL: {{^}}fadd_f16:
; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
; SI: v_add_f32
; GCN: s_endpgm
define void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 {
  %add = fadd half %a, %b
  store half %add, half addrspace(1)* %out, align 4
  ret void
}

; Adds two <2 x half> arguments and stores the sum. The default-target run
; expects two float adds; both runs must reach s_endpgm.
; GCN-LABEL: {{^}}fadd_v2f16:
; SI: v_add_f32
; SI: v_add_f32
; GCN: s_endpgm
define void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> %b) #0 {
  %add = fadd <2 x half> %a, %b
  store <2 x half> %add, <2 x half> addrspace(1)* %out, align 8
  ret void
}

; Loads two consecutive <4 x half> values from %in (elements 0 and 1), adds
; them and stores the sum. The default-target run expects four float adds;
; both runs must reach s_endpgm.
; GCN-LABEL: {{^}}fadd_v4f16:
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; GCN: s_endpgm
define void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
  %b_ptr = getelementptr <4 x half>, <4 x half> addrspace(1)* %in, i32 1
  %a = load <4 x half>, <4 x half> addrspace(1)* %in, align 16
  %b = load <4 x half>, <4 x half> addrspace(1)* %b_ptr, align 16
  %result = fadd <4 x half> %a, %b
  store <4 x half> %result, <4 x half> addrspace(1)* %out, align 16
  ret void
}

; Adds two <8 x half> arguments and stores the sum. The default-target run
; expects eight float adds; both runs must reach s_endpgm.
; GCN-LABEL: {{^}}fadd_v8f16:
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; GCN: s_endpgm
define void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> %b) #0 {
  %add = fadd <8 x half> %a, %b
  store <8 x half> %add, <8 x half> addrspace(1)* %out, align 32
  ret void
}

; Loads two consecutive halves from %in (elements 0 and 1), subtracts the
; second from the first and stores the result. Both runs expect a
; v_subrev_f32, i.e. the subtraction is carried out in float.
; GCN-LABEL: {{^}}fsub_f16:
; GCN: v_subrev_f32_e32
; GCN: s_endpgm
define void @fsub_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %b_ptr = getelementptr half, half addrspace(1)* %in, i32 1
  %a = load half, half addrspace(1)* %in
  %b = load half, half addrspace(1)* %b_ptr
  %sub = fsub half %a, %b
  store half %sub, half addrspace(1)* %out
  ret void
}

; Loads a half, bitcasts it to i16 and stores it. The checks expect the
; bitcast to be free: a 16-bit load whose register is stored back unchanged.
; GCN-LABEL: {{^}}test_bitcast_from_half:
; GCN: buffer_load_ushort [[TMP:v[0-9]+]]
; GCN: buffer_store_short [[TMP]]
define void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) #0 {
  %val = load half, half addrspace(1)* %in
  %val_int = bitcast half %val to i16
  store i16 %val_int, i16 addrspace(1)* %out
  ret void
}

; Loads an i16, bitcasts it to half and stores it. The checks expect the
; bitcast to be free: a 16-bit load whose register is stored back unchanged.
; GCN-LABEL: {{^}}test_bitcast_to_half:
; GCN: buffer_load_ushort [[TMP:v[0-9]+]]
; GCN: buffer_store_short [[TMP]]
define void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
  %val = load i16, i16 addrspace(1)* %in
  %val_fp = bitcast i16 %val to half
  store half %val_fp, half addrspace(1)* %out
  ret void
}

; Attribute group #0, referenced by the function definitions above.
attributes #0 = { nounwind }
