1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,SI
3; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,VI
4
; Kernel under test, single use of the constant addend.
; Loads one half value from %a and one from %b, multiplies them, adds the
; constant 10.0 and stores the result through %r.
;
; What the autogenerated checks below expect:
;   - tahiti run (SI checks) - both loaded values are converted to f32,
;     combined by v_madak_f32 with the literal 0x41200000 (10.0 encoded as
;     f32), and the result is converted back to f16 before the store.
;   - fiji run (VI checks) - one v_madak_f16 with the literal 0x4900
;     (10.0 encoded as f16), no conversions.
; The check lines are generated by utils/update_llc_test_checks.py (see the
; first line of the file); regenerate them instead of editing by hand.
5define amdgpu_kernel void @madak_f16(
6; SI-LABEL: madak_f16:
7; SI:       ; %bb.0: ; %entry
8; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
9; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
10; SI-NEXT:    s_mov_b32 s3, 0xf000
11; SI-NEXT:    s_mov_b32 s2, -1
12; SI-NEXT:    s_mov_b32 s10, s2
13; SI-NEXT:    s_mov_b32 s11, s3
14; SI-NEXT:    s_waitcnt lgkmcnt(0)
15; SI-NEXT:    s_mov_b32 s12, s6
16; SI-NEXT:    s_mov_b32 s13, s7
17; SI-NEXT:    s_mov_b32 s14, s2
18; SI-NEXT:    s_mov_b32 s15, s3
19; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
20; SI-NEXT:    buffer_load_ushort v1, off, s[12:15], 0
21; SI-NEXT:    s_mov_b32 s0, s4
22; SI-NEXT:    s_mov_b32 s1, s5
23; SI-NEXT:    s_waitcnt vmcnt(1)
24; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
25; SI-NEXT:    s_waitcnt vmcnt(0)
26; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
27; SI-NEXT:    v_madak_f32 v0, v1, v0, 0x41200000
28; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
29; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
30; SI-NEXT:    s_endpgm
31;
32; VI-LABEL: madak_f16:
33; VI:       ; %bb.0: ; %entry
34; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
35; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
36; VI-NEXT:    s_mov_b32 s3, 0xf000
37; VI-NEXT:    s_mov_b32 s2, -1
38; VI-NEXT:    s_mov_b32 s10, s2
39; VI-NEXT:    s_waitcnt lgkmcnt(0)
40; VI-NEXT:    s_mov_b32 s0, s4
41; VI-NEXT:    s_mov_b32 s1, s5
42; VI-NEXT:    s_mov_b32 s4, s6
43; VI-NEXT:    s_mov_b32 s5, s7
44; VI-NEXT:    s_mov_b32 s6, s2
45; VI-NEXT:    s_mov_b32 s7, s3
46; VI-NEXT:    s_mov_b32 s11, s3
47; VI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
48; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
49; VI-NEXT:    s_waitcnt vmcnt(0)
50; VI-NEXT:    v_madak_f16 v0, v0, v1, 0x4900
51; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
52; VI-NEXT:    s_endpgm
; Kernel arguments - %r is the output pointer, %a and %b are the two inputs,
; all pointing to half values in address space 1.
53    half addrspace(1)* %r,
54    half addrspace(1)* %a,
55    half addrspace(1)* %b) #0 {
56entry:
57  %a.val = load half, half addrspace(1)* %a
58  %b.val = load half, half addrspace(1)* %b
59
; Multiply followed by an add of a constant - the pair the checks above
; expect to be selected as one madak instruction carrying the constant as
; its literal operand.
60  %t.val = fmul half %a.val, %b.val
61  %r.val = fadd half %t.val, 10.0
62
63  store half %r.val, half addrspace(1)* %r
64  ret void
65}
66
; Kernel under test, constant addend used twice.
; Loads half values from %a, %b and %c, computes a*b + 10.0 and a*c + 10.0,
; and stores the two results through %r0 and %r1 respectively.
;
; What the autogenerated checks below expect:
;   - the constant is also materialized into v3 by v_mov_b32 (0x41200000 in
;     the tahiti run, 0x4900 in the fiji run);
;   - the first multiply-add is a v_madak with the constant as its literal;
;   - the second multiply-add is a v_mac accumulating into v3.
; In the tahiti run (SI checks) the arithmetic is done in f32 with
; conversions around it; in the fiji run (VI checks) it is done directly in
; f16.
; The check lines are generated by utils/update_llc_test_checks.py (see the
; first line of the file); regenerate them instead of editing by hand.
67define amdgpu_kernel void @madak_f16_use_2(
68; SI-LABEL: madak_f16_use_2:
69; SI:       ; %bb.0: ; %entry
70; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x9
71; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x11
72; SI-NEXT:    s_mov_b32 s3, 0xf000
73; SI-NEXT:    s_mov_b32 s2, -1
74; SI-NEXT:    s_mov_b32 s18, s2
75; SI-NEXT:    s_waitcnt lgkmcnt(0)
76; SI-NEXT:    s_mov_b32 s16, s8
77; SI-NEXT:    s_mov_b32 s17, s9
78; SI-NEXT:    s_mov_b32 s19, s3
79; SI-NEXT:    s_mov_b32 s8, s10
80; SI-NEXT:    s_mov_b32 s9, s11
81; SI-NEXT:    s_mov_b32 s10, s2
82; SI-NEXT:    s_mov_b32 s11, s3
83; SI-NEXT:    s_mov_b32 s14, s2
84; SI-NEXT:    s_mov_b32 s15, s3
85; SI-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
86; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
87; SI-NEXT:    buffer_load_ushort v2, off, s[12:15], 0
88; SI-NEXT:    v_mov_b32_e32 v3, 0x41200000
89; SI-NEXT:    s_mov_b32 s0, s4
90; SI-NEXT:    s_mov_b32 s1, s5
91; SI-NEXT:    s_mov_b32 s8, s6
92; SI-NEXT:    s_mov_b32 s9, s7
93; SI-NEXT:    s_waitcnt vmcnt(2)
94; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
95; SI-NEXT:    s_waitcnt vmcnt(1)
96; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
97; SI-NEXT:    s_waitcnt vmcnt(0)
98; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
99; SI-NEXT:    v_madak_f32 v1, v0, v1, 0x41200000
100; SI-NEXT:    v_mac_f32_e32 v3, v0, v2
101; SI-NEXT:    v_cvt_f16_f32_e32 v0, v1
102; SI-NEXT:    v_cvt_f16_f32_e32 v1, v3
103; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
104; SI-NEXT:    buffer_store_short v1, off, s[8:11], 0
105; SI-NEXT:    s_endpgm
106;
107; VI-LABEL: madak_f16_use_2:
108; VI:       ; %bb.0: ; %entry
109; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
110; VI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x44
111; VI-NEXT:    s_mov_b32 s3, 0xf000
112; VI-NEXT:    s_mov_b32 s2, -1
113; VI-NEXT:    s_mov_b32 s18, s2
114; VI-NEXT:    s_waitcnt lgkmcnt(0)
115; VI-NEXT:    s_mov_b32 s16, s8
116; VI-NEXT:    s_mov_b32 s17, s9
117; VI-NEXT:    s_mov_b32 s19, s3
118; VI-NEXT:    s_mov_b32 s8, s10
119; VI-NEXT:    s_mov_b32 s9, s11
120; VI-NEXT:    s_mov_b32 s10, s2
121; VI-NEXT:    s_mov_b32 s11, s3
122; VI-NEXT:    s_mov_b32 s14, s2
123; VI-NEXT:    s_mov_b32 s15, s3
124; VI-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
125; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
126; VI-NEXT:    buffer_load_ushort v2, off, s[12:15], 0
127; VI-NEXT:    v_mov_b32_e32 v3, 0x4900
128; VI-NEXT:    s_mov_b32 s0, s4
129; VI-NEXT:    s_mov_b32 s1, s5
130; VI-NEXT:    s_mov_b32 s8, s6
131; VI-NEXT:    s_mov_b32 s9, s7
132; VI-NEXT:    s_waitcnt vmcnt(1)
133; VI-NEXT:    v_madak_f16 v1, v0, v1, 0x4900
134; VI-NEXT:    s_waitcnt vmcnt(0)
135; VI-NEXT:    v_mac_f16_e32 v3, v0, v2
136; VI-NEXT:    buffer_store_short v1, off, s[0:3], 0
137; VI-NEXT:    buffer_store_short v3, off, s[8:11], 0
138; VI-NEXT:    s_endpgm
; Kernel arguments - %r0 and %r1 are the two output pointers, %a, %b and %c
; are the three inputs, all pointing to half values in address space 1.
139    half addrspace(1)* %r0,
140    half addrspace(1)* %r1,
141    half addrspace(1)* %a,
142    half addrspace(1)* %b,
143    half addrspace(1)* %c) #0 {
144entry:
; The three input loads are volatile.
; Review note - presumably this keeps them as three separate loads in a fixed
; order so the expected output stays stable; confirm the original intent.
145  %a.val = load volatile half, half addrspace(1)* %a
146  %b.val = load volatile half, half addrspace(1)* %b
147  %c.val = load volatile half, half addrspace(1)* %c
148
; Two multiplies that share %a.val, each followed by an add of the same
; constant 10.0, so the constant has two uses in this kernel.
149  %t0.val = fmul half %a.val, %b.val
150  %t1.val = fmul half %a.val, %c.val
151  %r0.val = fadd half %t0.val, 10.0
152  %r1.val = fadd half %t1.val, 10.0
153
154  store half %r0.val, half addrspace(1)* %r0
155  store half %r1.val, half addrspace(1)* %r1
156  ret void
157}
158
; Attribute group #0, referenced by the kernel definitions in this file.
; It sets the "denormal-fp-math" function attribute to preserve-sign for both
; of its comma-separated fields.
; Review note - presumably flushing denormals is what allows the mad/mac
; forms checked in this file to be selected; confirm against the AMDGPU
; backend before changing this value.
159attributes #0 = { "denormal-fp-math"="preserve-sign,preserve-sign" }
160