1; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 %s
2
3; GCN-LABEL: {{^}}fma_vector_vector_scalar_lo:
4; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
5; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
6; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
7
8; GCN-NOT: pack
9; GCN-NOT: and
10; GCN-NOT: shl
11; GCN-NOT: or
12
13; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0]{{$}}
; Kernel under test: computes fma(vec0, vec1, <scalar0, scalar0>) and stores
; the result to %out. vec0 and vec1 are the first two <2 x half> values at
; %lds; scalar0 is a single half loaded through %arg2. All loads are volatile.
14define amdgpu_kernel void @fma_vector_vector_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
15bb:
16  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
17
18  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
19  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
20  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
21
  ; Splat scalar0: insert it into lane 0, then shuffle with an all-zero mask so
  ; both result lanes read lane 0.
22  %scalar0.vec = insertelement <2 x half> undef, half %scalar0, i32 0
23  %scalar0.broadcast = shufflevector <2 x half> %scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer
24
25  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %scalar0.broadcast)
26  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
27  ret void
28}
29
30; Apply fneg to broadcasted vector
31; GCN-LABEL: {{^}}fma_vector_vector_neg_broadcast_scalar_lo:
32; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
33; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
34; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
35
36; GCN-NOT: pack
37; GCN-NOT: and
38; GCN-NOT: shl
39; GCN-NOT: or
40
41; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
; Kernel under test: computes fma(vec0, vec1, -<scalar0, scalar0>). The scalar
; is splatted first and the negation (fsub from -0.0 in each lane) is applied
; to the resulting vector.
42define amdgpu_kernel void @fma_vector_vector_neg_broadcast_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
43bb:
44  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
45
46  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
47  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
48  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
49
  ; Splat scalar0 into both lanes, then negate the whole vector.
50  %scalar0.vec = insertelement <2 x half> undef, half %scalar0, i32 0
51  %scalar0.broadcast = shufflevector <2 x half> %scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer
52  %neg.scalar0.broadcast = fsub <2 x half> <half -0.0, half -0.0>, %scalar0.broadcast
53
54  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.scalar0.broadcast)
55  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
56  ret void
57}
58
59; Apply fneg before broadcast
60; GCN-LABEL: {{^}}fma_vector_vector_neg_scalar_lo:
61; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
62; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
63; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
64
65; GCN-NOT: pack
66; GCN-NOT: and
67; GCN-NOT: shl
68; GCN-NOT: or
69
70; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
; Kernel under test: computes fma(vec0, vec1, <-scalar0, -scalar0>). Here the
; scalar is negated first (fsub from -0.0) and the negated value is splatted.
71define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
72bb:
73  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
74
75  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
76  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
77  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
78
  ; Negate the scalar, then splat it via insert into lane 0 + all-zero shuffle mask.
79  %neg.scalar0 = fsub half -0.0, %scalar0
80  %neg.scalar0.vec = insertelement <2 x half> undef, half %neg.scalar0, i32 0
81  %neg.scalar0.broadcast = shufflevector <2 x half> %neg.scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer
82
83  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.scalar0.broadcast)
84  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
85  ret void
86}
87
88; Apply fneg both before and after the broadcast; the two negations should cancel out.
89; GCN-LABEL: {{^}}fma_vector_vector_neg_broadcast_neg_scalar_lo:
90; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
91; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
92; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
93
94; GCN-NOT: pack
95; GCN-NOT: and
96; GCN-NOT: shl
97; GCN-NOT: or
98
99; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0]{{$}}
; Kernel under test: the scalar is negated, splatted, and the splat is negated
; again, so the fma addend is -(<-scalar0, -scalar0>). The checks above expect
; a v_pk_fma_f16 without any neg_lo/neg_hi modifier.
100define amdgpu_kernel void @fma_vector_vector_neg_broadcast_neg_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
101bb:
102  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
103
104  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
105  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
106  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
107
  ; First negation: on the scalar, before the splat.
108  %neg.scalar0 = fsub half -0.0, %scalar0
109  %neg.scalar0.vec = insertelement <2 x half> undef, half %neg.scalar0, i32 0
110  %neg.scalar0.broadcast = shufflevector <2 x half> %neg.scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer
  ; Second negation: on the splatted vector.
111  %neg.neg.scalar0.broadcast = fsub <2 x half> <half -0.0, half -0.0>, %neg.scalar0.broadcast
112
113  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.neg.scalar0.broadcast)
114  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
115  ret void
116}
117
118; Add scalar, but negate low component
119; GCN-LABEL: {{^}}fma_vector_vector_scalar_neg_lo:
120; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
121; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
122; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
123
124; GCN-NOT: pack
125; GCN-NOT: and
126; GCN-NOT: shl
127; GCN-NOT: or
128
129; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_lo:[0,0,1]{{$}}
; Kernel under test: computes fma(vec0, vec1, <-scalar0, scalar0>), i.e. the
; same loaded scalar feeds both lanes of the addend but only lane 0 is negated.
130define amdgpu_kernel void @fma_vector_vector_scalar_neg_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
131bb:
132  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
133
134  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
135  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
136  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
137
  ; Lane 0 <- -scalar0, lane 1 <- scalar0.
138  %neg.scalar0 = fsub half -0.0, %scalar0
139  %neg.scalar0.vec = insertelement <2 x half> undef, half %neg.scalar0, i32 0
140  %neg.scalar0.scalar0 = insertelement <2 x half> %neg.scalar0.vec, half %scalar0, i32 1
141  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.scalar0.scalar0)
142  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
143  ret void
144}
145
146; Add scalar, but negate high component
147; GCN-LABEL: {{^}}fma_vector_vector_scalar_neg_hi:
148; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
149; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
150; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
151
152; GCN-NOT: pack
153; GCN-NOT: and
154; GCN-NOT: shl
155; GCN-NOT: or
156
157; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_hi:[0,0,1]{{$}}
; Kernel under test: computes fma(vec0, vec1, <scalar0, -scalar0>), i.e. the
; same loaded scalar feeds both lanes of the addend but only lane 1 is negated.
158define amdgpu_kernel void @fma_vector_vector_scalar_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
159bb:
160  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
161
162  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
163  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
164  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
165
  ; Lane 0 <- scalar0 (un-negated), lane 1 <- -scalar0. The intermediate vector
  ; is named %scalar0.vec because it holds the un-negated scalar.
166  %neg.scalar0 = fsub half -0.0, %scalar0
167  %scalar0.vec = insertelement <2 x half> undef, half %scalar0, i32 0
168  %scalar0.neg.scalar0 = insertelement <2 x half> %scalar0.vec, half %neg.scalar0, i32 1
169  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %scalar0.neg.scalar0)
170  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
171  ret void
172}
173
174; Apply fneg before broadcast with bitcast
175; GCN-LABEL: {{^}}add_vector_neg_bitcast_scalar_lo:
176; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
177; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
178
179; GCN-NOT: pack
180; GCN-NOT: and
181; GCN-NOT: shl
182; GCN-NOT: or
183
184; GCN: v_pk_add_u16 v{{[0-9]+}}, [[VEC0]], [[SCALAR0]] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}}
; Kernel under test: integer add of a <2 x i16> vector with a splat of a
; scalar. The scalar is loaded as a half, negated with a floating-point fsub,
; bitcast to i16, and then broadcast to both lanes.
; NOTE(review): the checks above expect neg_lo/neg_hi modifiers on an integer
; v_pk_add_u16 — confirm that folding the floating-point negate into an integer
; packed add is the intended output.
185define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
186bb:
187  %vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4
188  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
  ; Negate as a half, then reinterpret the bits as i16.
189  %neg.scalar0 = fsub half -0.0, %scalar0
190  %neg.scalar0.bc = bitcast half %neg.scalar0 to i16
191
  ; Splat the i16 into both lanes.
192  %neg.scalar0.vec = insertelement <2 x i16> undef, i16 %neg.scalar0.bc, i32 0
193  %neg.scalar0.broadcast = shufflevector <2 x i16> %neg.scalar0.vec, <2 x i16> undef, <2 x i32> zeroinitializer
194
195  %result = add <2 x i16> %vec0, %neg.scalar0.broadcast
196  store <2 x i16> %result, <2 x i16> addrspace(1)* %out, align 4
197  ret void
198}
199
200; GCN-LABEL: {{^}}fma_vector_vector_scalar_lo_neg_scalar_hi:
201; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
202; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
203; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
204; GCN: ds_read_u16 [[SCALAR1:v[0-9]+]]
205
206; FIXME: Remove and
207; GCN: v_and_b32_e32 [[SCALAR0]], 0xffff, [[SCALAR0]]
208; GCN: v_xor_b32_e32 [[SCALAR1]], 0x8000, [[SCALAR1]]
209; GCN: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[SCALAR1]], 16, [[SCALAR0]]
210
211; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[PACKED]]{{$}}
; Kernel under test: computes fma(vec0, vec1, <scalar0, -scalar1>), where the
; two lanes of the addend come from two different scalar loads (%arg2 and
; %arg2 + 2 half elements). The checks above expect the addend to be packed
; explicitly (and / xor / lshl_or) rather than folded into op_sel/neg modifiers.
212define amdgpu_kernel void @fma_vector_vector_scalar_lo_neg_scalar_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
213bb:
214  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
215  %arg2.gep = getelementptr inbounds half, half addrspace(3)* %arg2, i32 2
216
217  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
218  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
219
220  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
221  %scalar1 = load volatile half, half addrspace(3)* %arg2.gep, align 2
222
  ; Lane 0 <- scalar0, lane 1 <- -scalar1.
223  %neg.scalar1 = fsub half -0.0, %scalar1
224  %vec.ins0 = insertelement <2 x half> undef, half %scalar0, i32 0
225  %vec2 = insertelement <2 x half> %vec.ins0, half %neg.scalar1, i32 1
226  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2)
227  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
228  ret void
229}
230
231; FIXME: Can we avoid waitcnt between the two halves?
232; GCN-LABEL: {{^}}fma_vector_vector_neg_scalar_lo_scalar_hi:
233; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
234; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
235; GCN: ds_read_u16 [[PACKED:v[0-9]+]]
236; GCN: s_waitcnt
237; GCN: ds_read_u16_d16_hi [[PACKED]]
238
239; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[PACKED]] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
; Kernel under test: builds <scalar0, scalar1> from two separate scalar loads
; and negates the whole vector before using it as the fma addend.
; NOTE(review): the kernel name suggests only the low scalar is negated, but
; the fsub below negates both lanes; the checks above expect both neg_lo and
; neg_hi on the third operand accordingly.
240define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
241bb:
242  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
243  %arg2.gep = getelementptr inbounds half, half addrspace(3)* %arg2, i32 2
244
245  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
246  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
247
248  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
249  %scalar1 = load volatile half, half addrspace(3)* %arg2.gep, align 2
250
  ; Lane 0 <- scalar0, lane 1 <- scalar1, then negate both lanes.
251  %vec.ins0 = insertelement <2 x half> undef, half %scalar0, i32 0
252  %vec2 = insertelement <2 x half> %vec.ins0, half %scalar1, i32 1
253  %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
254
255  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.vec2)
256  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
257  ret void
258}
259
260; GCN-LABEL: {{^}}fma_vector_vector_neg_vector_hi:
261; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
262; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
263; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
264
265; GCN-NOT: pack
266; GCN-NOT: and
267; GCN-NOT: shl
268; GCN-NOT: or
269
270; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
; Kernel under test: loads three consecutive <2 x half> vectors from %lds,
; negates the third, and broadcasts lane 1 of the negated vector to both lanes
; of the fma addend: fma(vec0, vec1, <-vec2[1], -vec2[1]>).
271define amdgpu_kernel void @fma_vector_vector_neg_vector_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
272bb:
273  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
274  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
275
276  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
277  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
278  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
279
  ; Negate both lanes, then splat lane 1 (mask <1, 1>).
280  %vec2.fneg = fsub <2 x half> <half -0.0, half -0.0>, %vec2
281  %vec2.fneg.elt1.broadcast = shufflevector <2 x half> %vec2.fneg, <2 x half> undef, <2 x i32> <i32 1, i32 1>
282
283  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.fneg.elt1.broadcast)
284  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
285  ret void
286}
287
288; GCN-LABEL: {{^}}fma_vector_vector_vector_neg_hi:
289; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
290; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
291; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
292
293; GCN-NOT: pack
294; GCN-NOT: and
295; GCN-NOT: shl
296; GCN-NOT: or
297
298; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] neg_hi:[0,0,1]{{$}}
; Kernel under test: computes fma(vec0, vec1, <vec2[0], -vec2[1]>). Lane 1 of
; vec2 is extracted, negated as a scalar, and inserted back into vec2.
299define amdgpu_kernel void @fma_vector_vector_vector_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
300bb:
301  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
302  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
303
304  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
305  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
306  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
307
  ; Negate only the high element.
308  %vec2.elt1 = extractelement <2 x half> %vec2, i32 1
309  %neg.vec2.elt1 = fsub half -0.0, %vec2.elt1
310
  ; Lane 0 keeps vec2[0]; lane 1 is replaced by the negated element.
311  %neg.vec2.elt1.insert = insertelement <2 x half> %vec2, half %neg.vec2.elt1, i32 1
312  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.vec2.elt1.insert)
313  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
314  ret void
315}
316
317; GCN-LABEL: {{^}}add_vector_scalar_hi:
318; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
319; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
320
321; GCN-NOT: pack
322; GCN-NOT: and
323; GCN-NOT: shl
324; GCN-NOT: or
325
326; GCN: v_pk_add_u16 v{{[0-9]+}}, [[VEC0]], [[VEC1]] op_sel:[0,1]{{$}}
; Kernel under test: integer add of vec0 with a splat of lane 1 of vec1:
; vec0 + <vec1[1], vec1[1]>. Both vectors are <2 x i16> loaded from %lds.
327define amdgpu_kernel void @add_vector_scalar_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds) #0 {
328bb:
329  %lds.gep1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(3)* %lds, i32 1
330
331  %vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4
332  %vec1 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds.gep1, align 4
333
  ; Mask <1, 1> copies the high element of vec1 into both lanes.
334  %vec1.elt1.broadcast = shufflevector <2 x i16> %vec1, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
335  %result = add <2 x i16> %vec0, %vec1.elt1.broadcast
336
337  store <2 x i16> %result, <2 x i16> addrspace(1)* %out, align 4
338  ret void
339}
340
341; GCN-LABEL: {{^}}fma_vector_vector_scalar_hi:
342; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
343; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
344; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
345
346; GCN-NOT: pack
347; GCN-NOT: and
348; GCN-NOT: shl
349; GCN-NOT: or
350
351; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1]{{$}}
; Kernel under test: computes fma(vec0, vec1, <vec2[1], vec2[1]>), i.e. the
; high element of the third loaded vector is broadcast to both addend lanes.
352define amdgpu_kernel void @fma_vector_vector_scalar_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
353bb:
354  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
355  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
356
357  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
358  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
359  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
360
  ; Mask <1, 1> copies the high element of vec2 into both lanes.
361  %vec2.elt1.broadcast = shufflevector <2 x half> %vec2, <2 x half> undef, <2 x i32> <i32 1, i32 1>
362
363  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.elt1.broadcast)
364
365  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
366  ret void
367}
368
369; GCN-LABEL: {{^}}fma_vector_vector_neg_vector_lo_neg_hi:
370; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
371; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
372; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
373
374; GCN-NOT: pack
375; GCN-NOT: and
376; GCN-NOT: shl
377; GCN-NOT: or
378
379; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]]{{$}}
; Kernel under test: negates vec2, extracts lane 1 of the negated vector,
; negates that element again, and inserts it into lane 1 of the ORIGINAL
; %vec2 (not %neg.vec2). The addend is therefore <vec2[0], -(-vec2[1])>, and
; the checks above expect a v_pk_fma_f16 with no op_sel or neg modifiers.
380define amdgpu_kernel void @fma_vector_vector_neg_vector_lo_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
381bb:
382  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
383  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
384
385  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
386  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
387  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
388
  ; Double negation of the high element; lane 0 of %neg.vec2 is never used.
389  %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
390  %neg.vec2.elt1 = extractelement <2 x half> %neg.vec2, i32 1
391  %neg.neg.vec2.elt1 = fsub half -0.0, %neg.vec2.elt1
392  %neg.neg.vec2.elt1.insert = insertelement <2 x half> %vec2, half %neg.neg.vec2.elt1, i32 1
393
394  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.neg.vec2.elt1.insert)
395  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
396  ret void
397}
398
399; GCN-LABEL: {{^}}fma_vector_vector_swap_vector:
400; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
401; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
402; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
403
404; GCN-NOT: pack
405; GCN-NOT: and
406; GCN-NOT: shl
407; GCN-NOT: or
408
409; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}}
; Kernel under test: computes fma(vec0, vec1, <vec2[1], vec2[0]>), i.e. the
; two lanes of the third operand are swapped by a shuffle.
410define amdgpu_kernel void @fma_vector_vector_swap_vector(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
411bb:
412  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
413  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
414
415  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
416  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
417  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
418
  ; Mask <1, 0> swaps the low and high elements.
419  %vec2.swap = shufflevector <2 x half> %vec2, <2 x half> undef, <2 x i32> <i32 1, i32 0>
420  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.swap)
421
422  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
423  ret void
424}
425
426; GCN-LABEL: {{^}}fma_vector_vector_swap_neg_vector:
427; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
428; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
429; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
430
431; GCN-NOT: pack
432; GCN-NOT: and
433; GCN-NOT: shl
434; GCN-NOT: or
435; GCN-NOT: xor
436
437; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
; Kernel under test: computes fma(vec0, vec1, <-vec2[1], -vec2[0]>). The third
; vector is negated in both lanes and then its lanes are swapped.
438define amdgpu_kernel void @fma_vector_vector_swap_neg_vector(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
439bb:
440  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
441  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
442
443  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
444  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
445  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
446  %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
447
  ; Mask <1, 0> swaps the lanes of the negated vector.
448  %neg.vec2.swap = shufflevector <2 x half> %neg.vec2, <2 x half> undef, <2 x i32> <i32 1, i32 0>
449  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.vec2.swap)
450
451  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
452  ret void
453}
454
455; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_0:
456; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
457; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
458; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
459
460; GCN-NOT: pack
461; GCN-NOT: and
462; GCN-NOT: shl
463; GCN-NOT: or
464; GCN-NOT: xor
465
466; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1]{{$}}
; Kernel under test: the fma addend blends lanes of vec2 and of its negation.
; With shuffle mask <3, 0> (indices 0-1 select from %vec2, 2-3 from %neg.vec2)
; the addend is <-vec2[1], vec2[0]>.
467define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
468bb:
469  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
470  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
471
472  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
473  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
474  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
475  %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
  ; Lane 0 <- %neg.vec2[1], lane 1 <- %vec2[0].
476  %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 3, i32 0>
477  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)
478
479  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
480  ret void
481}
482
483; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_1:
484; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
485; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
486; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
487
488; GCN-NOT: pack
489; GCN-NOT: and
490; GCN-NOT: shl
491; GCN-NOT: or
492; GCN-NOT: xor
493
494; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] neg_lo:[0,0,1]{{$}}
; Kernel under test: the fma addend blends lanes of vec2 and of its negation.
; With shuffle mask <2, 1> (indices 0-1 select from %vec2, 2-3 from %neg.vec2)
; the addend is <-vec2[0], vec2[1]>.
495define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
496bb:
497  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
498  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
499
500  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
501  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
502  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
503  %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
  ; Lane 0 <- %neg.vec2[0], lane 1 <- %vec2[1].
504  %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 2, i32 1>
505  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)
506
507  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
508  ret void
509}
510
511; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_2:
512; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
513; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
514; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
515
516; GCN-NOT: pack
517; GCN-NOT: and
518; GCN-NOT: shl
519; GCN-NOT: or
520; GCN-NOT: xor
521
522; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] neg_hi:[0,0,1]{{$}}
; Kernel under test: the fma addend blends lanes of vec2 and of its negation.
; With shuffle mask <0, 3> (indices 0-1 select from %vec2, 2-3 from %neg.vec2)
; the addend is <vec2[0], -vec2[1]>.
523define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_2(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
524bb:
525  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
526  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
527
528  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
529  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
530  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
531  %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
  ; Lane 0 <- %vec2[0], lane 1 <- %neg.vec2[1].
532  %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 0, i32 3>
533  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)
534
535  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
536  ret void
537}
538
539; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_3:
540; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
541; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
542; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
543
544; GCN-NOT: pack
545; GCN-NOT: and
546; GCN-NOT: shl
547; GCN-NOT: or
548; GCN-NOT: xor
549
550; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] neg_lo:[0,0,1]{{$}}
; Kernel under test: the fma addend blends lanes of vec2 and of its negation.
; With shuffle mask <3, 1> (indices 0-1 select from %vec2, 2-3 from %neg.vec2)
; the addend is <-vec2[1], vec2[1]>, so both lanes read the high element.
551define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_3(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
552bb:
553  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
554  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
555
556  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
557  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
558  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
559  %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
  ; Lane 0 <- %neg.vec2[1], lane 1 <- %vec2[1].
560  %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 3, i32 1>
561  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)
562
563  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
564  ret void
565}
566
567; GCN-LABEL: {{^}}bitcast_fneg_f32:
568; GCN: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+$}}
; Kernel under test: a 32-bit float is negated as an f32 and the result is
; bitcast to <2 x half> before being added to vec0. The check above expects a
; v_pk_add_f16 whose line ends right after the second source register, i.e.
; with no modifiers on the add.
569define amdgpu_kernel void @bitcast_fneg_f32(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
570bb:
571  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  ; The float is loaded through an undef pointer; only its use matters here.
572  %f32 = load volatile float, float addrspace(3)* undef, align 4
573  %neg.f32 = fsub float -0.0, %f32
574  %bc = bitcast float %neg.f32 to <2 x half>
575  %result = fadd <2 x half> %vec0, %bc
576
577  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
578  ret void
579}
580
581; GCN-LABEL: {{^}}shuffle_bitcast_fneg_f32:
582; GCN: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} op_sel:[0,1] op_sel_hi:[1,0]{{$}}
; Kernel under test: like @bitcast_fneg_f32, but the <2 x half> obtained from
; the negated f32 has its lanes swapped before the add. The check above expects
; the swap to appear as op_sel/op_sel_hi on the second source, with no neg
; modifiers.
583define amdgpu_kernel void @shuffle_bitcast_fneg_f32(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
584bb:
585  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
586
587  %f32 = load volatile float, float addrspace(3)* undef, align 4
588  %neg.f32 = fsub float -0.0, %f32
589  %bc = bitcast float %neg.f32 to <2 x half>
  ; Mask <1, 0> swaps the two halves of the bitcast value.
590  %shuf = shufflevector <2 x half> %bc, <2 x half> undef, <2 x i32> <i32 1, i32 0>
591  %result = fadd <2 x half> %vec0, %shuf
592  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
593  ret void
594}
595
596; GCN-LABEL: {{^}}extract_from_i64:
597; GCN: v_lshl_or_b32
598; GCN: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+$}}
; Kernel under test: takes the two lowest 16-bit fields of a loaded i64 and
; uses them, in swapped order, as the second operand of a <2 x i16> add. The
; checks above expect an explicit v_lshl_or_b32 pack followed by a v_pk_add_u16
; without modifiers.
599define amdgpu_kernel void @extract_from_i64(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds) #0 {
600bb:
601  %vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4
602  %i64 = load volatile i64, i64 addrspace(1)* undef
603
  ; elt0 = bits [15:0], elt1 = bits [31:16] of the i64.
604  %elt0 = trunc i64 %i64 to i16
605  %hi = lshr i64 %i64, 16
606  %elt1 = trunc i64 %hi to i16
607
  ; Swapped placement: lane 0 <- elt1, lane 1 <- elt0.
608  %ins0 = insertelement <2 x i16> undef, i16 %elt1, i32 0
609  %ins1 = insertelement <2 x i16> %ins0, i16 %elt0, i32 1
610  %result = add <2 x i16> %vec0, %ins1
611  store <2 x i16> %result, <2 x i16> addrspace(1)* %out, align 4
612  ret void
613}
614
615
616; The bitcast is the final obstacle to identifying the same source register.
617; GCN-LABEL: {{^}}bitcast_lo_elt_op_sel:
618; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
619; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
620; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
621
622; GCN-NOT: pack
623; GCN-NOT: and
624; GCN-NOT: shl
625; GCN-NOT: _or
626
627; GCN: v_pk_add_f16 [[FADD:v[0-9]+]]
628; GCN-NEXT: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[FADD]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}}
; Kernel under test: computes fma(vec0, vec1, swap(vec2 + <2.0, 2.0>)). The
; checks above expect the packed add to be immediately followed by the packed
; fma, with the lane swap expressed through op_sel/op_sel_hi.
; NOTE(review): %shl and %shl.bc are computed but never used, and the shuffle
; mask <1, 0> selects only lanes of %fadd, so the second shuffle operand
; (%vec2) contributes nothing. Only the volatile load of %scalar0 has an
; effect — confirm this matches the intent of the test.
629define amdgpu_kernel void @bitcast_lo_elt_op_sel(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
630bb:
631  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
632  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
633
634  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
635  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
636  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
637
638  %scalar0 = load volatile i16, i16 addrspace(1)* undef
639  %shl = shl i16 %scalar0, 1
640  %shl.bc = bitcast i16 %shl to half
641
642  %fadd = fadd <2 x half> %vec2, <half 2.0, half 2.0>
  ; Lane 0 <- %fadd[1], lane 1 <- %fadd[0].
643  %shuffle = shufflevector <2 x half> %fadd, <2 x half> %vec2, <2 x i32> <i32 1, i32 0>
644
645  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %shuffle)
646  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
647  ret void
648}
649
650
651; The bitcast is the final obstacle to identifying the same source register.
652; GCN-LABEL: {{^}}mix_elt_types_op_sel:
653; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
654; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
655; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
656
657; GCN-NOT: pack
658; GCN-NOT: and
659; GCN-NOT: shl
660; GCN-NOT: _or
661
662; GCN: v_pk_add_f16 [[FADD:v[0-9]+]]
663; GCN-NEXT: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[FADD]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}}
; Kernel under test: computes fma(vec0, vec1, swap(vec2 + <2.0, 2.0>)) in the
; presence of an extra i16 load that is shifted, bitcast to half and inserted
; into a vector. The checks above expect the packed add to be immediately
; followed by the packed fma, with the swap expressed through op_sel/op_sel_hi.
; NOTE(review): %scalar1 is loaded (volatile) but otherwise unused, and the
; shuffle mask <1, 0> selects only lanes of %fadd, so %insert0 (and thus the
; shl/bitcast chain) contributes no lanes to the addend — confirm this matches
; the intent of the test.
664define amdgpu_kernel void @mix_elt_types_op_sel(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
665bb:
666  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
667  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
668
669  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
670  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
671  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
672
673  %scalar0 = load volatile i16, i16 addrspace(1)* undef
674  %scalar1 = load volatile half, half addrspace(1)* undef
675  %shl = shl i16 %scalar0, 1
676  %shl.bc = bitcast i16 %shl to half
677
678  %insert0 = insertelement <2 x half> undef, half %shl.bc, i32 0
679
680  %fadd = fadd <2 x half> %vec2, <half 2.0, half 2.0>
  ; Lane 0 <- %fadd[1], lane 1 <- %fadd[0].
681  %insert1 = shufflevector <2 x half> %fadd, <2 x half> %insert0, <2 x i32> <i32 1, i32 0>
682
683  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %insert1)
684  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
685  ret void
686}
687
; Packed-half fused multiply-add intrinsic called by the fma_* kernels in this file.
688declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #1
689
; #0 is attached to every kernel definition; #1 to the intrinsic declaration.
690attributes #0 = { nounwind }
691attributes #1 = { nounwind readnone }
692