; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc --mtriple=aarch64-eabi -aarch64-neon-syntax=generic -mattr=+fullfp16 < %s | FileCheck %s
; RUN: llc --mtriple=aarch64-eabi -aarch64-neon-syntax=generic < %s | FileCheck %s --check-prefix=CHECKNOFP16

; add_HalfS: fast fadd reduction over a <2 x float> vector with a -0.0 start
; value. Both llc invocations are expected to emit a single pairwise faddp on
; the two lanes.
define float @add_HalfS(<2 x float> %bin.rdx)  {
; CHECK-LABEL: add_HalfS:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    faddp s0, v0.2s
; CHECK-NEXT:    ret
;
; CHECKNOFP16-LABEL: add_HalfS:
; CHECKNOFP16:       // %bb.0:
; CHECKNOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECKNOFP16-NEXT:    faddp s0, v0.2s
; CHECKNOFP16-NEXT:    ret
  %r = call fast float @llvm.vector.reduce.fadd.f32.v2f32(float -0.0, <2 x float> %bin.rdx)
  ret float %r
}

; add_HalfH: fast fadd reduction over a <4 x half> vector with a -0.0 start
; value. With +fullfp16 the expected code is a pairwise faddp on lanes 0-1
; followed by two scalar half adds for lanes 2 and 3. Without fullfp16 every
; add is performed in single precision, with fcvt conversions to and from half
; around each step.
define half @add_HalfH(<4 x half> %bin.rdx)  {
; CHECK-LABEL: add_HalfH:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    mov h1, v0.h[3]
; CHECK-NEXT:    mov h2, v0.h[2]
; CHECK-NEXT:    faddp h0, v0.2h
; CHECK-NEXT:    fadd h0, h0, h2
; CHECK-NEXT:    fadd h0, h0, h1
; CHECK-NEXT:    ret
;
; CHECKNOFP16-LABEL: add_HalfH:
; CHECKNOFP16:       // %bb.0:
; CHECKNOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECKNOFP16-NEXT:    mov h3, v0.h[1]
; CHECKNOFP16-NEXT:    mov h1, v0.h[3]
; CHECKNOFP16-NEXT:    mov h2, v0.h[2]
; CHECKNOFP16-NEXT:    fcvt s0, h0
; CHECKNOFP16-NEXT:    fcvt s3, h3
; CHECKNOFP16-NEXT:    fadd s0, s0, s3
; CHECKNOFP16-NEXT:    fcvt h0, s0
; CHECKNOFP16-NEXT:    fcvt s2, h2
; CHECKNOFP16-NEXT:    fcvt s0, h0
; CHECKNOFP16-NEXT:    fadd s0, s0, s2
; CHECKNOFP16-NEXT:    fcvt h0, s0
; CHECKNOFP16-NEXT:    fcvt s0, h0
; CHECKNOFP16-NEXT:    fcvt s1, h1
; CHECKNOFP16-NEXT:    fadd s0, s0, s1
; CHECKNOFP16-NEXT:    fcvt h0, s0
; CHECKNOFP16-NEXT:    ret
  %r = call fast half @llvm.vector.reduce.fadd.f16.v4f16(half -0.0, <4 x half> %bin.rdx)
  ret half %r
}


; add_H: fast fadd reduction over a full <8 x half> vector with a -0.0 start
; value. With +fullfp16 the upper four lanes are folded onto the lower four
; (ext + fadd v0.4h), then reduced with faddp and two scalar half adds.
; Without fullfp16 the expected code is a sequential chain of seven
; single-precision adds, converting through half between each step.
define half @add_H(<8 x half> %bin.rdx)  {
; CHECK-LABEL: add_H:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-NEXT:    fadd v0.4h, v0.4h, v1.4h
; CHECK-NEXT:    mov h1, v0.h[2]
; CHECK-NEXT:    faddp h2, v0.2h
; CHECK-NEXT:    fadd h1, h2, h1
; CHECK-NEXT:    mov h0, v0.h[3]
; CHECK-NEXT:    fadd h0, h1, h0
; CHECK-NEXT:    ret
;
; CHECKNOFP16-LABEL: add_H:
; CHECKNOFP16:       // %bb.0:
; CHECKNOFP16-NEXT:    mov h7, v0.h[1]
; CHECKNOFP16-NEXT:    mov h1, v0.h[7]
; CHECKNOFP16-NEXT:    mov h2, v0.h[6]
; CHECKNOFP16-NEXT:    mov h3, v0.h[5]
; CHECKNOFP16-NEXT:    mov h4, v0.h[4]
; CHECKNOFP16-NEXT:    mov h5, v0.h[3]
; CHECKNOFP16-NEXT:    mov h6, v0.h[2]
; CHECKNOFP16-NEXT:    fcvt s0, h0
; CHECKNOFP16-NEXT:    fcvt s7, h7
; CHECKNOFP16-NEXT:    fadd s0, s0, s7
; CHECKNOFP16-NEXT:    fcvt h0, s0
; CHECKNOFP16-NEXT:    fcvt s6, h6
; CHECKNOFP16-NEXT:    fcvt s0, h0
; CHECKNOFP16-NEXT:    fadd s0, s0, s6
; CHECKNOFP16-NEXT:    fcvt h0, s0
; CHECKNOFP16-NEXT:    fcvt s5, h5
; CHECKNOFP16-NEXT:    fcvt s0, h0
; CHECKNOFP16-NEXT:    fadd s0, s0, s5
; CHECKNOFP16-NEXT:    fcvt h0, s0
; CHECKNOFP16-NEXT:    fcvt s4, h4
; CHECKNOFP16-NEXT:    fcvt s0, h0
; CHECKNOFP16-NEXT:    fadd s0, s0, s4
; CHECKNOFP16-NEXT:    fcvt h0, s0
; CHECKNOFP16-NEXT:    fcvt s3, h3
; CHECKNOFP16-NEXT:    fcvt s0, h0
; CHECKNOFP16-NEXT:    fadd s0, s0, s3
; CHECKNOFP16-NEXT:    fcvt h0, s0
; CHECKNOFP16-NEXT:    fcvt s2, h2
; CHECKNOFP16-NEXT:    fcvt s0, h0
; CHECKNOFP16-NEXT:    fadd s0, s0, s2
; CHECKNOFP16-NEXT:    fcvt h0, s0
; CHECKNOFP16-NEXT:    fcvt s0, h0
; CHECKNOFP16-NEXT:    fcvt s1, h1
; CHECKNOFP16-NEXT:    fadd s0, s0, s1
; CHECKNOFP16-NEXT:    fcvt h0, s0
; CHECKNOFP16-NEXT:    ret
  %r = call fast half @llvm.vector.reduce.fadd.f16.v8f16(half -0.0, <8 x half> %bin.rdx)
  ret half %r
}

; add_S: fast fadd reduction over a <4 x float> vector with a -0.0 start
; value. Both llc invocations expect the upper two lanes to be folded onto the
; lower two (ext + fadd v0.2s), finished by a pairwise faddp.
define float @add_S(<4 x float> %bin.rdx)  {
; CHECK-LABEL: add_S:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-NEXT:    fadd v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    faddp s0, v0.2s
; CHECK-NEXT:    ret
;
; CHECKNOFP16-LABEL: add_S:
; CHECKNOFP16:       // %bb.0:
; CHECKNOFP16-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECKNOFP16-NEXT:    fadd v0.2s, v0.2s, v1.2s
; CHECKNOFP16-NEXT:    faddp s0, v0.2s
; CHECKNOFP16-NEXT:    ret
  %r = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %bin.rdx)
  ret float %r
}

; add_D: fast fadd reduction over a <2 x double> vector with a -0.0 start
; value. Both llc invocations expect a single pairwise faddp on the two lanes.
define double @add_D(<2 x double> %bin.rdx)  {
; CHECK-LABEL: add_D:
; CHECK:       // %bb.0:
; CHECK-NEXT:    faddp d0, v0.2d
; CHECK-NEXT:    ret
;
; CHECKNOFP16-LABEL: add_D:
; CHECKNOFP16:       // %bb.0:
; CHECKNOFP16-NEXT:    faddp d0, v0.2d
; CHECKNOFP16-NEXT:    ret
  %r = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double -0.0, <2 x double> %bin.rdx)
  ret double %r
}

; add_2H: fast fadd reduction over a <16 x half> vector (passed in v0 and v1)
; with a -0.0 start value. With +fullfp16 the two registers are first added
; lane-wise (fadd v0.8h), then reduced as for a single 8-lane vector. Without
; fullfp16 the expected code adds corresponding lanes of the two registers in
; single precision and then accumulates the eight partial sums one at a time,
; converting through half around each add.
define half @add_2H(<16 x half> %bin.rdx)  {
; CHECK-LABEL: add_2H:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-NEXT:    fadd v0.4h, v0.4h, v1.4h
; CHECK-NEXT:    mov h1, v0.h[2]
; CHECK-NEXT:    faddp h2, v0.2h
; CHECK-NEXT:    fadd h1, h2, h1
; CHECK-NEXT:    mov h0, v0.h[3]
; CHECK-NEXT:    fadd h0, h1, h0
; CHECK-NEXT:    ret
;
; CHECKNOFP16-LABEL: add_2H:
; CHECKNOFP16:       // %bb.0:
; CHECKNOFP16-NEXT:    mov h2, v1.h[1]
; CHECKNOFP16-NEXT:    mov h3, v0.h[1]
; CHECKNOFP16-NEXT:    mov h6, v1.h[2]
; CHECKNOFP16-NEXT:    mov h7, v0.h[2]
; CHECKNOFP16-NEXT:    mov h16, v1.h[3]
; CHECKNOFP16-NEXT:    mov h17, v0.h[3]
; CHECKNOFP16-NEXT:    fcvt s4, h1
; CHECKNOFP16-NEXT:    fcvt s5, h0
; CHECKNOFP16-NEXT:    fcvt s2, h2
; CHECKNOFP16-NEXT:    fcvt s3, h3
; CHECKNOFP16-NEXT:    fcvt s6, h6
; CHECKNOFP16-NEXT:    fcvt s7, h7
; CHECKNOFP16-NEXT:    fcvt s16, h16
; CHECKNOFP16-NEXT:    fcvt s17, h17
; CHECKNOFP16-NEXT:    fadd s4, s5, s4
; CHECKNOFP16-NEXT:    mov h5, v1.h[4]
; CHECKNOFP16-NEXT:    fadd s2, s3, s2
; CHECKNOFP16-NEXT:    mov h3, v0.h[4]
; CHECKNOFP16-NEXT:    fadd s6, s7, s6
; CHECKNOFP16-NEXT:    mov h7, v1.h[5]
; CHECKNOFP16-NEXT:    fadd s16, s17, s16
; CHECKNOFP16-NEXT:    mov h17, v0.h[5]
; CHECKNOFP16-NEXT:    fcvt s5, h5
; CHECKNOFP16-NEXT:    fcvt s3, h3
; CHECKNOFP16-NEXT:    fcvt s7, h7
; CHECKNOFP16-NEXT:    fcvt s17, h17
; CHECKNOFP16-NEXT:    fadd s3, s3, s5
; CHECKNOFP16-NEXT:    mov h5, v1.h[6]
; CHECKNOFP16-NEXT:    fadd s7, s17, s7
; CHECKNOFP16-NEXT:    mov h17, v0.h[6]
; CHECKNOFP16-NEXT:    mov h1, v1.h[7]
; CHECKNOFP16-NEXT:    mov h0, v0.h[7]
; CHECKNOFP16-NEXT:    fcvt s1, h1
; CHECKNOFP16-NEXT:    fcvt s0, h0
; CHECKNOFP16-NEXT:    fadd s0, s0, s1
; CHECKNOFP16-NEXT:    fcvt h1, s4
; CHECKNOFP16-NEXT:    fcvt h2, s2
; CHECKNOFP16-NEXT:    fcvt s1, h1
; CHECKNOFP16-NEXT:    fcvt s2, h2
; CHECKNOFP16-NEXT:    fadd s1, s1, s2
; CHECKNOFP16-NEXT:    fcvt h2, s6
; CHECKNOFP16-NEXT:    fcvt h1, s1
; CHECKNOFP16-NEXT:    fcvt s2, h2
; CHECKNOFP16-NEXT:    fcvt s1, h1
; CHECKNOFP16-NEXT:    fadd s1, s1, s2
; CHECKNOFP16-NEXT:    fcvt h2, s16
; CHECKNOFP16-NEXT:    fcvt h1, s1
; CHECKNOFP16-NEXT:    fcvt s2, h2
; CHECKNOFP16-NEXT:    fcvt s1, h1
; CHECKNOFP16-NEXT:    fadd s1, s1, s2
; CHECKNOFP16-NEXT:    fcvt h2, s3
; CHECKNOFP16-NEXT:    fcvt h1, s1
; CHECKNOFP16-NEXT:    fcvt s2, h2
; CHECKNOFP16-NEXT:    fcvt s1, h1
; CHECKNOFP16-NEXT:    fadd s1, s1, s2
; CHECKNOFP16-NEXT:    fcvt h3, s7
; CHECKNOFP16-NEXT:    fcvt h1, s1
; CHECKNOFP16-NEXT:    fcvt s5, h5
; CHECKNOFP16-NEXT:    fcvt s17, h17
; CHECKNOFP16-NEXT:    fcvt s3, h3
; CHECKNOFP16-NEXT:    fcvt s1, h1
; CHECKNOFP16-NEXT:    fadd s5, s17, s5
; CHECKNOFP16-NEXT:    fadd s1, s1, s3
; CHECKNOFP16-NEXT:    fcvt h4, s5
; CHECKNOFP16-NEXT:    fcvt h1, s1
; CHECKNOFP16-NEXT:    fcvt s4, h4
; CHECKNOFP16-NEXT:    fcvt s1, h1
; CHECKNOFP16-NEXT:    fadd s1, s1, s4
; CHECKNOFP16-NEXT:    fcvt h0, s0
; CHECKNOFP16-NEXT:    fcvt h1, s1
; CHECKNOFP16-NEXT:    fcvt s1, h1
; CHECKNOFP16-NEXT:    fcvt s0, h0
; CHECKNOFP16-NEXT:    fadd s0, s1, s0
; CHECKNOFP16-NEXT:    fcvt h0, s0
; CHECKNOFP16-NEXT:    ret
  %r = call fast half @llvm.vector.reduce.fadd.f16.v16f16(half -0.0, <16 x half> %bin.rdx)
  ret half %r
}

; add_2S: fast fadd reduction over an <8 x float> vector (passed in v0 and v1)
; with a -0.0 start value. Both llc invocations expect a lane-wise fadd of the
; two registers, then the ext + fadd v0.2s fold and a final pairwise faddp.
define float @add_2S(<8 x float> %bin.rdx)  {
; CHECK-LABEL: add_2S:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fadd v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-NEXT:    fadd v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    faddp s0, v0.2s
; CHECK-NEXT:    ret
;
; CHECKNOFP16-LABEL: add_2S:
; CHECKNOFP16:       // %bb.0:
; CHECKNOFP16-NEXT:    fadd v0.4s, v0.4s, v1.4s
; CHECKNOFP16-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECKNOFP16-NEXT:    fadd v0.2s, v0.2s, v1.2s
; CHECKNOFP16-NEXT:    faddp s0, v0.2s
; CHECKNOFP16-NEXT:    ret
  %r = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %bin.rdx)
  ret float %r
}

; add_2D: fast fadd reduction over a <4 x double> vector (passed in v0 and v1)
; with a -0.0 start value. Both llc invocations expect a lane-wise fadd of the
; two registers followed by a single pairwise faddp.
define double @add_2D(<4 x double> %bin.rdx)  {
; CHECK-LABEL: add_2D:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fadd v0.2d, v0.2d, v1.2d
; CHECK-NEXT:    faddp d0, v0.2d
; CHECK-NEXT:    ret
;
; CHECKNOFP16-LABEL: add_2D:
; CHECKNOFP16:       // %bb.0:
; CHECKNOFP16-NEXT:    fadd v0.2d, v0.2d, v1.2d
; CHECKNOFP16-NEXT:    faddp d0, v0.2d
; CHECKNOFP16-NEXT:    ret
  %r = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double -0.0, <4 x double> %bin.rdx)
  ret double %r
}

; Declarations of the llvm.vector.reduce.fadd intrinsics called by the test
; functions in this file: one per element type (half, float, double) and
; vector width. Each takes the scalar start value followed by the vector.
; Function Attrs: nounwind readnone
declare half @llvm.vector.reduce.fadd.f16.v4f16(half, <4 x half>)
declare half @llvm.vector.reduce.fadd.f16.v8f16(half, <8 x half>)
declare half @llvm.vector.reduce.fadd.f16.v16f16(half, <16 x half>)
declare float @llvm.vector.reduce.fadd.f32.v2f32(float, <2 x float>)
declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
declare double @llvm.vector.reduce.fadd.f64.v2f64(double, <2 x double>)
declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>)
281