; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx  | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx512f  | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512

declare double @__sqrt_finite(double)
declare float @__sqrtf_finite(float)
declare x86_fp80 @__sqrtl_finite(x86_fp80)
declare float @llvm.sqrt.f32(float)
declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
declare <8 x float> @llvm.sqrt.v8f32(<8 x float>)
declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)
declare double @llvm.sqrt.f64(double)
declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)

declare float @llvm.fabs.f32(float)
declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
declare double @llvm.fabs.f64(double)

define double @finite_f64_no_estimate(double %d) #0 {
; SSE-LABEL: finite_f64_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: finite_f64_no_estimate:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call double @__sqrt_finite(double %d) #2
  ret double %call
}

; No estimates for doubles: x86 has no f64 rsqrt estimate instruction, so
; this stays a plain sqrt even when estimates are enabled.

define double @finite_f64_estimate(double %d) #1 {
; SSE-LABEL: finite_f64_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: finite_f64_estimate:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call double @__sqrt_finite(double %d) #2
  ret double %call
}

define float @finite_f32_no_estimate(float %f) #0 {
; SSE-LABEL: finite_f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: finite_f32_no_estimate:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call float @__sqrtf_finite(float %f) #2
  ret float %call
}

define float @finite_f32_estimate_ieee(float %f) #1 {
; SSE-LABEL: finite_f32_estimate_ieee:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: finite_f32_estimate_ieee:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call float @__sqrtf_finite(float %f) #2
  ret float %call
}

define float @finite_f32_estimate_ieee_ninf(float %f) #1 {
; SSE-LABEL: finite_f32_estimate_ieee_ninf:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE-NEXT:    mulss %xmm2, %xmm3
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    addss {{.*}}(%rip), %xmm2
; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
; SSE-NEXT:    mulss %xmm3, %xmm2
; SSE-NEXT:    cmpltss {{.*}}(%rip), %xmm0
; SSE-NEXT:    andnps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: finite_f32_estimate_ieee_ninf:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm1
; AVX1-NEXT:    vmulss %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmulss %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vaddss {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulss {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vmulss %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vcmpltss {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vandnps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: finite_f32_estimate_ieee_ninf:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm1
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm1 = (xmm2 * xmm1) + mem
; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm2, %xmm2
; AVX512-NEXT:    vmulss %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
; AVX512-NEXT:    vandps %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vcmpltss {{.*}}(%rip), %xmm0, %k1
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
; AVX512-NEXT:    vmovaps %xmm1, %xmm0
; AVX512-NEXT:    retq
  %call = tail call ninf float @__sqrtf_finite(float %f) #2
  ret float %call
}
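
; A note on the expansion checked above (informal, reconstructed from the
; asm): rsqrtss returns an estimate r0 ~= 1/sqrt(x), and one Newton-Raphson
; refinement step then computes
;   sqrt(x) ~= (x * r0) * -0.5 * ((x * r0) * r0 - 3.0)
; The -5.0E-1 and -3.0E+0 constants spelled out in the vector tests below
; are exactly these refinement constants; the scalar tests load them from
; memory behind the {{.*}}(%rip) patterns.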

define float @finite_f32_estimate_daz(float %f) #4 {
; SSE-LABEL: finite_f32_estimate_daz:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: finite_f32_estimate_daz:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call float @__sqrtf_finite(float %f) #2
  ret float %call
}

define float @finite_f32_estimate_daz_ninf(float %f) #4 {
; SSE-LABEL: finite_f32_estimate_daz_ninf:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE-NEXT:    mulss %xmm2, %xmm3
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    addss {{.*}}(%rip), %xmm2
; SSE-NEXT:    mulss %xmm3, %xmm2
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    cmpeqss %xmm1, %xmm0
; SSE-NEXT:    andnps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: finite_f32_estimate_daz_ninf:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm1
; AVX1-NEXT:    vmulss %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmulss %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vaddss {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulss {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmulss %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vcmpeqss %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vandnps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: finite_f32_estimate_daz_ninf:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm1
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm1 = (xmm2 * xmm1) + mem
; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm2, %xmm2
; AVX512-NEXT:    vmulss %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vcmpeqss %xmm2, %xmm0, %k1
; AVX512-NEXT:    vmovss %xmm2, %xmm1, %xmm1 {%k1}
; AVX512-NEXT:    vmovaps %xmm1, %xmm0
; AVX512-NEXT:    retq
  %call = tail call ninf float @__sqrtf_finite(float %f) #2
  ret float %call
}

define x86_fp80 @finite_f80_no_estimate(x86_fp80 %ld) #0 {
; CHECK-LABEL: finite_f80_no_estimate:
; CHECK:       # %bb.0:
; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
; CHECK-NEXT:    fsqrt
; CHECK-NEXT:    retq
  %call = tail call x86_fp80 @__sqrtl_finite(x86_fp80 %ld) #2
  ret x86_fp80 %call
}

; Don't die on the impossible: no estimate sequence exists for x86_fp80, so
; requesting one must safely fall back to fsqrt.

define x86_fp80 @finite_f80_estimate_but_no(x86_fp80 %ld) #1 {
; CHECK-LABEL: finite_f80_estimate_but_no:
; CHECK:       # %bb.0:
; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
; CHECK-NEXT:    fsqrt
; CHECK-NEXT:    retq
  %call = tail call x86_fp80 @__sqrtl_finite(x86_fp80 %ld) #2
  ret x86_fp80 %call
}

; PR34994 - https://bugs.llvm.org/show_bug.cgi?id=34994
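;
; What the guarded expansions below verify (informal): a bare estimate
; computes sqrt(x) as x * rsqrt(x), which yields NaN for x = +0.0 (0 * Inf)
; and is not trustworthy for denormal inputs. With "denormal-fp-math"="ieee"
; (attribute #3) the lowering masks the result to zero when fabs(x) is below
; the smallest normal float (1.17549435E-38); with "ieee,preserve-sign"
; (attribute #4, the _daz tests above) comparing against zero suffices.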

define float @sqrtf_check_denorms(float %x) #3 {
; SSE-LABEL: sqrtf_check_denorms:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrtf_check_denorms:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call float @__sqrtf_finite(float %x) #2
  ret float %call
}

define float @sqrtf_check_denorms_ninf(float %x) #3 {
; SSE-LABEL: sqrtf_check_denorms_ninf:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE-NEXT:    mulss %xmm2, %xmm3
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    addss {{.*}}(%rip), %xmm2
; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
; SSE-NEXT:    mulss %xmm3, %xmm2
; SSE-NEXT:    cmpltss {{.*}}(%rip), %xmm0
; SSE-NEXT:    andnps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sqrtf_check_denorms_ninf:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm1
; AVX1-NEXT:    vmulss %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmulss %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vaddss {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulss {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vmulss %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vcmpltss {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vandnps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: sqrtf_check_denorms_ninf:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm1
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm1 = (xmm2 * xmm1) + mem
; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm2, %xmm2
; AVX512-NEXT:    vmulss %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
; AVX512-NEXT:    vandps %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vcmpltss {{.*}}(%rip), %xmm0, %k1
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
; AVX512-NEXT:    vmovaps %xmm1, %xmm0
; AVX512-NEXT:    retq
  %call = tail call ninf float @__sqrtf_finite(float %x) #2
  ret float %call
}

define <4 x float> @sqrt_v4f32_check_denorms(<4 x float> %x) #3 {
; SSE-LABEL: sqrt_v4f32_check_denorms:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_v4f32_check_denorms:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtps %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) #2
  ret <4 x float> %call
}

define <4 x float> @sqrt_v4f32_check_denorms_ninf(<4 x float> %x) #3 {
; SSE-LABEL: sqrt_v4f32_check_denorms_ninf:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm0, %xmm2
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; SSE-NEXT:    mulps %xmm1, %xmm3
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    addps {{.*}}(%rip), %xmm1
; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
; SSE-NEXT:    mulps %xmm3, %xmm1
; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; SSE-NEXT:    cmpleps %xmm0, %xmm2
; SSE-NEXT:    andps %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sqrt_v4f32_check_denorms_ninf:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmulps {{.*}}(%rip), %xmm2, %xmm3
; AVX1-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vaddps {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vmulps %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX1-NEXT:    vcmpleps %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: sqrt_v4f32_check_denorms_ninf:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vmulps %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
; AVX512-NEXT:    vandps %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX512-NEXT:    vcmpleps %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %call = tail call ninf <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) #2
  ret <4 x float> %call
}

define float @f32_no_estimate(float %x) #0 {
; SSE-LABEL: f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm1
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    divss %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: f32_no_estimate:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT:    vdivss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %sqrt = tail call float @llvm.sqrt.f32(float %x)
  %div = fdiv fast float 1.0, %sqrt
  ret float %div
}

define float @f32_estimate(float %x) #1 {
; SSE-LABEL: f32_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtss %xmm0, %xmm1
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    addss {{.*}}(%rip), %xmm0
; SSE-NEXT:    mulss {{.*}}(%rip), %xmm1
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: f32_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm1
; AVX1-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vaddss {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vmulss {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: f32_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm1
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + mem
; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %sqrt = tail call float @llvm.sqrt.f32(float %x)
  %div = fdiv fast float 1.0, %sqrt
  ret float %div
}

define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 {
; SSE-LABEL: v4f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtps %xmm0, %xmm1
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT:    divps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: v4f32_no_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vsqrtps %xmm0, %xmm0
; AVX1-NEXT:    vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX1-NEXT:    vdivps %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: v4f32_no_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vsqrtps %xmm0, %xmm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX512-NEXT:    vdivps %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <4 x float> %div
}

define <4 x float> @v4f32_estimate(<4 x float> %x) #1 {
; SSE-LABEL: v4f32_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm0, %xmm1
; SSE-NEXT:    mulps %xmm1, %xmm0
; SSE-NEXT:    mulps %xmm1, %xmm0
; SSE-NEXT:    addps {{.*}}(%rip), %xmm0
; SSE-NEXT:    mulps {{.*}}(%rip), %xmm1
; SSE-NEXT:    mulps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: v4f32_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vmulps {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: v4f32_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vfmadd231ps {{.*#+}} xmm2 = (xmm1 * xmm0) + xmm2
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm0 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vmulps %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <4 x float> %div
}

define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
; SSE-LABEL: v8f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtps %xmm1, %xmm2
; SSE-NEXT:    sqrtps %xmm0, %xmm3
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    divps %xmm3, %xmm0
; SSE-NEXT:    divps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: v8f32_no_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vsqrtps %ymm0, %ymm0
; AVX1-NEXT:    vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX1-NEXT:    vdivps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: v8f32_no_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vsqrtps %ymm0, %ymm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX512-NEXT:    vdivps %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    retq
  %sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x)
  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <8 x float> %div
}

define <8 x float> @v8f32_estimate(<8 x float> %x) #1 {
; SSE-LABEL: v8f32_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm0, %xmm2
; SSE-NEXT:    movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; SSE-NEXT:    mulps %xmm2, %xmm0
; SSE-NEXT:    mulps %xmm2, %xmm0
; SSE-NEXT:    mulps %xmm3, %xmm2
; SSE-NEXT:    movaps {{.*#+}} xmm4 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; SSE-NEXT:    addps %xmm4, %xmm0
; SSE-NEXT:    mulps %xmm2, %xmm0
; SSE-NEXT:    rsqrtps %xmm1, %xmm2
; SSE-NEXT:    mulps %xmm2, %xmm3
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    addps %xmm4, %xmm1
; SSE-NEXT:    mulps %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: v8f32_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtps %ymm0, %ymm1
; AVX1-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vaddps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
; AVX1-NEXT:    vmulps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: v8f32_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtps %ymm0, %ymm1
; AVX512-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vfmadd231ps {{.*#+}} ymm2 = (ymm1 * ymm0) + ymm2
; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm0 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vmulps %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x)
  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <8 x float> %div
}

define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 {
; SSE-LABEL: v16f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtps %xmm3, %xmm4
; SSE-NEXT:    sqrtps %xmm2, %xmm5
; SSE-NEXT:    sqrtps %xmm1, %xmm2
; SSE-NEXT:    sqrtps %xmm0, %xmm1
; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    divps %xmm1, %xmm0
; SSE-NEXT:    movaps %xmm3, %xmm1
; SSE-NEXT:    divps %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm3, %xmm2
; SSE-NEXT:    divps %xmm5, %xmm2
; SSE-NEXT:    divps %xmm4, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: v16f32_no_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vsqrtps %ymm1, %ymm1
; AVX1-NEXT:    vsqrtps %ymm0, %ymm0
; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX1-NEXT:    vdivps %ymm0, %ymm2, %ymm0
; AVX1-NEXT:    vdivps %ymm1, %ymm2, %ymm1
; AVX1-NEXT:    retq
;
; AVX512-LABEL: v16f32_no_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vsqrtps %zmm0, %zmm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX512-NEXT:    vdivps %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    retq
  %sqrt = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %x)
  %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <16 x float> %div
}

define <16 x float> @v16f32_estimate(<16 x float> %x) #1 {
; SSE-LABEL: v16f32_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm0, %xmm5
; SSE-NEXT:    movaps {{.*#+}} xmm4 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; SSE-NEXT:    mulps %xmm5, %xmm0
; SSE-NEXT:    mulps %xmm5, %xmm0
; SSE-NEXT:    movaps %xmm5, %xmm6
; SSE-NEXT:    mulps %xmm4, %xmm6
; SSE-NEXT:    movaps {{.*#+}} xmm5 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; SSE-NEXT:    addps %xmm5, %xmm0
; SSE-NEXT:    mulps %xmm6, %xmm0
; SSE-NEXT:    rsqrtps %xmm1, %xmm6
; SSE-NEXT:    mulps %xmm6, %xmm1
; SSE-NEXT:    mulps %xmm6, %xmm1
; SSE-NEXT:    mulps %xmm4, %xmm6
; SSE-NEXT:    addps %xmm5, %xmm1
; SSE-NEXT:    mulps %xmm6, %xmm1
; SSE-NEXT:    rsqrtps %xmm2, %xmm6
; SSE-NEXT:    mulps %xmm6, %xmm2
; SSE-NEXT:    mulps %xmm6, %xmm2
; SSE-NEXT:    mulps %xmm4, %xmm6
; SSE-NEXT:    addps %xmm5, %xmm2
; SSE-NEXT:    mulps %xmm6, %xmm2
; SSE-NEXT:    rsqrtps %xmm3, %xmm6
; SSE-NEXT:    mulps %xmm6, %xmm4
; SSE-NEXT:    mulps %xmm6, %xmm3
; SSE-NEXT:    mulps %xmm6, %xmm3
; SSE-NEXT:    addps %xmm5, %xmm3
; SSE-NEXT:    mulps %xmm4, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: v16f32_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtps %ymm0, %ymm2
; AVX1-NEXT:    vmovaps {{.*#+}} ymm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX1-NEXT:    vmulps %ymm3, %ymm2, %ymm4
; AVX1-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX1-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vmulps %ymm0, %ymm4, %ymm0
; AVX1-NEXT:    vrsqrtps %ymm1, %ymm4
; AVX1-NEXT:    vmulps %ymm3, %ymm4, %ymm3
; AVX1-NEXT:    vmulps %ymm4, %ymm1, %ymm1
; AVX1-NEXT:    vmulps %ymm4, %ymm1, %ymm1
; AVX1-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vmulps %ymm1, %ymm3, %ymm1
; AVX1-NEXT:    retq
;
; AVX512-LABEL: v16f32_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrt14ps %zmm0, %zmm1
; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + mem
; AVX512-NEXT:    vmulps {{.*}}(%rip){1to16}, %zmm1, %zmm1
; AVX512-NEXT:    vmulps %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    retq
  %sqrt = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %x)
  %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <16 x float> %div
}

; x / (fabs(y) * sqrt(z)) --> x * rsqrt(y*y*z)
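;
; The algebra behind the fold (informal): fabs(y) = sqrt(y*y), and for
; non-negative operands sqrt(a) * sqrt(b) = sqrt(a*b), so
;   x / (fabs(y) * sqrt(z)) = x / sqrt(y*y*z) = x * rsqrt(y*y*z)
; trading the fabs and the division for two multiplies plus one
; rsqrt-estimate expansion.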

define float @div_sqrt_fabs_f32(float %x, float %y, float %z) {
; SSE-LABEL: div_sqrt_fabs_f32:
; SSE:       # %bb.0:
; SSE-NEXT:    mulss %xmm1, %xmm1
; SSE-NEXT:    mulss %xmm2, %xmm1
; SSE-NEXT:    xorps %xmm2, %xmm2
; SSE-NEXT:    rsqrtss %xmm1, %xmm2
; SSE-NEXT:    mulss %xmm2, %xmm1
; SSE-NEXT:    mulss %xmm2, %xmm1
; SSE-NEXT:    addss {{.*}}(%rip), %xmm1
; SSE-NEXT:    mulss {{.*}}(%rip), %xmm2
; SSE-NEXT:    mulss %xmm0, %xmm2
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: div_sqrt_fabs_f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmulss %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vrsqrtss %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vaddss {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulss {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmulss %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: div_sqrt_fabs_f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulss %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vrsqrtss %xmm1, %xmm1, %xmm2
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm1 = (xmm2 * xmm1) + mem
; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm2, %xmm2
; AVX512-NEXT:    vmulss %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %s = call fast float @llvm.sqrt.f32(float %z)
  %a = call fast float @llvm.fabs.f32(float %y)
  %m = fmul fast float %s, %a
  %d = fdiv fast float %x, %m
  ret float %d
}

; x / (fabs(y) * sqrt(z)) --> x * rsqrt(y*y*z)

define <4 x float> @div_sqrt_fabs_v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
; SSE-LABEL: div_sqrt_fabs_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    mulps %xmm1, %xmm1
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    rsqrtps %xmm1, %xmm2
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    addps {{.*}}(%rip), %xmm1
; SSE-NEXT:    mulps {{.*}}(%rip), %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    mulps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: div_sqrt_fabs_v4f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmulps %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vrsqrtps %xmm1, %xmm2
; AVX1-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vaddps {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulps {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: div_sqrt_fabs_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulps %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vrsqrtps %xmm1, %xmm2
; AVX512-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vmulps %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %s = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %z)
  %a = call <4 x float> @llvm.fabs.v4f32(<4 x float> %y)
  %m = fmul reassoc <4 x float> %a, %s
  %d = fdiv reassoc arcp <4 x float> %x, %m
  ret <4 x float> %d
}

; This has 'arcp' but does not have 'reassoc' FMF.
; We allow converting the sqrt to an estimate, but
; do not pull the divisor into the estimate.
; x / (fabs(y) * sqrt(z)) --> x * rsqrt(z) / fabs(y)
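; That is, the rsqrt expansion still replaces sqrt(z), but fabs(y) survives
; as a separate divide: folding y into the radicand as y*y*z would
; reassociate the divisor, which these flags do not permit.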

define <4 x float> @div_sqrt_fabs_v4f32_fmf(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
; SSE-LABEL: div_sqrt_fabs_v4f32_fmf:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm2, %xmm3
; SSE-NEXT:    mulps %xmm3, %xmm2
; SSE-NEXT:    mulps %xmm3, %xmm2
; SSE-NEXT:    addps {{.*}}(%rip), %xmm2
; SSE-NEXT:    mulps {{.*}}(%rip), %xmm3
; SSE-NEXT:    mulps %xmm2, %xmm3
; SSE-NEXT:    andps {{.*}}(%rip), %xmm1
; SSE-NEXT:    divps %xmm1, %xmm3
; SSE-NEXT:    mulps %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: div_sqrt_fabs_v4f32_fmf:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtps %xmm2, %xmm3
; AVX1-NEXT:    vmulps %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmulps %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vaddps {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmulps {{.*}}(%rip), %xmm3, %xmm3
; AVX1-NEXT:    vmulps %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vandps {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vdivps %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: div_sqrt_fabs_v4f32_fmf:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtps %xmm2, %xmm3
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm4 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vmulps %xmm4, %xmm3, %xmm4
; AVX512-NEXT:    vmulps %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vmulps %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vaddps %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm3 = [NaN,NaN,NaN,NaN]
; AVX512-NEXT:    vmulps %xmm2, %xmm4, %xmm2
; AVX512-NEXT:    vandps %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vdivps %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %s = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %z)
  %a = call <4 x float> @llvm.fabs.v4f32(<4 x float> %y)
  %m = fmul <4 x float> %a, %s
  %d = fdiv arcp <4 x float> %x, %m
  ret <4 x float> %d
}

; No estimates for f64, so do not convert fabs into an fmul.

define double @div_sqrt_fabs_f64(double %x, double %y, double %z) {
; SSE-LABEL: div_sqrt_fabs_f64:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm2, %xmm2
; SSE-NEXT:    andpd {{.*}}(%rip), %xmm1
; SSE-NEXT:    mulsd %xmm2, %xmm1
; SSE-NEXT:    divsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: div_sqrt_fabs_f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vandpd {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vmulsd %xmm1, %xmm2, %xmm1
; AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %s = call fast double @llvm.sqrt.f64(double %z)
  %a = call fast double @llvm.fabs.f64(double %y)
  %m = fmul fast double %s, %a
  %d = fdiv fast double %x, %m
  ret double %d
}

; This is a special case for the general pattern above -
; if the sqrt operand is the same as the other mul op,
; then fabs may be omitted.
; x / (y * sqrt(y)) --> x * rsqrt(y*y*y)
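;
; Why fabs can be dropped here (informal): sqrt(y) is NaN for any negative y,
; so the inputs of interest are non-negative, where y == fabs(y); the general
; pattern then collapses to a single operand:
;   x / (y * sqrt(y)) = x / sqrt(y*y*y) = x * rsqrt(y*y*y)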

define float @div_sqrt_f32(float %x, float %y) {
; SSE-LABEL: div_sqrt_f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    rsqrtss %xmm2, %xmm1
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    addss {{.*}}(%rip), %xmm2
; SSE-NEXT:    mulss {{.*}}(%rip), %xmm1
; SSE-NEXT:    mulss %xmm0, %xmm1
; SSE-NEXT:    mulss %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: div_sqrt_f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmulss %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vmulss %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vrsqrtss %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vaddss {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulss {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmulss %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: div_sqrt_f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulss %xmm1, %xmm1, %xmm2
; AVX512-NEXT:    vmulss %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vrsqrtss %xmm1, %xmm1, %xmm2
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm1 = (xmm2 * xmm1) + mem
; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm2, %xmm2
; AVX512-NEXT:    vmulss %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %s = call fast float @llvm.sqrt.f32(float %y)
  %m = fmul fast float %s, %y
  %d = fdiv fast float %x, %m
  ret float %d
}

; This is a special case for the general pattern above -
; if the sqrt operand is the same as the other mul op,
; then fabs may be omitted.
; x / (y * sqrt(y)) --> x * rsqrt(y*y*y)

define <4 x float> @div_sqrt_v4f32(<4 x float> %x, <4 x float> %y) {
; SSE-LABEL: div_sqrt_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    rsqrtps %xmm2, %xmm1
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    addps {{.*}}(%rip), %xmm2
; SSE-NEXT:    mulps {{.*}}(%rip), %xmm1
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    mulps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: div_sqrt_v4f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmulps %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vrsqrtps %xmm1, %xmm2
; AVX1-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vaddps {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulps {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: div_sqrt_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulps %xmm1, %xmm1, %xmm2
; AVX512-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vrsqrtps %xmm1, %xmm2
; AVX512-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vmulps %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %s = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %y)
  %m = fmul reassoc <4 x float> %y, %s
  %d = fdiv reassoc arcp <4 x float> %x, %m
  ret <4 x float> %d
}

define double @sqrt_fdiv_common_operand(double %x) nounwind {
; SSE-LABEL: sqrt_fdiv_common_operand:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_fdiv_common_operand:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %sqrt = call fast double @llvm.sqrt.f64(double %x)
  %r = fdiv fast double %x, %sqrt
  ret double %r
}

define <2 x double> @sqrt_fdiv_common_operand_vec(<2 x double> %x) nounwind {
; SSE-LABEL: sqrt_fdiv_common_operand_vec:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtpd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_fdiv_common_operand_vec:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtpd %xmm0, %xmm0
; AVX-NEXT:    retq
  %sqrt = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
  %r = fdiv arcp nsz reassoc <2 x double> %x, %sqrt
  ret <2 x double> %r
}

define double @sqrt_fdiv_common_operand_extra_use(double %x, double* %p) nounwind {
; SSE-LABEL: sqrt_fdiv_common_operand_extra_use:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    movsd %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_fdiv_common_operand_extra_use:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovsd %xmm0, (%rdi)
; AVX-NEXT:    retq
  %sqrt = call fast double @llvm.sqrt.f64(double %x)
  store double %sqrt, double* %p
  %r = fdiv fast double %x, %sqrt
  ret double %r
}

define double @sqrt_simplify_before_recip(double %x, double* %p) nounwind {
; SSE-LABEL: sqrt_simplify_before_recip:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT:    divsd %xmm0, %xmm1
; SSE-NEXT:    movsd %xmm1, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_simplify_before_recip:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vdivsd %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vmovsd %xmm1, (%rdi)
; AVX-NEXT:    retq
  %sqrt = tail call fast double @llvm.sqrt.f64(double %x)
  %rsqrt = fdiv fast double 1.0, %sqrt
  %sqrt_fast = fdiv fast double %x, %sqrt
  store double %rsqrt, double* %p, align 8
  ret double %sqrt_fast
}

define <2 x double> @sqrt_simplify_before_recip_vec(<2 x double> %x, <2 x double>* %p) nounwind {
; SSE-LABEL: sqrt_simplify_before_recip_vec:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtpd %xmm0, %xmm0
; SSE-NEXT:    movapd {{.*#+}} xmm1 = [1.0E+0,1.0E+0]
; SSE-NEXT:    divpd %xmm0, %xmm1
; SSE-NEXT:    movupd %xmm1, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_simplify_before_recip_vec:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtpd %xmm0, %xmm0
; AVX-NEXT:    vmovapd {{.*#+}} xmm1 = [1.0E+0,1.0E+0]
; AVX-NEXT:    vdivpd %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vmovupd %xmm1, (%rdi)
; AVX-NEXT:    retq
  %sqrt = tail call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
  %rsqrt = fdiv fast <2 x double> <double 1.0, double 1.0>, %sqrt
  %sqrt_fast = fdiv fast <2 x double> %x, %sqrt
  store <2 x double> %rsqrt, <2 x double>* %p, align 8
  ret <2 x double> %sqrt_fast
}

define double @sqrt_simplify_before_recip_order(double %x, double* %p) nounwind {
; SSE-LABEL: sqrt_simplify_before_recip_order:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT:    divsd %xmm0, %xmm1
; SSE-NEXT:    movsd %xmm1, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_simplify_before_recip_order:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vdivsd %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vmovsd %xmm1, (%rdi)
; AVX-NEXT:    retq
  %sqrt = tail call fast double @llvm.sqrt.f64(double %x)
  %sqrt_fast = fdiv fast double %x, %sqrt
  %rsqrt = fdiv fast double 42.0, %sqrt
  store double %rsqrt, double* %p, align 8
  ret double %sqrt_fast
}

attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!sqrtf,!vec-sqrtf,!divf,!vec-divf" }
attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" }
attributes #2 = { nounwind readnone }
attributes #3 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="ieee" }
attributes #4 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="ieee,preserve-sign" }
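
; A reading guide for the attributes above (informal): in the
; "reciprocal-estimates" string, "sqrt,vec-sqrt" enables estimate codegen for
; scalar and vector square roots, while a '!' prefix such as "!sqrtf"
; disables it, so #0 never uses estimates. #1, #3, and #4 request the same
; estimates and differ only in "denormal-fp-math" (unspecified, "ieee", and
; "ieee,preserve-sign" respectively).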