; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512

declare double @__sqrt_finite(double)
declare float @__sqrtf_finite(float)
declare x86_fp80 @__sqrtl_finite(x86_fp80)
declare float @llvm.sqrt.f32(float)
declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
declare <8 x float> @llvm.sqrt.v8f32(<8 x float>)
declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)
declare double @llvm.sqrt.f64(double)
declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)

declare float @llvm.fabs.f32(float)
declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
declare double @llvm.fabs.f64(double)

define double @finite_f64_no_estimate(double %d) #0 {
; SSE-LABEL: finite_f64_no_estimate:
; SSE: # %bb.0:
; SSE-NEXT: sqrtsd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: finite_f64_no_estimate:
; AVX: # %bb.0:
; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %call = tail call double @__sqrt_finite(double %d) #2
  ret double %call
}

; No estimates for doubles.

define double @finite_f64_estimate(double %d) #1 {
; SSE-LABEL: finite_f64_estimate:
; SSE: # %bb.0:
; SSE-NEXT: sqrtsd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: finite_f64_estimate:
; AVX: # %bb.0:
; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %call = tail call double @__sqrt_finite(double %d) #2
  ret double %call
}

define float @finite_f32_no_estimate(float %f) #0 {
; SSE-LABEL: finite_f32_no_estimate:
; SSE: # %bb.0:
; SSE-NEXT: sqrtss %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: finite_f32_no_estimate:
; AVX: # %bb.0:
; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %call = tail call float @__sqrtf_finite(float %f) #2
  ret float %call
}

define float @finite_f32_estimate_ieee(float %f) #1 {
; SSE-LABEL: finite_f32_estimate_ieee:
; SSE: # %bb.0:
; SSE-NEXT: sqrtss %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: finite_f32_estimate_ieee:
; AVX: # %bb.0:
; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %call = tail call float @__sqrtf_finite(float %f) #2
  ret float %call
}

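; With 'ninf', the sqrt call can be lowered to the hardware estimate plus
; one Newton-Raphson refinement step (visible in the -0.5 / -3.0 constants):
;   sqrt(x) ~= (x * est) * -0.5 * (x * est * est - 3.0), where est = rsqrt(x)
; The result is then masked to 0.0 for inputs too small for the estimate,
; so that a +0.0 input does not produce 0 * inf = NaN.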
define float @finite_f32_estimate_ieee_ninf(float %f) #1 {
; SSE-LABEL: finite_f32_estimate_ieee_ninf:
; SSE: # %bb.0:
; SSE-NEXT: rsqrtss %xmm0, %xmm1
; SSE-NEXT: movaps %xmm0, %xmm2
; SSE-NEXT: mulss %xmm1, %xmm2
; SSE-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE-NEXT: mulss %xmm2, %xmm3
; SSE-NEXT: mulss %xmm1, %xmm2
; SSE-NEXT: addss {{.*}}(%rip), %xmm2
; SSE-NEXT: andps {{.*}}(%rip), %xmm0
; SSE-NEXT: mulss %xmm3, %xmm2
; SSE-NEXT: cmpltss {{.*}}(%rip), %xmm0
; SSE-NEXT: andnps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: finite_f32_estimate_ieee_ninf:
; AVX1: # %bb.0:
; AVX1-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1
; AVX1-NEXT: vmulss %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmulss %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vmulss {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vmulss %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vcmpltss {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vandnps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: finite_f32_estimate_ieee_ninf:
; AVX512: # %bb.0:
; AVX512-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm2
; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm1 = (xmm2 * xmm1) + mem
; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm2, %xmm2
; AVX512-NEXT: vmulss %xmm1, %xmm2, %xmm1
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
; AVX512-NEXT: vandps %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vcmpltss {{.*}}(%rip), %xmm0, %k1
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
; AVX512-NEXT: vmovaps %xmm1, %xmm0
; AVX512-NEXT: retq
  %call = tail call ninf float @__sqrtf_finite(float %f) #2
  ret float %call
}

define float @finite_f32_estimate_daz(float %f) #4 {
; SSE-LABEL: finite_f32_estimate_daz:
; SSE: # %bb.0:
; SSE-NEXT: sqrtss %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: finite_f32_estimate_daz:
; AVX: # %bb.0:
; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %call = tail call float @__sqrtf_finite(float %f) #2
  ret float %call
}

define float @finite_f32_estimate_daz_ninf(float %f) #4 {
; SSE-LABEL: finite_f32_estimate_daz_ninf:
; SSE: # %bb.0:
; SSE-NEXT: rsqrtss %xmm0, %xmm1
; SSE-NEXT: movaps %xmm0, %xmm2
; SSE-NEXT: mulss %xmm1, %xmm2
; SSE-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE-NEXT: mulss %xmm2, %xmm3
; SSE-NEXT: mulss %xmm1, %xmm2
; SSE-NEXT: addss {{.*}}(%rip), %xmm2
; SSE-NEXT: mulss %xmm3, %xmm2
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cmpeqss %xmm1, %xmm0
; SSE-NEXT: andnps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: finite_f32_estimate_daz_ninf:
; AVX1: # %bb.0:
; AVX1-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1
; AVX1-NEXT: vmulss %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmulss %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vmulss {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vmulss %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vcmpeqss %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vandnps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: finite_f32_estimate_daz_ninf:
; AVX512: # %bb.0:
; AVX512-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm2
; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm1 = (xmm2 * xmm1) + mem
; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm2, %xmm2
; AVX512-NEXT: vmulss %xmm1, %xmm2, %xmm1
; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vcmpeqss %xmm2, %xmm0, %k1
; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
; AVX512-NEXT: vmovaps %xmm1, %xmm0
; AVX512-NEXT: retq
  %call = tail call ninf float @__sqrtf_finite(float %f) #2
  ret float %call
}

define x86_fp80 @finite_f80_no_estimate(x86_fp80 %ld) #0 {
; CHECK-LABEL: finite_f80_no_estimate:
; CHECK: # %bb.0:
; CHECK-NEXT: fldt {{[0-9]+}}(%rsp)
; CHECK-NEXT: fsqrt
; CHECK-NEXT: retq
  %call = tail call x86_fp80 @__sqrtl_finite(x86_fp80 %ld) #2
  ret x86_fp80 %call
}

; Don't die on the impossible.

define x86_fp80 @finite_f80_estimate_but_no(x86_fp80 %ld) #1 {
; CHECK-LABEL: finite_f80_estimate_but_no:
; CHECK: # %bb.0:
; CHECK-NEXT: fldt {{[0-9]+}}(%rsp)
; CHECK-NEXT: fsqrt
; CHECK-NEXT: retq
  %call = tail call x86_fp80 @__sqrtl_finite(x86_fp80 %ld) #2
  ret x86_fp80 %call
}

; PR34994 - https://bugs.llvm.org/show_bug.cgi?id=34994
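; These tests use "denormal-fp-math"="ieee": denormal inputs are not
; flushed to zero, so the zero-guard on the estimate compares |x| against
; the smallest normal float (1.17549435E-38) instead of comparing x
; against 0.0.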

define float @sqrtf_check_denorms(float %x) #3 {
; SSE-LABEL: sqrtf_check_denorms:
; SSE: # %bb.0:
; SSE-NEXT: sqrtss %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sqrtf_check_denorms:
; AVX: # %bb.0:
; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %call = tail call float @__sqrtf_finite(float %x) #2
  ret float %call
}

define float @sqrtf_check_denorms_ninf(float %x) #3 {
; SSE-LABEL: sqrtf_check_denorms_ninf:
; SSE: # %bb.0:
; SSE-NEXT: rsqrtss %xmm0, %xmm1
; SSE-NEXT: movaps %xmm0, %xmm2
; SSE-NEXT: mulss %xmm1, %xmm2
; SSE-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE-NEXT: mulss %xmm2, %xmm3
; SSE-NEXT: mulss %xmm1, %xmm2
; SSE-NEXT: addss {{.*}}(%rip), %xmm2
; SSE-NEXT: andps {{.*}}(%rip), %xmm0
; SSE-NEXT: mulss %xmm3, %xmm2
; SSE-NEXT: cmpltss {{.*}}(%rip), %xmm0
; SSE-NEXT: andnps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: sqrtf_check_denorms_ninf:
; AVX1: # %bb.0:
; AVX1-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1
; AVX1-NEXT: vmulss %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmulss %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vmulss {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vmulss %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vcmpltss {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vandnps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: sqrtf_check_denorms_ninf:
; AVX512: # %bb.0:
; AVX512-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm2
; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm1 = (xmm2 * xmm1) + mem
; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm2, %xmm2
; AVX512-NEXT: vmulss %xmm1, %xmm2, %xmm1
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
; AVX512-NEXT: vandps %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vcmpltss {{.*}}(%rip), %xmm0, %k1
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
; AVX512-NEXT: vmovaps %xmm1, %xmm0
; AVX512-NEXT: retq
  %call = tail call ninf float @__sqrtf_finite(float %x) #2
  ret float %call
}

define <4 x float> @sqrt_v4f32_check_denorms(<4 x float> %x) #3 {
; SSE-LABEL: sqrt_v4f32_check_denorms:
; SSE: # %bb.0:
; SSE-NEXT: sqrtps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sqrt_v4f32_check_denorms:
; AVX: # %bb.0:
; AVX-NEXT: vsqrtps %xmm0, %xmm0
; AVX-NEXT: retq
  %call = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) #2
  ret <4 x float> %call
}

define <4 x float> @sqrt_v4f32_check_denorms_ninf(<4 x float> %x) #3 {
; SSE-LABEL: sqrt_v4f32_check_denorms_ninf:
; SSE: # %bb.0:
; SSE-NEXT: rsqrtps %xmm0, %xmm2
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: mulps %xmm2, %xmm1
; SSE-NEXT: movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; SSE-NEXT: mulps %xmm1, %xmm3
; SSE-NEXT: mulps %xmm2, %xmm1
; SSE-NEXT: addps {{.*}}(%rip), %xmm1
; SSE-NEXT: andps {{.*}}(%rip), %xmm0
; SSE-NEXT: mulps %xmm3, %xmm1
; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; SSE-NEXT: cmpleps %xmm0, %xmm2
; SSE-NEXT: andps %xmm2, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: sqrt_v4f32_check_denorms_ninf:
; AVX1: # %bb.0:
; AVX1-NEXT: vrsqrtps %xmm0, %xmm1
; AVX1-NEXT: vmulps %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmulps {{.*}}(%rip), %xmm2, %xmm3
; AVX1-NEXT: vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vaddps {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vmulps %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX1-NEXT: vcmpleps %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: sqrt_v4f32_check_denorms_ninf:
; AVX512: # %bb.0:
; AVX512-NEXT: vrsqrtps %xmm0, %xmm1
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm2
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT: vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT: vmulps %xmm1, %xmm2, %xmm1
; AVX512-NEXT: vmulps %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
; AVX512-NEXT: vandps %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX512-NEXT: vcmpleps %xmm0, %xmm2, %xmm0
; AVX512-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %call = tail call ninf <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) #2
  ret <4 x float> %call
}

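; The tests below check the 1.0 / sqrt(x) pattern built from the llvm.sqrt
; intrinsic: with estimates disabled (attribute #0) it stays sqrt + div,
; and with estimates enabled (attribute #1) it becomes rsqrt plus one
; Newton-Raphson step.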
define float @f32_no_estimate(float %x) #0 {
; SSE-LABEL: f32_no_estimate:
; SSE: # %bb.0:
; SSE-NEXT: sqrtss %xmm0, %xmm1
; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: divss %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: f32_no_estimate:
; AVX: # %bb.0:
; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
  %sqrt = tail call float @llvm.sqrt.f32(float %x)
  %div = fdiv fast float 1.0, %sqrt
  ret float %div
}

define float @f32_estimate(float %x) #1 {
; SSE-LABEL: f32_estimate:
; SSE: # %bb.0:
; SSE-NEXT: rsqrtss %xmm0, %xmm1
; SSE-NEXT: mulss %xmm1, %xmm0
; SSE-NEXT: mulss %xmm1, %xmm0
; SSE-NEXT: addss {{.*}}(%rip), %xmm0
; SSE-NEXT: mulss {{.*}}(%rip), %xmm1
; SSE-NEXT: mulss %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: f32_estimate:
; AVX1: # %bb.0:
; AVX1-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1
; AVX1-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vmulss {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: f32_estimate:
; AVX512: # %bb.0:
; AVX512-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + mem
; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm1, %xmm1
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
  %sqrt = tail call float @llvm.sqrt.f32(float %x)
  %div = fdiv fast float 1.0, %sqrt
  ret float %div
}

define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 {
; SSE-LABEL: v4f32_no_estimate:
; SSE: # %bb.0:
; SSE-NEXT: sqrtps %xmm0, %xmm1
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT: divps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: v4f32_no_estimate:
; AVX1: # %bb.0:
; AVX1-NEXT: vsqrtps %xmm0, %xmm0
; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX1-NEXT: vdivps %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: v4f32_no_estimate:
; AVX512: # %bb.0:
; AVX512-NEXT: vsqrtps %xmm0, %xmm0
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX512-NEXT: vdivps %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
  %sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <4 x float> %div
}

define <4 x float> @v4f32_estimate(<4 x float> %x) #1 {
; SSE-LABEL: v4f32_estimate:
; SSE: # %bb.0:
; SSE-NEXT: rsqrtps %xmm0, %xmm1
; SSE-NEXT: mulps %xmm1, %xmm0
; SSE-NEXT: mulps %xmm1, %xmm0
; SSE-NEXT: addps {{.*}}(%rip), %xmm0
; SSE-NEXT: mulps {{.*}}(%rip), %xmm1
; SSE-NEXT: mulps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: v4f32_estimate:
; AVX1: # %bb.0:
; AVX1-NEXT: vrsqrtps %xmm0, %xmm1
; AVX1-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vmulps {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vmulps %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: v4f32_estimate:
; AVX512: # %bb.0:
; AVX512-NEXT: vrsqrtps %xmm0, %xmm1
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT: vfmadd231ps {{.*#+}} xmm2 = (xmm1 * xmm0) + xmm2
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm0 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vmulps %xmm2, %xmm0, %xmm0
; AVX512-NEXT: retq
  %sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <4 x float> %div
}

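; 256-bit ops are split into two 128-bit sequences for SSE.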
define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
; SSE-LABEL: v8f32_no_estimate:
; SSE: # %bb.0:
; SSE-NEXT: sqrtps %xmm1, %xmm2
; SSE-NEXT: sqrtps %xmm0, %xmm3
; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: divps %xmm3, %xmm0
; SSE-NEXT: divps %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: v8f32_no_estimate:
; AVX1: # %bb.0:
; AVX1-NEXT: vsqrtps %ymm0, %ymm0
; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX1-NEXT: vdivps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: v8f32_no_estimate:
; AVX512: # %bb.0:
; AVX512-NEXT: vsqrtps %ymm0, %ymm0
; AVX512-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX512-NEXT: vdivps %ymm0, %ymm1, %ymm0
; AVX512-NEXT: retq
  %sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x)
  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <8 x float> %div
}

define <8 x float> @v8f32_estimate(<8 x float> %x) #1 {
; SSE-LABEL: v8f32_estimate:
; SSE: # %bb.0:
; SSE-NEXT: rsqrtps %xmm0, %xmm2
; SSE-NEXT: movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; SSE-NEXT: mulps %xmm2, %xmm0
; SSE-NEXT: mulps %xmm2, %xmm0
; SSE-NEXT: mulps %xmm3, %xmm2
; SSE-NEXT: movaps {{.*#+}} xmm4 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; SSE-NEXT: addps %xmm4, %xmm0
; SSE-NEXT: mulps %xmm2, %xmm0
; SSE-NEXT: rsqrtps %xmm1, %xmm2
; SSE-NEXT: mulps %xmm2, %xmm3
; SSE-NEXT: mulps %xmm2, %xmm1
; SSE-NEXT: mulps %xmm2, %xmm1
; SSE-NEXT: addps %xmm4, %xmm1
; SSE-NEXT: mulps %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: v8f32_estimate:
; AVX1: # %bb.0:
; AVX1-NEXT: vrsqrtps %ymm0, %ymm1
; AVX1-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vaddps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1
; AVX1-NEXT: vmulps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: v8f32_estimate:
; AVX512: # %bb.0:
; AVX512-NEXT: vrsqrtps %ymm0, %ymm1
; AVX512-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vbroadcastss {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT: vfmadd231ps {{.*#+}} ymm2 = (ymm1 * ymm0) + ymm2
; AVX512-NEXT: vbroadcastss {{.*#+}} ymm0 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT: vmulps %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vmulps %ymm2, %ymm0, %ymm0
; AVX512-NEXT: retq
  %sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x)
  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <8 x float> %div
}

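; v16f32: SSE splits into four 128-bit sequences and AVX1 into two 256-bit
; sequences; AVX512 operates on the full zmm register, using vrsqrt14ps
; for the estimate.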
define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 {
; SSE-LABEL: v16f32_no_estimate:
; SSE: # %bb.0:
; SSE-NEXT: sqrtps %xmm3, %xmm4
; SSE-NEXT: sqrtps %xmm2, %xmm5
; SSE-NEXT: sqrtps %xmm1, %xmm2
; SSE-NEXT: sqrtps %xmm0, %xmm1
; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT: movaps %xmm3, %xmm0
; SSE-NEXT: divps %xmm1, %xmm0
; SSE-NEXT: movaps %xmm3, %xmm1
; SSE-NEXT: divps %xmm2, %xmm1
; SSE-NEXT: movaps %xmm3, %xmm2
; SSE-NEXT: divps %xmm5, %xmm2
; SSE-NEXT: divps %xmm4, %xmm3
; SSE-NEXT: retq
;
; AVX1-LABEL: v16f32_no_estimate:
; AVX1: # %bb.0:
; AVX1-NEXT: vsqrtps %ymm1, %ymm1
; AVX1-NEXT: vsqrtps %ymm0, %ymm0
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX1-NEXT: vdivps %ymm0, %ymm2, %ymm0
; AVX1-NEXT: vdivps %ymm1, %ymm2, %ymm1
; AVX1-NEXT: retq
;
; AVX512-LABEL: v16f32_no_estimate:
; AVX512: # %bb.0:
; AVX512-NEXT: vsqrtps %zmm0, %zmm0
; AVX512-NEXT: vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX512-NEXT: vdivps %zmm0, %zmm1, %zmm0
; AVX512-NEXT: retq
  %sqrt = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %x)
  %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <16 x float> %div
}

define <16 x float> @v16f32_estimate(<16 x float> %x) #1 {
; SSE-LABEL: v16f32_estimate:
; SSE: # %bb.0:
; SSE-NEXT: rsqrtps %xmm0, %xmm5
; SSE-NEXT: movaps {{.*#+}} xmm4 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; SSE-NEXT: mulps %xmm5, %xmm0
; SSE-NEXT: mulps %xmm5, %xmm0
; SSE-NEXT: movaps %xmm5, %xmm6
; SSE-NEXT: mulps %xmm4, %xmm6
; SSE-NEXT: movaps {{.*#+}} xmm5 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; SSE-NEXT: addps %xmm5, %xmm0
; SSE-NEXT: mulps %xmm6, %xmm0
; SSE-NEXT: rsqrtps %xmm1, %xmm6
; SSE-NEXT: mulps %xmm6, %xmm1
; SSE-NEXT: mulps %xmm6, %xmm1
; SSE-NEXT: mulps %xmm4, %xmm6
; SSE-NEXT: addps %xmm5, %xmm1
; SSE-NEXT: mulps %xmm6, %xmm1
; SSE-NEXT: rsqrtps %xmm2, %xmm6
; SSE-NEXT: mulps %xmm6, %xmm2
; SSE-NEXT: mulps %xmm6, %xmm2
; SSE-NEXT: mulps %xmm4, %xmm6
; SSE-NEXT: addps %xmm5, %xmm2
; SSE-NEXT: mulps %xmm6, %xmm2
; SSE-NEXT: rsqrtps %xmm3, %xmm6
; SSE-NEXT: mulps %xmm6, %xmm4
; SSE-NEXT: mulps %xmm6, %xmm3
; SSE-NEXT: mulps %xmm6, %xmm3
; SSE-NEXT: addps %xmm5, %xmm3
; SSE-NEXT: mulps %xmm4, %xmm3
; SSE-NEXT: retq
;
; AVX1-LABEL: v16f32_estimate:
; AVX1: # %bb.0:
; AVX1-NEXT: vrsqrtps %ymm0, %ymm2
; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX1-NEXT: vmulps %ymm3, %ymm2, %ymm4
; AVX1-NEXT: vmulps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vmulps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX1-NEXT: vaddps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vmulps %ymm0, %ymm4, %ymm0
; AVX1-NEXT: vrsqrtps %ymm1, %ymm4
; AVX1-NEXT: vmulps %ymm3, %ymm4, %ymm3
; AVX1-NEXT: vmulps %ymm4, %ymm1, %ymm1
; AVX1-NEXT: vmulps %ymm4, %ymm1, %ymm1
; AVX1-NEXT: vaddps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vmulps %ymm1, %ymm3, %ymm1
; AVX1-NEXT: retq
;
; AVX512-LABEL: v16f32_estimate:
; AVX512: # %bb.0:
; AVX512-NEXT: vrsqrt14ps %zmm0, %zmm1
; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + mem
; AVX512-NEXT: vmulps {{.*}}(%rip){1to16}, %zmm1, %zmm1
; AVX512-NEXT: vmulps %zmm0, %zmm1, %zmm0
; AVX512-NEXT: retq
  %sqrt = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %x)
  %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <16 x float> %div
}

; x / (fabs(y) * sqrt(z)) --> x * rsqrt(y*y*z)
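; This holds because fabs(y) == sqrt(y*y), so the whole divisor is
; sqrt(y*y*z) and folds into a single rsqrt estimate.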

define float @div_sqrt_fabs_f32(float %x, float %y, float %z) {
; SSE-LABEL: div_sqrt_fabs_f32:
; SSE: # %bb.0:
; SSE-NEXT: mulss %xmm1, %xmm1
; SSE-NEXT: mulss %xmm2, %xmm1
; SSE-NEXT: xorps %xmm2, %xmm2
; SSE-NEXT: rsqrtss %xmm1, %xmm2
; SSE-NEXT: mulss %xmm2, %xmm1
; SSE-NEXT: mulss %xmm2, %xmm1
; SSE-NEXT: addss {{.*}}(%rip), %xmm1
; SSE-NEXT: mulss {{.*}}(%rip), %xmm2
; SSE-NEXT: mulss %xmm0, %xmm2
; SSE-NEXT: mulss %xmm1, %xmm2
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: div_sqrt_fabs_f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmulss %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vrsqrtss %xmm1, %xmm1, %xmm2
; AVX1-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vmulss {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vmulss %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: div_sqrt_fabs_f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmulss %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vrsqrtss %xmm1, %xmm1, %xmm2
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm1 = (xmm2 * xmm1) + mem
; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm2, %xmm2
; AVX512-NEXT: vmulss %xmm0, %xmm2, %xmm0
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
  %s = call fast float @llvm.sqrt.f32(float %z)
  %a = call fast float @llvm.fabs.f32(float %y)
  %m = fmul fast float %s, %a
  %d = fdiv fast float %x, %m
  ret float %d
}

; x / (fabs(y) * sqrt(z)) --> x * rsqrt(y*y*z)
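; The vector form only needs 'reassoc' on the fmul and 'reassoc arcp' on
; the fdiv (rather than the full 'fast' set) to enable the transform.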

define <4 x float> @div_sqrt_fabs_v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
; SSE-LABEL: div_sqrt_fabs_v4f32:
; SSE: # %bb.0:
; SSE-NEXT: mulps %xmm1, %xmm1
; SSE-NEXT: mulps %xmm2, %xmm1
; SSE-NEXT: rsqrtps %xmm1, %xmm2
; SSE-NEXT: mulps %xmm2, %xmm1
; SSE-NEXT: mulps %xmm2, %xmm1
; SSE-NEXT: addps {{.*}}(%rip), %xmm1
; SSE-NEXT: mulps {{.*}}(%rip), %xmm2
; SSE-NEXT: mulps %xmm1, %xmm2
; SSE-NEXT: mulps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: div_sqrt_fabs_v4f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmulps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vrsqrtps %xmm1, %xmm2
; AVX1-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vaddps {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vmulps {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: div_sqrt_fabs_v4f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmulps %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vrsqrtps %xmm1, %xmm2
; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT: vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT: vmulps %xmm1, %xmm2, %xmm1
; AVX512-NEXT: vmulps %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %s = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %z)
  %a = call <4 x float> @llvm.fabs.v4f32(<4 x float> %y)
  %m = fmul reassoc <4 x float> %a, %s
  %d = fdiv reassoc arcp <4 x float> %x, %m
  ret <4 x float> %d
}

; This has 'arcp' but does not have 'reassoc' FMF.
; We allow converting the sqrt to an estimate, but
; do not pull the divisor into the estimate.
; x / (fabs(y) * sqrt(z)) --> x * rsqrt(z) / fabs(y)

define <4 x float> @div_sqrt_fabs_v4f32_fmf(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
; SSE-LABEL: div_sqrt_fabs_v4f32_fmf:
; SSE: # %bb.0:
; SSE-NEXT: rsqrtps %xmm2, %xmm3
; SSE-NEXT: mulps %xmm3, %xmm2
; SSE-NEXT: mulps %xmm3, %xmm2
; SSE-NEXT: addps {{.*}}(%rip), %xmm2
; SSE-NEXT: mulps {{.*}}(%rip), %xmm3
; SSE-NEXT: mulps %xmm2, %xmm3
; SSE-NEXT: andps {{.*}}(%rip), %xmm1
; SSE-NEXT: divps %xmm1, %xmm3
; SSE-NEXT: mulps %xmm3, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: div_sqrt_fabs_v4f32_fmf:
; AVX1: # %bb.0:
; AVX1-NEXT: vrsqrtps %xmm2, %xmm3
; AVX1-NEXT: vmulps %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vmulps %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vaddps {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vmulps {{.*}}(%rip), %xmm3, %xmm3
; AVX1-NEXT: vmulps %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vdivps %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: div_sqrt_fabs_v4f32_fmf:
; AVX512: # %bb.0:
; AVX512-NEXT: vrsqrtps %xmm2, %xmm3
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm4 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT: vmulps %xmm4, %xmm3, %xmm4
; AVX512-NEXT: vmulps %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vmulps %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT: vaddps %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm3 = [NaN,NaN,NaN,NaN]
; AVX512-NEXT: vmulps %xmm2, %xmm4, %xmm2
; AVX512-NEXT: vandps %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vdivps %xmm1, %xmm2, %xmm1
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %s = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %z)
  %a = call <4 x float> @llvm.fabs.v4f32(<4 x float> %y)
  %m = fmul <4 x float> %a, %s
  %d = fdiv arcp <4 x float> %x, %m
  ret <4 x float> %d
}

; No estimates for f64, so do not convert fabs into an fmul.

define double @div_sqrt_fabs_f64(double %x, double %y, double %z) {
; SSE-LABEL: div_sqrt_fabs_f64:
; SSE: # %bb.0:
; SSE-NEXT: sqrtsd %xmm2, %xmm2
; SSE-NEXT: andpd {{.*}}(%rip), %xmm1
; SSE-NEXT: mulsd %xmm2, %xmm1
; SSE-NEXT: divsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: div_sqrt_fabs_f64:
; AVX: # %bb.0:
; AVX-NEXT: vsqrtsd %xmm2, %xmm2, %xmm2
; AVX-NEXT: vandpd {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vmulsd %xmm1, %xmm2, %xmm1
; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %s = call fast double @llvm.sqrt.f64(double %z)
  %a = call fast double @llvm.fabs.f64(double %y)
  %m = fmul fast double %s, %a
  %d = fdiv fast double %x, %m
  ret double %d
}

; This is a special case for the general pattern above -
; if the sqrt operand is the same as the other mul op,
; then fabs may be omitted.
; x / (y * sqrt(y)) --> x * rsqrt(y*y*y)

define float @div_sqrt_f32(float %x, float %y) {
; SSE-LABEL: div_sqrt_f32:
; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm1, %xmm2
; SSE-NEXT: mulss %xmm1, %xmm2
; SSE-NEXT: mulss %xmm1, %xmm2
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: rsqrtss %xmm2, %xmm1
; SSE-NEXT: mulss %xmm1, %xmm2
; SSE-NEXT: mulss %xmm1, %xmm2
; SSE-NEXT: addss {{.*}}(%rip), %xmm2
; SSE-NEXT: mulss {{.*}}(%rip), %xmm1
; SSE-NEXT: mulss %xmm0, %xmm1
; SSE-NEXT: mulss %xmm2, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: div_sqrt_f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmulss %xmm1, %xmm1, %xmm2
; AVX1-NEXT: vmulss %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vrsqrtss %xmm1, %xmm1, %xmm2
; AVX1-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vmulss {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vmulss %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: div_sqrt_f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmulss %xmm1, %xmm1, %xmm2
; AVX512-NEXT: vmulss %xmm1, %xmm2, %xmm1
; AVX512-NEXT: vrsqrtss %xmm1, %xmm1, %xmm2
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm1 = (xmm2 * xmm1) + mem
; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm2, %xmm2
; AVX512-NEXT: vmulss %xmm0, %xmm2, %xmm0
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
  %s = call fast float @llvm.sqrt.f32(float %y)
  %m = fmul fast float %s, %y
  %d = fdiv fast float %x, %m
  ret float %d
}

; This is a special case for the general pattern above -
; if the sqrt operand is the same as the other mul op,
; then fabs may be omitted.
; x / (y * sqrt(y)) --> x * rsqrt(y*y*y)

define <4 x float> @div_sqrt_v4f32(<4 x float> %x, <4 x float> %y) {
; SSE-LABEL: div_sqrt_v4f32:
; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm1, %xmm2
; SSE-NEXT: mulps %xmm1, %xmm2
; SSE-NEXT: mulps %xmm1, %xmm2
; SSE-NEXT: rsqrtps %xmm2, %xmm1
; SSE-NEXT: mulps %xmm1, %xmm2
; SSE-NEXT: mulps %xmm1, %xmm2
; SSE-NEXT: addps {{.*}}(%rip), %xmm2
; SSE-NEXT: mulps {{.*}}(%rip), %xmm1
; SSE-NEXT: mulps %xmm2, %xmm1
; SSE-NEXT: mulps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: div_sqrt_v4f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmulps %xmm1, %xmm1, %xmm2
; AVX1-NEXT: vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vrsqrtps %xmm1, %xmm2
; AVX1-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vaddps {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vmulps {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: div_sqrt_v4f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmulps %xmm1, %xmm1, %xmm2
; AVX512-NEXT: vmulps %xmm1, %xmm2, %xmm1
; AVX512-NEXT: vrsqrtps %xmm1, %xmm2
; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT: vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT: vmulps %xmm1, %xmm2, %xmm1
; AVX512-NEXT: vmulps %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %s = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %y)
  %m = fmul reassoc <4 x float> %y, %s
  %d = fdiv reassoc arcp <4 x float> %x, %m
  ret <4 x float> %d
}

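; x / sqrt(x) --> sqrt(x) when the fast-math flags allow it, so only a
; square root is emitted.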
define double @sqrt_fdiv_common_operand(double %x) nounwind {
; SSE-LABEL: sqrt_fdiv_common_operand:
; SSE: # %bb.0:
; SSE-NEXT: sqrtsd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sqrt_fdiv_common_operand:
; AVX: # %bb.0:
; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %sqrt = call fast double @llvm.sqrt.f64(double %x)
  %r = fdiv fast double %x, %sqrt
  ret double %r
}

define <2 x double> @sqrt_fdiv_common_operand_vec(<2 x double> %x) nounwind {
; SSE-LABEL: sqrt_fdiv_common_operand_vec:
; SSE: # %bb.0:
; SSE-NEXT: sqrtpd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sqrt_fdiv_common_operand_vec:
; AVX: # %bb.0:
; AVX-NEXT: vsqrtpd %xmm0, %xmm0
; AVX-NEXT: retq
  %sqrt = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
  %r = fdiv arcp nsz reassoc <2 x double> %x, %sqrt
  ret <2 x double> %r
}

define double @sqrt_fdiv_common_operand_extra_use(double %x, double* %p) nounwind {
; SSE-LABEL: sqrt_fdiv_common_operand_extra_use:
; SSE: # %bb.0:
; SSE-NEXT: sqrtsd %xmm0, %xmm0
; SSE-NEXT: movsd %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: sqrt_fdiv_common_operand_extra_use:
; AVX: # %bb.0:
; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovsd %xmm0, (%rdi)
; AVX-NEXT: retq
  %sqrt = call fast double @llvm.sqrt.f64(double %x)
  store double %sqrt, double* %p
  %r = fdiv fast double %x, %sqrt
  ret double %r
}

define double @sqrt_simplify_before_recip(double %x, double* %p) nounwind {
; SSE-LABEL: sqrt_simplify_before_recip:
; SSE: # %bb.0:
; SSE-NEXT: sqrtsd %xmm0, %xmm0
; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT: divsd %xmm0, %xmm1
; SSE-NEXT: movsd %xmm1, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: sqrt_simplify_before_recip:
; AVX: # %bb.0:
; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vdivsd %xmm0, %xmm1, %xmm1
; AVX-NEXT: vmovsd %xmm1, (%rdi)
; AVX-NEXT: retq
  %sqrt = tail call fast double @llvm.sqrt.f64(double %x)
  %rsqrt = fdiv fast double 1.0, %sqrt
  %sqrt_fast = fdiv fast double %x, %sqrt
  store double %rsqrt, double* %p, align 8
  ret double %sqrt_fast
}

define <2 x double> @sqrt_simplify_before_recip_vec(<2 x double> %x, <2 x double>* %p) nounwind {
; SSE-LABEL: sqrt_simplify_before_recip_vec:
; SSE: # %bb.0:
; SSE-NEXT: sqrtpd %xmm0, %xmm0
; SSE-NEXT: movapd {{.*#+}} xmm1 = [1.0E+0,1.0E+0]
; SSE-NEXT: divpd %xmm0, %xmm1
; SSE-NEXT: movupd %xmm1, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: sqrt_simplify_before_recip_vec:
; AVX: # %bb.0:
; AVX-NEXT: vsqrtpd %xmm0, %xmm0
; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [1.0E+0,1.0E+0]
; AVX-NEXT: vdivpd %xmm0, %xmm1, %xmm1
; AVX-NEXT: vmovupd %xmm1, (%rdi)
; AVX-NEXT: retq
  %sqrt = tail call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
  %rsqrt = fdiv fast <2 x double> <double 1.0, double 1.0>, %sqrt
  %sqrt_fast = fdiv fast <2 x double> %x, %sqrt
  store <2 x double> %rsqrt, <2 x double>* %p, align 8
  ret <2 x double> %sqrt_fast
}

define double @sqrt_simplify_before_recip_order(double %x, double* %p) nounwind {
; SSE-LABEL: sqrt_simplify_before_recip_order:
; SSE: # %bb.0:
; SSE-NEXT: sqrtsd %xmm0, %xmm0
; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT: divsd %xmm0, %xmm1
; SSE-NEXT: movsd %xmm1, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: sqrt_simplify_before_recip_order:
; AVX: # %bb.0:
; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vdivsd %xmm0, %xmm1, %xmm1
; AVX-NEXT: vmovsd %xmm1, (%rdi)
; AVX-NEXT: retq
  %sqrt = tail call fast double @llvm.sqrt.f64(double %x)
  %sqrt_fast = fdiv fast double %x, %sqrt
  %rsqrt = fdiv fast double 42.0, %sqrt
  store double %rsqrt, double* %p, align 8
  ret double %sqrt_fast
}

attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!sqrtf,!vec-sqrtf,!divf,!vec-divf" }
attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" }
attributes #2 = { nounwind readnone }
attributes #3 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="ieee" }
attributes #4 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="ieee,preserve-sign" }