; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512

; These tests are identical to corresponding tests in the 'nnan' versions
; of the files except that they use 'fast' FMF. If things are working as
; expected, the 'nnan' codegen should be the same as 'fast'.

;
; vXf32
;

define float @test_v2f32(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    minss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    minss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vminss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vminss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmin.v2f32(<2 x float> %a0)
  ret float %1
}

define float @test_v4f32(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    maxps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    maxss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT:    maxps %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    maxss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a0)
  ret float %1
}

define float @test_v8f32(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    minps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    minps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    minss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    minps %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT:    minps %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    minss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vminps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vminps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vminss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vminps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vminps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vminss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> %a0)
  ret float %1
}

define float @test_v16f32(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    maxps %xmm3, %xmm1
; SSE2-NEXT:    maxps %xmm2, %xmm0
; SSE2-NEXT:    maxps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    maxps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    maxss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    maxps %xmm3, %xmm1
; SSE41-NEXT:    maxps %xmm2, %xmm0
; SSE41-NEXT:    maxps %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT:    maxps %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    maxss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmaxps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vmaxps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast float @llvm.vector.reduce.fmax.v16f32(<16 x float> %a0)
  ret float %1
}

;
; vXf64
;

define double @test_v2f64(<2 x double> %a0) {
; SSE-LABEL: test_v2f64:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    minsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vminsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vminsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a0)
  ret double %1
}

define double @test_v4f64(<4 x double> %a0) {
; SSE-LABEL: test_v4f64:
; SSE:       # %bb.0:
; SSE-NEXT:    maxpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    maxsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> %a0)
  ret double %1
}

define double @test_v8f64(<8 x double> %a0) {
; SSE-LABEL: test_v8f64:
; SSE:       # %bb.0:
; SSE-NEXT:    minpd %xmm3, %xmm1
; SSE-NEXT:    minpd %xmm2, %xmm0
; SSE-NEXT:    minpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    minsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vminpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vminpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vminsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vminpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vminpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vminsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmin.v8f64(<8 x double> %a0)
  ret double %1
}

define double @test_v16f64(<16 x double> %a0) {
; SSE-LABEL: test_v16f64:
; SSE:       # %bb.0:
; SSE-NEXT:    maxpd %xmm6, %xmm2
; SSE-NEXT:    maxpd %xmm4, %xmm0
; SSE-NEXT:    maxpd %xmm2, %xmm0
; SSE-NEXT:    maxpd %xmm7, %xmm3
; SSE-NEXT:    maxpd %xmm5, %xmm1
; SSE-NEXT:    maxpd %xmm3, %xmm1
; SSE-NEXT:    maxpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    maxsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmaxpd %ymm3, %ymm1, %ymm1
; AVX-NEXT:    vmaxpd %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vmaxpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmaxpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vmaxpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.vector.reduce.fmax.v16f64(<16 x double> %a0)
  ret double %1
}

declare float @llvm.vector.reduce.fmin.v2f32(<2 x float>)
declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>)
declare float @llvm.vector.reduce.fmax.v16f32(<16 x float>)

declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>)
declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>)
declare double @llvm.vector.reduce.fmin.v8f64(<8 x double>)
declare double @llvm.vector.reduce.fmax.v16f64(<16 x double>)
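
; For reference, the 'nnan' counterparts mentioned in the header comment
; differ only in the fast-math flags on the reduction call: 'fast' is
; replaced by 'nnan'. A minimal sketch of one such call (illustrative
; only; the checked-in 'nnan' tests live in a separate file):
;   %1 = call nnan float @llvm.vector.reduce.fmin.v2f32(<2 x float> %a0)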