; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512

;
; vXf32
;

define float @test_v2f32(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    maxss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    maxss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call nnan float @llvm.vector.reduce.fmax.v2f32(<2 x float> %a0)
  ret float %1
}

define float @test_v4f32(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE2-NEXT:    movaps %xmm0, %xmm3
; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1]
; SSE2-NEXT:    maxss %xmm3, %xmm0
; SSE2-NEXT:    maxss %xmm2, %xmm0
; SSE2-NEXT:    maxss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
; SSE41-NEXT:    movaps %xmm0, %xmm2
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE41-NEXT:    movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE41-NEXT:    maxss %xmm3, %xmm0
; SSE41-NEXT:    maxss %xmm2, %xmm0
; SSE41-NEXT:    maxss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX-NEXT:    vmaxss %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vmaxss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmaxss %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call nnan float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a0)
  ret float %1
}

define float @test_v8f32(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    maxps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1]
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    maxss %xmm2, %xmm1
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE2-NEXT:    maxss %xmm2, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    maxss %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    maxps %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    maxss %xmm2, %xmm1
; SSE41-NEXT:    movaps %xmm0, %xmm2
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE41-NEXT:    maxss %xmm2, %xmm1
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE41-NEXT:    maxss %xmm0, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; AVX-NEXT:    vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3]
; AVX-NEXT:    vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
; AVX-NEXT:    vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3]
; AVX-NEXT:    vmaxss %xmm7, %xmm0, %xmm0
; AVX-NEXT:    vmaxss %xmm6, %xmm0, %xmm0
; AVX-NEXT:    vmaxss %xmm5, %xmm0, %xmm0
; AVX-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmaxss %xmm4, %xmm0, %xmm0
; AVX-NEXT:    vmaxss %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vmaxss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; AVX512-NEXT:    vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3]
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmaxss %xmm7, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm6, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm5, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm4, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call nnan float @llvm.vector.reduce.fmax.v8f32(<8 x float> %a0)
  ret float %1
}

define float @test_v16f32(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    maxps %xmm3, %xmm1
; SSE2-NEXT:    maxps %xmm2, %xmm0
; SSE2-NEXT:    maxps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1]
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    maxss %xmm2, %xmm1
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE2-NEXT:    maxss %xmm2, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    maxss %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    maxps %xmm3, %xmm1
; SSE41-NEXT:    maxps %xmm2, %xmm0
; SSE41-NEXT:    maxps %xmm1, %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    maxss %xmm2, %xmm1
; SSE41-NEXT:    movaps %xmm0, %xmm2
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE41-NEXT:    maxss %xmm2, %xmm1
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE41-NEXT:    maxss %xmm0, %xmm1
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmaxps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vmaxss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX-NEXT:    vmaxss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vmaxss %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX-NEXT:    vmaxss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vmaxss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT:    vmaxss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm1
; AVX512-NEXT:    vpermilps {{.*#+}} xmm8 = xmm1[3,3,3,3]
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm9 = xmm1[1,0]
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm10 = xmm1[1,1,3,3]
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm5
; AVX512-NEXT:    vpermilps {{.*#+}} xmm11 = xmm5[3,3,3,3]
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm12 = xmm5[1,0]
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm13 = xmm5[1,1,3,3]
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX512-NEXT:    vpermilps {{.*#+}} xmm14 = xmm3[3,3,3,3]
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm15 = xmm3[1,0]
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm7 = xmm3[1,1,3,3]
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm6 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmaxss %xmm6, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm4, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm7, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm15, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm14, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm5, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm13, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm12, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm11, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm10, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm9, %xmm0, %xmm0
; AVX512-NEXT:    vmaxss %xmm8, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call nnan float @llvm.vector.reduce.fmax.v16f32(<16 x float> %a0)
  ret float %1
}

;
; vXf64
;

define double @test_v2f64(<2 x double> %a0) {
; SSE-LABEL: test_v2f64:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    maxsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call nnan double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a0)
  ret double %1
}

define double @test_v3f64(<3 x double> %a0) {
; SSE2-LABEL: test_v3f64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    shufpd {{.*#+}} xmm2 = xmm2[0],mem[1]
; SSE2-NEXT:    maxpd %xmm2, %xmm0
; SSE2-NEXT:    movapd %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    maxsd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v3f64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT:    blendpd {{.*#+}} xmm2 = xmm2[0],mem[1]
; SSE41-NEXT:    maxpd %xmm2, %xmm0
; SSE41-NEXT:    movapd %xmm0, %xmm1
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT:    maxsd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v3f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmaxsd %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vmaxsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v3f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmaxsd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT:    vmaxsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call nnan double @llvm.vector.reduce.fmax.v3f64(<3 x double> %a0)
  ret double %1
}

define double @test_v4f64(<4 x double> %a0) {
; SSE-LABEL: test_v4f64:
; SSE:       # %bb.0:
; SSE-NEXT:    maxpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    maxsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX-NEXT:    vmaxsd %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmaxsd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX512-NEXT:    vmaxsd %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmaxsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call nnan double @llvm.vector.reduce.fmax.v4f64(<4 x double> %a0)
  ret double %1
}

define double @test_v8f64(<8 x double> %a0) {
; SSE-LABEL: test_v8f64:
; SSE:       # %bb.0:
; SSE-NEXT:    maxpd %xmm3, %xmm1
; SSE-NEXT:    maxpd %xmm2, %xmm0
; SSE-NEXT:    maxpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    maxsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmaxpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmaxsd %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vmaxsd %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vmaxsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm5
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm6 = xmm5[1,0]
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm7 = xmm0[1,0]
; AVX512-NEXT:    vmaxsd %xmm7, %xmm0, %xmm0
; AVX512-NEXT:    vmaxsd %xmm5, %xmm0, %xmm0
; AVX512-NEXT:    vmaxsd %xmm6, %xmm0, %xmm0
; AVX512-NEXT:    vmaxsd %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vmaxsd %xmm4, %xmm0, %xmm0
; AVX512-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmaxsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call nnan double @llvm.vector.reduce.fmax.v8f64(<8 x double> %a0)
  ret double %1
}

define double @test_v16f64(<16 x double> %a0) {
; SSE-LABEL: test_v16f64:
; SSE:       # %bb.0:
; SSE-NEXT:    maxpd %xmm7, %xmm3
; SSE-NEXT:    maxpd %xmm5, %xmm1
; SSE-NEXT:    maxpd %xmm3, %xmm1
; SSE-NEXT:    maxpd %xmm6, %xmm2
; SSE-NEXT:    maxpd %xmm4, %xmm0
; SSE-NEXT:    maxpd %xmm2, %xmm0
; SSE-NEXT:    maxpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    maxsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmaxpd %ymm3, %ymm1, %ymm1
; AVX-NEXT:    vmaxpd %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vmaxpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmaxsd %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vmaxsd %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vmaxsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmaxpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmaxsd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT:    vmaxsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vmaxsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT:    vmaxsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vmaxsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vmaxsd %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vmaxsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call nnan double @llvm.vector.reduce.fmax.v16f64(<16 x double> %a0)
  ret double %1
}

define half @test_v2f16(<2 x half> %a0) nounwind {
; SSE-LABEL: test_v2f16:
; SSE:       # %bb.0:
; SSE-NEXT:    pushq %rbx
; SSE-NEXT:    subq $16, %rsp
; SSE-NEXT:    movl %edi, %ebx
; SSE-NEXT:    movzwl %si, %edi
; SSE-NEXT:    callq __gnu_h2f_ieee
; SSE-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE-NEXT:    movzwl %bx, %edi
; SSE-NEXT:    callq __gnu_h2f_ieee
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    cmpunordss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    movaps (%rsp), %xmm3 # 16-byte Reload
; SSE-NEXT:    andps %xmm3, %xmm2
; SSE-NEXT:    maxss %xmm0, %xmm3
; SSE-NEXT:    andnps %xmm3, %xmm1
; SSE-NEXT:    orps %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    callq __gnu_f2h_ieee
; SSE-NEXT:    addq $16, %rsp
; SSE-NEXT:    popq %rbx
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f16:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbx
; AVX-NEXT:    subq $16, %rsp
; AVX-NEXT:    movl %esi, %ebx
; AVX-NEXT:    movzwl %di, %edi
; AVX-NEXT:    callq __gnu_h2f_ieee
; AVX-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX-NEXT:    movzwl %bx, %edi
; AVX-NEXT:    callq __gnu_h2f_ieee
; AVX-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload
; AVX-NEXT:    # xmm2 = mem[0],zero,zero,zero
; AVX-NEXT:    vmaxss %xmm2, %xmm0, %xmm1
; AVX-NEXT:    vcmpunordss %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT:    callq __gnu_f2h_ieee
; AVX-NEXT:    addq $16, %rsp
; AVX-NEXT:    popq %rbx
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movzwl %di, %eax
; AVX512-NEXT:    vmovd %eax, %xmm0
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    movzwl %si, %eax
; AVX512-NEXT:    vmovd %eax, %xmm1
; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX512-NEXT:    vmaxss %xmm0, %xmm1, %xmm2
; AVX512-NEXT:    vcmpunordss %xmm0, %xmm0, %k1
; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    retq
  %1 = call nnan half @llvm.vector.reduce.fmax.v2f16(<2 x half> %a0)
  ret half %1
}
declare float @llvm.vector.reduce.fmax.v2f32(<2 x float>)
declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>)
declare float @llvm.vector.reduce.fmax.v16f32(<16 x float>)

declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>)
declare double @llvm.vector.reduce.fmax.v3f64(<3 x double>)
declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>)
declare double @llvm.vector.reduce.fmax.v8f64(<8 x double>)
declare double @llvm.vector.reduce.fmax.v16f64(<16 x double>)

declare half @llvm.vector.reduce.fmax.v2f16(<2 x half>)