; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -disable-peephole | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 -disable-peephole | FileCheck %s --check-prefix=ALL --check-prefix=AVX2

; 256-bit shuffles whose masks move whole 128-bit halves. The expected
; assembly is checked for a plain AVX target and for an AVX2 target.
; In the short mask sketches below, aN / bN name element N of the first /
; second shuffle operand and "u" is an undef mask element.

; Swap the two 128-bit halves of %a; the mask never selects from %b.
define <8 x float> @shuffle_v8f32_45670123(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8f32_45670123:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v8f32_45670123:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %shuffle
}

; Same half swap with both operands loaded from memory. The function reads
; through its pointer arguments, so it is marked readonly rather than readnone.
define <8 x float> @shuffle_v8f32_45670123_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readonly ssp {
; AVX1-LABEL: shuffle_v8f32_45670123_mem:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v8f32_45670123_mem:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = mem[2,3,0,1]
; AVX2-NEXT:    retq
entry:
  %a = load <8 x float>, <8 x float>* %pa
  %b = load <8 x float>, <8 x float>* %pb
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %shuffle
}

; Low half of %a combined with the high half of %b.
define <8 x float> @shuffle_v8f32_0123cdef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_0123cdef:
; ALL:       # %bb.0: # %entry
; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
  ret <8 x float> %shuffle
}

; Duplicate the low 128-bit half of %a into both halves.
define <8 x float> @shuffle_v8f32_01230123(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8f32_01230123:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v8f32_01230123:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %shuffle
}

; Low-half duplication with the operands loaded from memory (readonly, since
; the function dereferences its pointer arguments).
define <8 x float> @shuffle_v8f32_01230123_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readonly ssp {
; AVX1-LABEL: shuffle_v8f32_01230123_mem:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v8f32_01230123_mem:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX2-NEXT:    retq
entry:
  %a = load <8 x float>, <8 x float>* %pa
  %b = load <8 x float>, <8 x float>* %pb
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %shuffle
}

; Duplicate the high 128-bit half of %a into both halves.
define <8 x float> @shuffle_v8f32_45674567(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8f32_45674567:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v8f32_45674567:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %shuffle
}

; High-half duplication with the operands loaded from memory (readonly, since
; the function dereferences its pointer arguments).
define <8 x float> @shuffle_v8f32_45674567_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readonly ssp {
; AVX1-LABEL: shuffle_v8f32_45674567_mem:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3,2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v8f32_45674567_mem:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = mem[2,3,2,3]
; AVX2-NEXT:    retq
entry:
  %a = load <8 x float>, <8 x float>* %pa
  %b = load <8 x float>, <8 x float>* %pb
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %shuffle
}

; High-half duplication expressed on <32 x i8> elements.
define <32 x i8> @shuffle_v32i8_2323(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v32i8_2323:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_2323:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT:    retq
entry:
  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <32 x i8> %shuffle
}

; Same byte shuffle, but its input is produced by an integer add; with AVX2
; the expected permute is the integer-domain vpermq.
define <32 x i8> @shuffle_v32i8_2323_domain(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v32i8_2323_domain:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_2323_domain:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT:    retq
entry:
  ; add forces execution domain
  %a2 = add <32 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %shuffle = shufflevector <32 x i8> %a2, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <32 x i8> %shuffle
}

; High half of %b followed by the low half of %a.
define <4 x i64> @shuffle_v4i64_6701(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v4i64_6701:
; ALL:       # %bb.0: # %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x i64> %shuffle
}

; Same mask with %a produced by an integer add; with AVX2 the expected
; shuffle is the integer-domain vperm2i128.
define <4 x i64> @shuffle_v4i64_6701_domain(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v4i64_6701_domain:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_6701_domain:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
; AVX2-NEXT:    retq
entry:
  ; add forces execution domain
  %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1>
  %shuffle = shufflevector <4 x i64> %a2, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x i64> %shuffle
}

; Mask sketch: u,a5,u,a7 | b4,b5,b6,b7, with %a produced by an integer add.
define <8 x i32> @shuffle_v8i32_u5u7cdef(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8i32_u5u7cdef:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v8i32_u5u7cdef:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT:    retq
entry:
  ; add forces execution domain
  %a2 = add <8 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %shuffle = shufflevector <8 x i32> %a2, <8 x i32> %b, <8 x i32> <i32 undef, i32 5, i32 undef, i32 7, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i32> %shuffle
}

; Low half of %b followed by the low half of %a, with %a produced by an
; integer add.
define <16 x i16> @shuffle_v16i16_4501(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v16i16_4501:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v16i16_4501:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpsubw %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
entry:
  ; add forces execution domain
  %a2 = add <16 x i16> %a, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %shuffle = shufflevector <16 x i16> %a2, <16 x i16> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i16> %shuffle
}

; Same mask with both operands loaded from memory; the first loaded value goes
; through an integer add before the shuffle. Marked readonly because the
; function dereferences its pointer arguments.
define <16 x i16> @shuffle_v16i16_4501_mem(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readonly ssp {
; AVX1-LABEL: shuffle_v16i16_4501_mem:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[0,1],ymm0[0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v16i16_4501_mem:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = mem[0,1],ymm0[0,1]
; AVX2-NEXT:    retq
entry:
  %c = load <16 x i16>, <16 x i16>* %a
  %d = load <16 x i16>, <16 x i16>* %b
  %c2 = add <16 x i16> %c, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %shuffle = shufflevector <16 x i16> %c2, <16 x i16> %d, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i16> %shuffle
}

;;;; Cases with undef indices mixed in the mask

; Mask sketch: u,u,a6,a7 | u,b1,u,b3.
define <8 x float> @shuffle_v8f32_uu67u9ub(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu67u9ub:
; ALL:       # %bb.0: # %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 9, i32 undef, i32 11>
  ret <8 x float> %shuffle
}

; Mask sketch: u,u,a6,a7 | u,u,a6,a7.
define <8 x float> @shuffle_v8f32_uu67uu67(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8f32_uu67uu67:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v8f32_uu67uu67:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
; AVX2-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7>
  ret <8 x float> %shuffle
}

; Mask sketch: u,u,a6,a7 | u,u,b2,b3.
define <8 x float> @shuffle_v8f32_uu67uuab(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu67uuab:
; ALL:       # %bb.0: # %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 10, i32 11>
  ret <8 x float> %shuffle
}

; Mask sketch: u,u,a6,a7 | u,u,b6,b7.
define <8 x float> @shuffle_v8f32_uu67uuef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu67uuef:
; ALL:       # %bb.0: # %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 14, i32 15>
  ret <8 x float> %shuffle
}

; Mask sketch: u,u,a6,a7 | a4,a5,a6,a7.
define <8 x float> @shuffle_v8f32_uu674567(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8f32_uu674567:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v8f32_uu674567:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %shuffle
}

; Mask sketch: u,u,a6,a7 | b0,b1,b2,b3.
define <8 x float> @shuffle_v8f32_uu6789ab(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu6789ab:
; ALL:       # %bb.0: # %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  ret <8 x float> %shuffle
}

; Mask sketch: a4,a5,a6,a7 | u,u,a6,a7.
define <8 x float> @shuffle_v8f32_4567uu67(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8f32_4567uu67:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v8f32_4567uu67:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7>
  ret <8 x float> %shuffle
}

; Mask sketch: a4,a5,a6,a7 | u,u,b6,b7.
define <8 x float> @shuffle_v8f32_4567uuef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_4567uuef:
; ALL:       # %bb.0: # %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 14, i32 15>
  ret <8 x float> %shuffle
}

;;;; Cases that must not be lowered to a single vperm2f128

; Mask sketch: u,u,a6,a7 | u,b4,u,b7. Element 5 of the result needs b4, which
; a whole-half move would not put there, so the expected code follows the
; vperm2f128 with an in-lane vpermilps.
define <8 x float> @shuffle_v8f32_uu67ucuf(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu67ucuf:
; ALL:       # %bb.0: # %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 12, i32 undef, i32 15>
  ret <8 x float> %shuffle
}

;; Test zero mask generation.
;; PR22984: https://llvm.org/bugs/show_bug.cgi?id=22984
;; Prefer xor+vblendpd over vperm2f128 because that has better performance,
;; unless building for optsize where we should still use vperm2f128.
; Result sketch: 0,0,a0,a1 (the zeros come from the constant second operand).
define <4 x double> @shuffle_v4f64_zz01(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_zz01:
; ALL:       # %bb.0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT:    retq
  %shuf = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x double> %shuf
}

; Same shuffle in a function carrying the optsize attribute.
define <4 x double> @shuffle_v4f64_zz01_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_zz01_optsize:
; ALL:       # %bb.0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT:    retq
  %shuf = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x double> %shuf
}

; Result sketch: 0,0,a2,a3. Without a size attribute the expected code is a
; register zeroing followed by a blend.
define <4 x double> @shuffle_v4f64_zz23(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_zz23:
; ALL:       # %bb.0:
; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; ALL-NEXT:    retq
  %shuf = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x double> %shuf
}

; Same shuffle under optsize: a single vperm2f128 is expected instead.
define <4 x double> @shuffle_v4f64_zz23_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_zz23_optsize:
; ALL:       # %bb.0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[2,3]
; ALL-NEXT:    retq
  %shuf = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x double> %shuf
}

; Same shuffle in a function tagged with the !prof !14 entry-count metadata;
; the expected code matches the optsize variant.
define <4 x double> @shuffle_v4f64_zz23_pgso(<4 x double> %a) !prof !14 {
; ALL-LABEL: shuffle_v4f64_zz23_pgso:
; ALL:       # %bb.0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[2,3]
; ALL-NEXT:    retq
  %shuf = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x double> %shuf
}

; Result sketch: 0,0,a0,a1, this time with the zero constant as the first
; shuffle operand and %a as the second.
define <4 x double> @shuffle_v4f64_zz45(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_zz45:
; ALL:       # %bb.0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT:    retq
  %shuf = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %shuf
}

; Same shuffle in a function carrying the optsize attribute.
define <4 x double> @shuffle_v4f64_zz45_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_zz45_optsize:
; ALL:       # %bb.0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT:    retq
  %shuf = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %shuf
}

; Result sketch: 0,0,a2,a3. Without a size attribute the expected code is a
; register zeroing followed by a blend.
define <4 x double> @shuffle_v4f64_zz67(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_zz67:
; ALL:       # %bb.0:
; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; ALL-NEXT:    retq
  %shuf = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x double> %shuf
}

; Same shuffle under optsize: a single vperm2f128 is expected instead.
define <4 x double> @shuffle_v4f64_zz67_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_zz67_optsize:
; ALL:       # %bb.0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[2,3]
; ALL-NEXT:    retq
  %shuf = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x double> %shuf
}

; Same shuffle in a function tagged with the !prof !14 entry-count metadata;
; the expected code matches the optsize variant.
define <4 x double> @shuffle_v4f64_zz67_pgso(<4 x double> %a) !prof !14 {
; ALL-LABEL: shuffle_v4f64_zz67_pgso:
; ALL:       # %bb.0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[2,3]
; ALL-NEXT:    retq
  %shuf = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x double> %shuf
}

; Result sketch: a0,a1,0,0 (the zeros come from the constant second operand).
define <4 x double> @shuffle_v4f64_01zz(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_01zz:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovaps %xmm0, %xmm0
; ALL-NEXT:    retq
  %shuf = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %shuf
}

; Same shuffle in a function carrying the optsize attribute.
define <4 x double> @shuffle_v4f64_01zz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_01zz_optsize:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovaps %xmm0, %xmm0
; ALL-NEXT:    retq
  %shuf = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %shuf
}

; Result sketch: a2,a3,0,0.
define <4 x double> @shuffle_v4f64_23zz(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_23zz:
; ALL:       # %bb.0:
; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm0
; ALL-NEXT:    retq
  %shuf = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x double> %shuf
}

; Same shuffle in a function carrying the optsize attribute.
define <4 x double> @shuffle_v4f64_23zz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_23zz_optsize:
; ALL:       # %bb.0:
; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm0
; ALL-NEXT:    retq
  %shuf = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x double> %shuf
}

; Result sketch: a0,a1,0,0, with the zero constant as the first operand.
define <4 x double> @shuffle_v4f64_45zz(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_45zz:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovaps %xmm0, %xmm0
; ALL-NEXT:    retq
  %shuf = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x double> %shuf
}

; Same shuffle in a function carrying the optsize attribute.
define <4 x double> @shuffle_v4f64_45zz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_45zz_optsize:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovaps %xmm0, %xmm0
; ALL-NEXT:    retq
  %shuf = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x double> %shuf
}

; Result sketch: a2,a3,0,0, with the zero constant as the first operand.
define <4 x double> @shuffle_v4f64_67zz(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_67zz:
; ALL:       # %bb.0:
; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm0
; ALL-NEXT:    retq
  %shuf = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x double> %shuf
}

; Same shuffle in a function carrying the optsize attribute.
define <4 x double> @shuffle_v4f64_67zz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_67zz_optsize:
; ALL:       # %bb.0:
; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm0
; ALL-NEXT:    retq
  %shuf = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x double> %shuf
}

;; With AVX2 select the integer version of the instruction. Use an add to force the domain selection.
; Shuffle result sketch: a2,a3,0,0 (zero constant as the first operand); the
; shuffle result is then added to %b.
define <4 x i64> @shuffle_v4i64_67zz(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_67zz:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_67zz:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
  %s = shufflevector <4 x i64> <i64 0, i64 0, i64 undef, i64 undef>, <4 x i64> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  %c = add <4 x i64> %b, %s
  ret <4 x i64> %c
}

;;; Memory folding cases
;;
;; Each function below loads one shuffle operand through a pointer argument,
;; so it is marked readonly rather than readnone. The ld0_* functions load the
;; first operand and the ld1_* functions load the second; the shuffle result
;; feeds an fadd or add with a constant vector.

; High half of the loaded %a followed by the low half of %b.
define <4 x double> @ld0_hi0_lo1_4f64(<4 x double> * %pa, <4 x double> %b) nounwind uwtable readonly ssp {
; AVX1-LABEL: ld0_hi0_lo1_4f64:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX1-NEXT:    vaddpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld0_hi0_lo1_4f64:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %a = load <4 x double>, <4 x double> * %pa
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  %res = fadd <4 x double> %shuffle, <double 1.0, double 1.0, double 1.0, double 1.0>
  ret <4 x double> %res
}

; High half of %a followed by the high half of the loaded %b.
define <4 x double> @ld1_hi0_hi1_4f64(<4 x double> %a, <4 x double> * %pb) nounwind uwtable readonly ssp {
; AVX1-LABEL: ld1_hi0_hi1_4f64:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX1-NEXT:    vaddpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld1_hi0_hi1_4f64:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %b = load <4 x double>, <4 x double> * %pb
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %res = fadd <4 x double> %shuffle, <double 1.0, double 1.0, double 1.0, double 1.0>
  ret <4 x double> %res
}

; High half of the loaded %a followed by the low half of %b, on <8 x float>.
define <8 x float> @ld0_hi0_lo1_8f32(<8 x float> * %pa, <8 x float> %b) nounwind uwtable readonly ssp {
; AVX1-LABEL: ld0_hi0_lo1_8f32:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX1-NEXT:    vaddps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld0_hi0_lo1_8f32:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %a = load <8 x float>, <8 x float> * %pa
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %res = fadd <8 x float> %shuffle, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
  ret <8 x float> %res
}

; High half of %a followed by the high half of the loaded %b, on <8 x float>.
define <8 x float> @ld1_hi0_hi1_8f32(<8 x float> %a, <8 x float> * %pb) nounwind uwtable readonly ssp {
; AVX1-LABEL: ld1_hi0_hi1_8f32:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX1-NEXT:    vaddps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld1_hi0_hi1_8f32:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %b = load <8 x float>, <8 x float> * %pb
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %res = fadd <8 x float> %shuffle, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
  ret <8 x float> %res
}

; Integer variant on <4 x i64>: high half of the loaded %a, then low half of %b.
define <4 x i64> @ld0_hi0_lo1_4i64(<4 x i64> * %pa, <4 x i64> %b) nounwind uwtable readonly ssp {
; AVX1-LABEL: ld0_hi0_lo1_4i64:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld0_hi0_lo1_4i64:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX2-NEXT:    vpaddq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %a = load <4 x i64>, <4 x i64> * %pa
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  %res = add <4 x i64> %shuffle, <i64 1, i64 2, i64 3, i64 4>
  ret <4 x i64> %res
}

; Integer variant on <4 x i64>: high half of %a, then high half of the loaded %b.
define <4 x i64> @ld1_hi0_hi1_4i64(<4 x i64> %a, <4 x i64> * %pb) nounwind uwtable readonly ssp {
; AVX1-LABEL: ld1_hi0_hi1_4i64:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld1_hi0_hi1_4i64:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX2-NEXT:    vpaddq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %b = load <4 x i64>, <4 x i64> * %pb
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %res = add <4 x i64> %shuffle, <i64 1, i64 2, i64 3, i64 4>
  ret <4 x i64> %res
}

; Integer variant on <8 x i32>: high half of the loaded %a, then low half of %b.
define <8 x i32> @ld0_hi0_lo1_8i32(<8 x i32> * %pa, <8 x i32> %b) nounwind uwtable readonly ssp {
; AVX1-LABEL: ld0_hi0_lo1_8i32:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,2,3,4]
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddd 16(%rdi), %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld0_hi0_lo1_8i32:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX2-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %a = load <8 x i32>, <8 x i32> * %pa
  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %res = add <8 x i32> %shuffle, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
  ret <8 x i32> %res
}

; Integer variant on <8 x i32>: high half of %a, then high half of the loaded %b.
define <8 x i32> @ld1_hi0_hi1_8i32(<8 x i32> %a, <8 x i32> * %pb) nounwind uwtable readonly ssp {
; AVX1-LABEL: ld1_hi0_hi1_8i32:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,2,3,4]
; AVX1-NEXT:    vpaddd 16(%rdi), %xmm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld1_hi0_hi1_8i32:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX2-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %b = load <8 x i32>, <8 x i32> * %pb
  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %res = add <8 x i32> %shuffle, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
  ret <8 x i32> %res
}

; Module-level profile summary plus the zero function_entry_count node (!14)
; that the *_pgso functions attach through !prof.
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"ProfileSummary", !1}
!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
!2 = !{!"ProfileFormat", !"InstrProf"}
!3 = !{!"TotalCount", i64 10000}
!4 = !{!"MaxCount", i64 10}
!5 = !{!"MaxInternalCount", i64 1}
!6 = !{!"MaxFunctionCount", i64 1000}
!7 = !{!"NumCounts", i64 3}
!8 = !{!"NumFunctions", i64 3}
!9 = !{!"DetailedSummary", !10}
!10 = !{!11, !12, !13}
!11 = !{i32 10000, i64 100, i32 1}
!12 = !{i32 999000, i64 100, i32 1}
!13 = !{i32 999999, i64 1, i32 2}
!14 = !{!"function_entry_count", i64 0}