; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw,+fast-variable-shuffle %s -o - | FileCheck %s

; FIXME: All cases here should be fixed by PR34380

; Overview
; --------
; Every function below narrows a wide i16 vector with a constant
; shufflevector mask. The second shufflevector operand is always undef and
; every mask index is smaller than the source element count, so all result
; lanes are taken from the first operand.
;
; Each shuffle mask appears in up to three forms:
;   * plain    - the shuffle result is returned directly;
;   * masked   - lanes where %mask compares equal to zero take the shuffle
;                result, all other lanes take %vec2;
;   * masked_z - lanes where %mask compares equal to zero take the shuffle
;                result, all other lanes are zero.
;
; The "_mem_" variants load the source vector through the pointer %vp
; instead of receiving it as a by-value argument.
;
; The assertion comments inside each function are generated by the script
; named in the NOTE line; regenerate them with that script rather than
; editing them by hand.

; --- <16 x i16> -> <8 x i16>, source vector passed by value ---

define <8 x i16> @test_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec) {
; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [8,6,12,4,7,9,14,8,8,6,12,4,7,9,14,8]
; CHECK-NEXT:    # ymm1 = mem[0,1,0,1]
; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
  ret <8 x i16> %res
}
define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [8,6,12,4,7,9,14,8,8,6,12,4,7,9,14,8]
; CHECK-NEXT:    # ymm3 = mem[0,1,0,1]
; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm0
; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
  ret <8 x i16> %res
}

define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,6,12,4,7,9,14,8]
; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}
define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [4,12,9,4,14,15,12,14,4,12,9,4,14,15,12,14]
; CHECK-NEXT:    # ymm3 = mem[0,1,0,1]
; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm0
; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
  ret <8 x i16> %res
}

define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,12,9,4,14,15,12,14]
; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}
define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [4,11,14,10,7,1,6,9,4,11,14,10,7,1,6,9]
; CHECK-NEXT:    # ymm3 = mem[0,1,0,1]
; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm0
; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
  ret <8 x i16> %res
}

define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,11,14,10,7,1,6,9]
; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}
define <8 x i16> @test_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec) {
; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [14,15,7,13,4,12,8,0,14,15,7,13,4,12,8,0]
; CHECK-NEXT:    # ymm1 = mem[0,1,0,1]
; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
  ret <8 x i16> %res
}
define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [14,15,7,13,4,12,8,0,14,15,7,13,4,12,8,0]
; CHECK-NEXT:    # ymm3 = mem[0,1,0,1]
; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm0
; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
  ret <8 x i16> %res
}

define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [14,15,7,13,4,12,8,0]
; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}

; --- <16 x i16> -> <8 x i16>, source vector loaded from %vp ---

define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp) {
; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %xmm1
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,7,13,3,5,13,3,9]
; CHECK-NEXT:    vpermi2w 16(%rdi), %xmm1, %xmm0
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %vp
  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
  ret <8 x i16> %res
}
define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,7,13,3,5,13,3,9]
; CHECK-NEXT:    vpermi2w 16(%rdi), %xmm2, %xmm3
; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
  ret <8 x i16> %res
}

define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,7,13,3,5,13,3,9]
; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT:    vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z}
; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}

define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask1(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [3,15,12,7,1,5,8,14]
; CHECK-NEXT:    vpermi2w 16(%rdi), %xmm2, %xmm3
; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 3, i32 15, i32 12, i32 7, i32 1, i32 5, i32 8, i32 14>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
  ret <8 x i16> %res
}

define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask1(<16 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm1 = [3,15,12,7,1,5,8,14]
; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT:    vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z}
; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 3, i32 15, i32 12, i32 7, i32 1, i32 5, i32 8, i32 14>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}

define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm2
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [9,0,3,0,5,0,7,1]
; CHECK-NEXT:    vpermi2w (%rdi), %xmm2, %xmm3
; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
  ret <8 x i16> %res
}

define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm2
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm1 = [9,0,3,0,5,0,7,1]
; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT:    vpermi2w (%rdi), %xmm2, %xmm1 {%k1} {z}
; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}

define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp) {
; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %xmm1
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm0 = [9,7,9,6,9,4,3,2]
; CHECK-NEXT:    vpermi2w 16(%rdi), %xmm1, %xmm0
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %vp
  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
  ret <8 x i16> %res
}
define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [9,7,9,6,9,4,3,2]
; CHECK-NEXT:    vpermi2w 16(%rdi), %xmm2, %xmm3
; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
  ret <8 x i16> %res
}

define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm1 = [9,7,9,6,9,4,3,2]
; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT:    vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z}
; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}

; --- <32 x i16> -> <16 x i16>, source vector passed by value ---

define <16 x i16> @test_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec) {
; CHECK-LABEL: test_32xi16_to_16xi16_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
; CHECK-NEXT:    vpermi2w %ymm0, %ymm2, %ymm1
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
  ret <16 x i16> %res
}
define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm4
; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
; CHECK-NEXT:    vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}

define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}
define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26]
; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm4
; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
; CHECK-NEXT:    vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 14, i32 21, i32 31, i32 29, i32 25, i32 2, i32 19, i32 15, i32 20, i32 27, i32 7, i32 23, i32 3, i32 7, i32 25, i32 10>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}

define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26]
; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 14, i32 21, i32 31, i32 29, i32 25, i32 2, i32 19, i32 15, i32 20, i32 27, i32 7, i32 23, i32 3, i32 7, i32 25, i32 10>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}
define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15]
; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm4
; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
; CHECK-NEXT:    vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 26, i32 3, i32 4, i32 22, i32 1, i32 18, i32 29, i32 17, i32 21, i32 0, i32 20, i32 19, i32 18, i32 12, i32 11, i32 31>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}

define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15]
; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 26, i32 3, i32 4, i32 22, i32 1, i32 18, i32 29, i32 17, i32 21, i32 0, i32 20, i32 19, i32 18, i32 12, i32 11, i32 31>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}
define <16 x i16> @test_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec) {
; CHECK-LABEL: test_32xi16_to_16xi16_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
; CHECK-NEXT:    vpermi2w %ymm2, %ymm0, %ymm1
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
  ret <16 x i16> %res
}
define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
; CHECK-NEXT:    vpermi2w %ymm3, %ymm0, %ymm4
; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
; CHECK-NEXT:    vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}

define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpermi2w %ymm3, %ymm0, %ymm2 {%k1} {z}
; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}

; --- <32 x i16> -> <8 x i16>, source vector passed by value ---

define <8 x i16> @test_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec) {
; CHECK-LABEL: test_32xi16_to_8xi16_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [22,27,7,10,13,21,5,14]
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT:    vpermt2w %ymm0, %ymm2, %ymm1
; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
  ret <8 x i16> %res
}
define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [22,27,7,10,13,21,5,14]
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
; CHECK-NEXT:    vpermt2w %ymm0, %ymm3, %ymm4
; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT:    vpblendmw %xmm4, %xmm1, %xmm0 {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
  ret <8 x i16> %res
}

define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [22,27,7,10,13,21,5,14]
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpermt2w %ymm0, %ymm3, %ymm2 {%k1} {z}
; CHECK-NEXT:    vmovdqa %xmm2, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}
define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,21,27,10,8,19,14,5]
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
; CHECK-NEXT:    vpermt2w %ymm4, %ymm3, %ymm0
; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
  ret <8 x i16> %res
}

define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,21,27,10,8,19,14,5]
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpermt2w %ymm3, %ymm2, %ymm0 {%k1} {z}
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}
define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,13,18,16,9,11,26,8]
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
; CHECK-NEXT:    vpermt2w %ymm4, %ymm3, %ymm0
; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
  ret <8 x i16> %res
}

define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,13,18,16,9,11,26,8]
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpermt2w %ymm3, %ymm2, %ymm0 {%k1} {z}
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}
define <8 x i16> @test_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec) {
; CHECK-LABEL: test_32xi16_to_8xi16_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm1 = [17,0,23,10,1,8,7,30]
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; CHECK-NEXT:    vpermt2w %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
  ret <8 x i16> %res
}
define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [17,0,23,10,1,8,7,30]
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
; CHECK-NEXT:    vpermt2w %ymm4, %ymm3, %ymm0
; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
  ret <8 x i16> %res
}

define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [17,0,23,10,1,8,7,30]
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpermt2w %ymm3, %ymm2, %ymm0 {%k1} {z}
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}

; --- <32 x i16> -> <16 x i16>, source vector loaded from %vp ---

define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp) {
; CHECK-LABEL: test_32xi16_to_16xi16_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
; CHECK-NEXT:    vpermi2w 32(%rdi), %ymm1, %ymm0
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %vp
  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12>
  ret <16 x i16> %res
}
define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
; CHECK-NEXT:    vpermi2w 32(%rdi), %ymm2, %ymm3
; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vmovdqu16 %ymm3, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %vp
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}

define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT:    vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %vp
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}

define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask1(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
; CHECK-NEXT:    vpermi2w 32(%rdi), %ymm2, %ymm3
; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vmovdqu16 %ymm3, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %vp
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16, i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}

define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask1(<32 x i16>* %vp, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT:    vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %vp
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16, i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}

define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask2(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0]
; CHECK-NEXT:    vpermi2w (%rdi), %ymm2, %ymm3
; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vmovdqu16 %ymm3, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %vp
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}

define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask2(<32 x i16>* %vp, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0]
; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT:    vpermi2w (%rdi), %ymm2, %ymm1 {%k1} {z}
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %vp
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6,
i32 22, i32 0, i32 26, i32 16> 688 %cmp = icmp eq <16 x i16> %mask, zeroinitializer 689 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer 690 ret <16 x i16> %res 691} 692 693define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask3(<32 x i16>* %vp) { 694; CHECK-LABEL: test_32xi16_to_16xi16_perm_mem_mask3: 695; CHECK: # %bb.0: 696; CHECK-NEXT: vmovdqa (%rdi), %ymm1 697; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16] 698; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0 699; CHECK-NEXT: retq 700 %vec = load <32 x i16>, <32 x i16>* %vp 701 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16> 702 ret <16 x i16> %res 703} 704define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask3(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) { 705; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask3: 706; CHECK: # %bb.0: 707; CHECK-NEXT: vmovdqa (%rdi), %ymm2 708; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16] 709; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3 710; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 711; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1} 712; CHECK-NEXT: retq 713 %vec = load <32 x i16>, <32 x i16>* %vp 714 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16> 715 %cmp = icmp eq <16 x i16> %mask, zeroinitializer 716 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2 717 ret <16 x i16> %res 718} 719 720define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask3(<32 x i16>* %vp, <16 x i16> %mask) { 721; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask3: 722; CHECK: # %bb.0: 723; CHECK-NEXT: vmovdqa (%rdi), %ymm2 724; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = 
[3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16] 725; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 726; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z} 727; CHECK-NEXT: vmovdqa %ymm1, %ymm0 728; CHECK-NEXT: retq 729 %vec = load <32 x i16>, <32 x i16>* %vp 730 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16> 731 %cmp = icmp eq <16 x i16> %mask, zeroinitializer 732 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer 733 ret <16 x i16> %res 734} 735 736define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp) { 737; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask0: 738; CHECK: # %bb.0: 739; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [16,17,5,1,14,14,13,17] 740; CHECK-NEXT: vmovdqa 32(%rdi), %ymm0 741; CHECK-NEXT: vpermt2w (%rdi), %ymm1, %ymm0 742; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 743; CHECK-NEXT: vzeroupper 744; CHECK-NEXT: retq 745 %vec = load <32 x i16>, <32 x i16>* %vp 746 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1> 747 ret <8 x i16> %res 748} 749define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) { 750; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask0: 751; CHECK: # %bb.0: 752; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [16,17,5,1,14,14,13,17] 753; CHECK-NEXT: vmovdqa 32(%rdi), %ymm3 754; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm3 755; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 756; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} 757; CHECK-NEXT: vzeroupper 758; CHECK-NEXT: retq 759 %vec = load <32 x i16>, <32 x i16>* %vp 760 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1> 761 %cmp = icmp eq <8 x i16> %mask, zeroinitializer 762 %res = select <8 x i1> %cmp, <8 x i16> 
%shuf, <8 x i16> %vec2 763 ret <8 x i16> %res 764} 765 766define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp, <8 x i16> %mask) { 767; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask0: 768; CHECK: # %bb.0: 769; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [16,17,5,1,14,14,13,17] 770; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 771; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 772; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm1 {%k1} {z} 773; CHECK-NEXT: vmovdqa %xmm1, %xmm0 774; CHECK-NEXT: vzeroupper 775; CHECK-NEXT: retq 776 %vec = load <32 x i16>, <32 x i16>* %vp 777 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1> 778 %cmp = icmp eq <8 x i16> %mask, zeroinitializer 779 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer 780 ret <8 x i16> %res 781} 782 783define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask1(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) { 784; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask1: 785; CHECK: # %bb.0: 786; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [7,6,4,6,12,4,27,1] 787; CHECK-NEXT: vmovdqa 32(%rdi), %ymm3 788; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm3 789; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 790; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} 791; CHECK-NEXT: vzeroupper 792; CHECK-NEXT: retq 793 %vec = load <32 x i16>, <32 x i16>* %vp 794 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17> 795 %cmp = icmp eq <8 x i16> %mask, zeroinitializer 796 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2 797 ret <8 x i16> %res 798} 799 800define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask1(<32 x i16>* %vp, <8 x i16> %mask) { 801; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask1: 802; CHECK: # %bb.0: 803; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [7,6,4,6,12,4,27,1] 804; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 805; 
CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 806; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm1 {%k1} {z} 807; CHECK-NEXT: vmovdqa %xmm1, %xmm0 808; CHECK-NEXT: vzeroupper 809; CHECK-NEXT: retq 810 %vec = load <32 x i16>, <32 x i16>* %vp 811 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17> 812 %cmp = icmp eq <8 x i16> %mask, zeroinitializer 813 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer 814 ret <8 x i16> %res 815} 816 817define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask2(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) { 818; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask2: 819; CHECK: # %bb.0: 820; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [6,18,0,4,10,25,22,10] 821; CHECK-NEXT: vmovdqa (%rdi), %ymm3 822; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm3 823; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 824; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} 825; CHECK-NEXT: vzeroupper 826; CHECK-NEXT: retq 827 %vec = load <32 x i16>, <32 x i16>* %vp 828 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 18, i32 0, i32 4, i32 10, i32 25, i32 22, i32 10> 829 %cmp = icmp eq <8 x i16> %mask, zeroinitializer 830 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2 831 ret <8 x i16> %res 832} 833 834define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask2(<32 x i16>* %vp, <8 x i16> %mask) { 835; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask2: 836; CHECK: # %bb.0: 837; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [6,18,0,4,10,25,22,10] 838; CHECK-NEXT: vmovdqa (%rdi), %ymm1 839; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 840; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm1 {%k1} {z} 841; CHECK-NEXT: vmovdqa %xmm1, %xmm0 842; CHECK-NEXT: vzeroupper 843; CHECK-NEXT: retq 844 %vec = load <32 x i16>, <32 x i16>* %vp 845 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 18, i32 0, i32 4, i32 10, i32 25, i32 22, i32 
10> 846 %cmp = icmp eq <8 x i16> %mask, zeroinitializer 847 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer 848 ret <8 x i16> %res 849} 850 851define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp) { 852; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask3: 853; CHECK: # %bb.0: 854; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [19,1,5,31,9,12,17,9] 855; CHECK-NEXT: vmovdqa (%rdi), %ymm0 856; CHECK-NEXT: vpermt2w 32(%rdi), %ymm1, %ymm0 857; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 858; CHECK-NEXT: vzeroupper 859; CHECK-NEXT: retq 860 %vec = load <32 x i16>, <32 x i16>* %vp 861 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9> 862 ret <8 x i16> %res 863} 864define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) { 865; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask3: 866; CHECK: # %bb.0: 867; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [19,1,5,31,9,12,17,9] 868; CHECK-NEXT: vmovdqa (%rdi), %ymm3 869; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm3 870; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 871; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} 872; CHECK-NEXT: vzeroupper 873; CHECK-NEXT: retq 874 %vec = load <32 x i16>, <32 x i16>* %vp 875 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9> 876 %cmp = icmp eq <8 x i16> %mask, zeroinitializer 877 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2 878 ret <8 x i16> %res 879} 880 881define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp, <8 x i16> %mask) { 882; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask3: 883; CHECK: # %bb.0: 884; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [19,1,5,31,9,12,17,9] 885; CHECK-NEXT: vmovdqa (%rdi), %ymm1 886; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 887; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm1 {%k1} {z} 888; 
CHECK-NEXT: vmovdqa %xmm1, %xmm0 889; CHECK-NEXT: vzeroupper 890; CHECK-NEXT: retq 891 %vec = load <32 x i16>, <32 x i16>* %vp 892 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9> 893 %cmp = icmp eq <8 x i16> %mask, zeroinitializer 894 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer 895 ret <8 x i16> %res 896} 897 898define <8 x i16> @test_16xi16_to_8xi16_E84C94EF(<16 x i16> %vec) { 899; CHECK-LABEL: test_16xi16_to_8xi16_E84C94EF: 900; CHECK: # %bb.0: 901; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [14,8,4,12,9,4,14,15,14,8,4,12,9,4,14,15] 902; CHECK-NEXT: # ymm1 = mem[0,1,0,1] 903; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0 904; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 905; CHECK-NEXT: vzeroupper 906; CHECK-NEXT: retq 907 %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15> 908 ret <8 x i16> %res 909} 910 911define <4 x i32> @test_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec) { 912; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask0: 913; CHECK: # %bb.0: 914; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4,0,3,2] 915; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 916; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 917; CHECK-NEXT: vzeroupper 918; CHECK-NEXT: retq 919 %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2> 920 ret <4 x i32> %res 921} 922define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { 923; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask0: 924; CHECK: # %bb.0: 925; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [4,0,3,2] 926; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0 927; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 928; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} 929; CHECK-NEXT: vzeroupper 930; CHECK-NEXT: retq 931 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 
3, i32 2> 932 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 933 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 934 ret <4 x i32> %res 935} 936 937define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32> %mask) { 938; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask0: 939; CHECK: # %bb.0: 940; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [4,0,3,2] 941; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 942; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} 943; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 944; CHECK-NEXT: vzeroupper 945; CHECK-NEXT: retq 946 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2> 947 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 948 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer 949 ret <4 x i32> %res 950} 951define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { 952; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask1: 953; CHECK: # %bb.0: 954; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [3,0,7,3] 955; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0 956; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 957; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} 958; CHECK-NEXT: vzeroupper 959; CHECK-NEXT: retq 960 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 3, i32 0, i32 7, i32 3> 961 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 962 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 963 ret <4 x i32> %res 964} 965 966define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32> %mask) { 967; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask1: 968; CHECK: # %bb.0: 969; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [3,0,7,3] 970; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 971; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} 972; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 973; CHECK-NEXT: vzeroupper 974; CHECK-NEXT: retq 975 %shuf = shufflevector <8 x 
i32> %vec, <8 x i32> undef, <4 x i32> <i32 3, i32 0, i32 7, i32 3> 976 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 977 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer 978 ret <4 x i32> %res 979} 980define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { 981; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask2: 982; CHECK: # %bb.0: 983; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3 984; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm3[1],xmm0[1] 985; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 986; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} 987; CHECK-NEXT: vzeroupper 988; CHECK-NEXT: retq 989 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 6, i32 7, i32 2, i32 3> 990 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 991 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 992 ret <4 x i32> %res 993} 994 995define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i32> %mask) { 996; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask2: 997; CHECK: # %bb.0: 998; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2 999; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm2[1],xmm0[1] 1000; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 1001; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} 1002; CHECK-NEXT: vzeroupper 1003; CHECK-NEXT: retq 1004 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 6, i32 7, i32 2, i32 3> 1005 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1006 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer 1007 ret <4 x i32> %res 1008} 1009define <4 x i32> @test_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec) { 1010; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask3: 1011; CHECK: # %bb.0: 1012; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [5,3,2,5] 1013; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 1014; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1015; CHECK-NEXT: vzeroupper 1016; CHECK-NEXT: retq 1017 %res = shufflevector <8 x i32> 
%vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5> 1018 ret <4 x i32> %res 1019} 1020define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { 1021; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask3: 1022; CHECK: # %bb.0: 1023; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [5,3,2,5] 1024; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0 1025; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 1026; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} 1027; CHECK-NEXT: vzeroupper 1028; CHECK-NEXT: retq 1029 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5> 1030 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1031 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 1032 ret <4 x i32> %res 1033} 1034 1035define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32> %mask) { 1036; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask3: 1037; CHECK: # %bb.0: 1038; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [5,3,2,5] 1039; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 1040; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} 1041; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1042; CHECK-NEXT: vzeroupper 1043; CHECK-NEXT: retq 1044 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5> 1045 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1046 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer 1047 ret <4 x i32> %res 1048} 1049define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp) { 1050; CHECK-LABEL: test_8xi32_to_4xi32_perm_mem_mask0: 1051; CHECK: # %bb.0: 1052; CHECK-NEXT: vmovaps 16(%rdi), %xmm0 1053; CHECK-NEXT: vshufps $7, (%rdi), %xmm0, %xmm0 # xmm0 = xmm0[3,1],mem[0,0] 1054; CHECK-NEXT: retq 1055 %vec = load <8 x i32>, <8 x i32>* %vp 1056 %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0> 1057 ret <4 x i32> %res 1058} 1059define <4 x i32> 
@test_masked_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) { 1060; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask0: 1061; CHECK: # %bb.0: 1062; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 1063; CHECK-NEXT: vshufps $7, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[3,1],mem[0,0] 1064; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 1065; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1} 1066; CHECK-NEXT: retq 1067 %vec = load <8 x i32>, <8 x i32>* %vp 1068 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0> 1069 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1070 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 1071 ret <4 x i32> %res 1072} 1073 1074define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp, <4 x i32> %mask) { 1075; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask0: 1076; CHECK: # %bb.0: 1077; CHECK-NEXT: vmovaps 16(%rdi), %xmm1 1078; CHECK-NEXT: vshufps $7, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[3,1],mem[0,0] 1079; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 1080; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z} 1081; CHECK-NEXT: retq 1082 %vec = load <8 x i32>, <8 x i32>* %vp 1083 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0> 1084 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1085 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer 1086 ret <4 x i32> %res 1087} 1088 1089define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask1(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) { 1090; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask1: 1091; CHECK: # %bb.0: 1092; CHECK-NEXT: vmovdqa (%rdi), %xmm2 1093; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [5,0,0,3] 1094; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm3 1095; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 1096; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} 1097; CHECK-NEXT: retq 1098 %vec = load <8 x i32>, <8 x i32>* %vp 1099 %shuf = shufflevector <8 x i32> %vec, <8 x 
i32> undef, <4 x i32> <i32 5, i32 0, i32 0, i32 3> 1100 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1101 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 1102 ret <4 x i32> %res 1103} 1104 1105define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask1(<8 x i32>* %vp, <4 x i32> %mask) { 1106; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask1: 1107; CHECK: # %bb.0: 1108; CHECK-NEXT: vmovdqa (%rdi), %xmm2 1109; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [5,0,0,3] 1110; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 1111; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm1 {%k1} {z} 1112; CHECK-NEXT: vmovdqa %xmm1, %xmm0 1113; CHECK-NEXT: retq 1114 %vec = load <8 x i32>, <8 x i32>* %vp 1115 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 0, i32 0, i32 3> 1116 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1117 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer 1118 ret <4 x i32> %res 1119} 1120 1121define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask2(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) { 1122; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask2: 1123; CHECK: # %bb.0: 1124; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 1125; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [0,7,7,0] 1126; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm3 1127; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 1128; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} 1129; CHECK-NEXT: retq 1130 %vec = load <8 x i32>, <8 x i32>* %vp 1131 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 3, i32 3, i32 4> 1132 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1133 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 1134 ret <4 x i32> %res 1135} 1136 1137define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask2(<8 x i32>* %vp, <4 x i32> %mask) { 1138; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask2: 1139; CHECK: # %bb.0: 1140; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 1141; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [0,7,7,0] 
1142; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 1143; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm1 {%k1} {z} 1144; CHECK-NEXT: vmovdqa %xmm1, %xmm0 1145; CHECK-NEXT: retq 1146 %vec = load <8 x i32>, <8 x i32>* %vp 1147 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 3, i32 3, i32 4> 1148 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1149 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer 1150 ret <4 x i32> %res 1151} 1152 1153define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp) { 1154; CHECK-LABEL: test_8xi32_to_4xi32_perm_mem_mask3: 1155; CHECK: # %bb.0: 1156; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm1 1157; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [5,1,2,7] 1158; CHECK-NEXT: vpermi2d 16(%rdi), %xmm1, %xmm0 1159; CHECK-NEXT: retq 1160 %vec = load <8 x i32>, <8 x i32>* %vp 1161 %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7> 1162 ret <4 x i32> %res 1163} 1164define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) { 1165; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask3: 1166; CHECK: # %bb.0: 1167; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm2 1168; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [5,1,2,7] 1169; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm3 1170; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 1171; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} 1172; CHECK-NEXT: retq 1173 %vec = load <8 x i32>, <8 x i32>* %vp 1174 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7> 1175 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1176 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 1177 ret <4 x i32> %res 1178} 1179 1180define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp, <4 x i32> %mask) { 1181; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask3: 1182; CHECK: # %bb.0: 1183; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm2 1184; CHECK-NEXT: vmovdqa {{.*#+}} 
xmm1 = [5,1,2,7] 1185; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 1186; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm1 {%k1} {z} 1187; CHECK-NEXT: vmovdqa %xmm1, %xmm0 1188; CHECK-NEXT: retq 1189 %vec = load <8 x i32>, <8 x i32>* %vp 1190 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7> 1191 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1192 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer 1193 ret <4 x i32> %res 1194} 1195 1196define <8 x i32> @test_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec) { 1197; CHECK-LABEL: test_16xi32_to_8xi32_perm_mask0: 1198; CHECK: # %bb.0: 1199; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [1,13,11,14,7,10,1,6,1,13,11,14,7,10,1,6] 1200; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 1201; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0 1202; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 1203; CHECK-NEXT: retq 1204 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6> 1205 ret <8 x i32> %res 1206} 1207define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { 1208; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask0: 1209; CHECK: # %bb.0: 1210; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [1,13,11,14,7,10,1,6,1,13,11,14,7,10,1,6] 1211; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] 1212; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0 1213; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 1214; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} 1215; CHECK-NEXT: retq 1216 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6> 1217 %cmp = icmp eq <8 x i32> %mask, zeroinitializer 1218 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 1219 ret <8 x i32> %res 1220} 1221 1222define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i32> %mask) { 1223; CHECK-LABEL: 
test_masked_z_16xi32_to_8xi32_perm_mask0: 1224; CHECK: # %bb.0: 1225; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [1,13,11,14,7,10,1,6] 1226; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 1227; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} 1228; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 1229; CHECK-NEXT: retq 1230 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6> 1231 %cmp = icmp eq <8 x i32> %mask, zeroinitializer 1232 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer 1233 ret <8 x i32> %res 1234} 1235define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { 1236; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask1: 1237; CHECK: # %bb.0: 1238; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [3,0,15,3,2,3,6,8,3,0,15,3,2,3,6,8] 1239; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] 1240; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0 1241; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 1242; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} 1243; CHECK-NEXT: retq 1244 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 3, i32 0, i32 15, i32 3, i32 2, i32 3, i32 6, i32 8> 1245 %cmp = icmp eq <8 x i32> %mask, zeroinitializer 1246 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 1247 ret <8 x i32> %res 1248} 1249 1250define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i32> %mask) { 1251; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask1: 1252; CHECK: # %bb.0: 1253; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,0,15,3,2,3,6,8] 1254; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 1255; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} 1256; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 1257; CHECK-NEXT: retq 1258 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 3, i32 0, i32 15, i32 3, i32 2, i32 3, i32 6, i32 8> 1259 %cmp = icmp eq <8 x i32> %mask, 
zeroinitializer 1260 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer 1261 ret <8 x i32> %res 1262} 1263define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { 1264; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask2: 1265; CHECK: # %bb.0: 1266; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,15,15,2,6,10,14,7,2,15,15,2,6,10,14,7] 1267; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] 1268; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0 1269; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 1270; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} 1271; CHECK-NEXT: retq 1272 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7> 1273 %cmp = icmp eq <8 x i32> %mask, zeroinitializer 1274 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 1275 ret <8 x i32> %res 1276} 1277 1278define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i32> %mask) { 1279; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask2: 1280; CHECK: # %bb.0: 1281; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,15,15,2,6,10,14,7] 1282; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 1283; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} 1284; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 1285; CHECK-NEXT: retq 1286 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7> 1287 %cmp = icmp eq <8 x i32> %mask, zeroinitializer 1288 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer 1289 ret <8 x i32> %res 1290} 1291define <8 x i32> @test_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec) { 1292; CHECK-LABEL: test_16xi32_to_8xi32_perm_mask3: 1293; CHECK: # %bb.0: 1294; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [14,5,7,7,10,3,9,3,14,5,7,7,10,3,9,3] 1295; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 1296; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0 1297; CHECK-NEXT: # kill: def 
$ymm0 killed $ymm0 killed $zmm0 1298; CHECK-NEXT: retq 1299 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3> 1300 ret <8 x i32> %res 1301} 1302define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { 1303; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask3: 1304; CHECK: # %bb.0: 1305; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [14,5,7,7,10,3,9,3,14,5,7,7,10,3,9,3] 1306; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] 1307; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0 1308; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 1309; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} 1310; CHECK-NEXT: retq 1311 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3> 1312 %cmp = icmp eq <8 x i32> %mask, zeroinitializer 1313 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 1314 ret <8 x i32> %res 1315} 1316 1317define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i32> %mask) { 1318; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask3: 1319; CHECK: # %bb.0: 1320; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [14,5,7,7,10,3,9,3] 1321; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 1322; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} 1323; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 1324; CHECK-NEXT: retq 1325 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3> 1326 %cmp = icmp eq <8 x i32> %mask, zeroinitializer 1327 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer 1328 ret <8 x i32> %res 1329} 1330define <4 x i32> @test_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec) { 1331; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask0: 1332; CHECK: # %bb.0: 1333; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,12,4,6,4,12] 1334; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 1335; CHECK-NEXT: # 
kill: def $xmm0 killed $xmm0 killed $zmm0 1336; CHECK-NEXT: vzeroupper 1337; CHECK-NEXT: retq 1338 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12> 1339 ret <4 x i32> %res 1340} 1341define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { 1342; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask0: 1343; CHECK: # %bb.0: 1344; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,12,4,6,4,12] 1345; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0 1346; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 1347; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} 1348; CHECK-NEXT: vzeroupper 1349; CHECK-NEXT: retq 1350 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12> 1351 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1352 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 1353 ret <4 x i32> %res 1354} 1355 1356define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %mask) { 1357; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask0: 1358; CHECK: # %bb.0: 1359; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,12,4,6,4,12] 1360; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 1361; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} 1362; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1363; CHECK-NEXT: vzeroupper 1364; CHECK-NEXT: retq 1365 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12> 1366 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1367 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer 1368 ret <4 x i32> %res 1369} 1370define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { 1371; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask1: 1372; CHECK: # %bb.0: 1373; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [5,1,3,4] 1374; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 1375; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0 
1376; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 1377; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} 1378; CHECK-NEXT: vzeroupper 1379; CHECK-NEXT: retq 1380 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 9, i32 11, i32 12> 1381 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1382 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 1383 ret <4 x i32> %res 1384} 1385 1386define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %mask) { 1387; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask1: 1388; CHECK: # %bb.0: 1389; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [5,1,3,4] 1390; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 1391; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 1392; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} 1393; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1394; CHECK-NEXT: vzeroupper 1395; CHECK-NEXT: retq 1396 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 9, i32 11, i32 12> 1397 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1398 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer 1399 ret <4 x i32> %res 1400} 1401define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { 1402; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask2: 1403; CHECK: # %bb.0: 1404; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,13,0] 1405; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0 1406; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 1407; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} 1408; CHECK-NEXT: vzeroupper 1409; CHECK-NEXT: retq 1410 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 1, i32 1, i32 13, i32 0> 1411 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1412 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 1413 ret <4 x i32> %res 1414} 1415 1416define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i32> %mask) { 1417; CHECK-LABEL: 
test_masked_z_16xi32_to_4xi32_perm_mask2: 1418; CHECK: # %bb.0: 1419; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,13,0] 1420; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 1421; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} 1422; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1423; CHECK-NEXT: vzeroupper 1424; CHECK-NEXT: retq 1425 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 1, i32 1, i32 13, i32 0> 1426 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1427 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer 1428 ret <4 x i32> %res 1429} 1430define <4 x i32> @test_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec) { 1431; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask3: 1432; CHECK: # %bb.0: 1433; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,0,0,13] 1434; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 1435; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1436; CHECK-NEXT: vzeroupper 1437; CHECK-NEXT: retq 1438 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13> 1439 ret <4 x i32> %res 1440} 1441define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { 1442; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask3: 1443; CHECK: # %bb.0: 1444; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [3,0,0,13] 1445; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0 1446; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 1447; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} 1448; CHECK-NEXT: vzeroupper 1449; CHECK-NEXT: retq 1450 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13> 1451 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1452 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 1453 ret <4 x i32> %res 1454} 1455 1456define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i32> %mask) { 1457; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask3: 1458; CHECK: # %bb.0: 1459; CHECK-NEXT: vmovdqa 
{{.*#+}} xmm2 = [3,0,0,13] 1460; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 1461; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} 1462; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1463; CHECK-NEXT: vzeroupper 1464; CHECK-NEXT: retq 1465 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13> 1466 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1467 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer 1468 ret <4 x i32> %res 1469} 1470define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask0(<16 x i32>* %vp) { 1471; CHECK-LABEL: test_16xi32_to_8xi32_perm_mem_mask0: 1472; CHECK: # %bb.0: 1473; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,6,0,1,2,4,4] 1474; CHECK-NEXT: vpermps 32(%rdi), %ymm0, %ymm0 1475; CHECK-NEXT: retq 1476 %vec = load <16 x i32>, <16 x i32>* %vp 1477 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12> 1478 ret <8 x i32> %res 1479} 1480define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask0(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) { 1481; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask0: 1482; CHECK: # %bb.0: 1483; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [7,0,6,0,1,2,4,4] 1484; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 1485; CHECK-NEXT: vpermd 32(%rdi), %ymm2, %ymm0 {%k1} 1486; CHECK-NEXT: retq 1487 %vec = load <16 x i32>, <16 x i32>* %vp 1488 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12> 1489 %cmp = icmp eq <8 x i32> %mask, zeroinitializer 1490 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 1491 ret <8 x i32> %res 1492} 1493 1494define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask0(<16 x i32>* %vp, <8 x i32> %mask) { 1495; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask0: 1496; CHECK: # %bb.0: 1497; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,0,6,0,1,2,4,4] 1498; CHECK-NEXT: vptestnmd 
%ymm0, %ymm0, %k1 1499; CHECK-NEXT: vpermd 32(%rdi), %ymm1, %ymm0 {%k1} {z} 1500; CHECK-NEXT: retq 1501 %vec = load <16 x i32>, <16 x i32>* %vp 1502 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12> 1503 %cmp = icmp eq <8 x i32> %mask, zeroinitializer 1504 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer 1505 ret <8 x i32> %res 1506} 1507 1508define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask1(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) { 1509; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask1: 1510; CHECK: # %bb.0: 1511; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 1512; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [7,3,6,11,0,1,5,15] 1513; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3 1514; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 1515; CHECK-NEXT: vmovdqa32 %ymm3, %ymm0 {%k1} 1516; CHECK-NEXT: retq 1517 %vec = load <16 x i32>, <16 x i32>* %vp 1518 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 11, i32 14, i32 3, i32 8, i32 9, i32 13, i32 7> 1519 %cmp = icmp eq <8 x i32> %mask, zeroinitializer 1520 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 1521 ret <8 x i32> %res 1522} 1523 1524define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask1(<16 x i32>* %vp, <8 x i32> %mask) { 1525; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask1: 1526; CHECK: # %bb.0: 1527; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 1528; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,3,6,11,0,1,5,15] 1529; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 1530; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z} 1531; CHECK-NEXT: vmovdqa %ymm1, %ymm0 1532; CHECK-NEXT: retq 1533 %vec = load <16 x i32>, <16 x i32>* %vp 1534 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 11, i32 14, i32 3, i32 8, i32 9, i32 13, i32 7> 1535 %cmp = icmp eq <8 x i32> %mask, zeroinitializer 1536 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> 
zeroinitializer 1537 ret <8 x i32> %res 1538} 1539 1540define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask2(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) { 1541; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask2: 1542; CHECK: # %bb.0: 1543; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 1544; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [4,14,1,5,4,2,8,10] 1545; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3 1546; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 1547; CHECK-NEXT: vmovdqa32 %ymm3, %ymm0 {%k1} 1548; CHECK-NEXT: retq 1549 %vec = load <16 x i32>, <16 x i32>* %vp 1550 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 12, i32 6, i32 9, i32 13, i32 12, i32 10, i32 0, i32 2> 1551 %cmp = icmp eq <8 x i32> %mask, zeroinitializer 1552 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 1553 ret <8 x i32> %res 1554} 1555 1556define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask2(<16 x i32>* %vp, <8 x i32> %mask) { 1557; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask2: 1558; CHECK: # %bb.0: 1559; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 1560; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,1,5,4,2,8,10] 1561; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 1562; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z} 1563; CHECK-NEXT: vmovdqa %ymm1, %ymm0 1564; CHECK-NEXT: retq 1565 %vec = load <16 x i32>, <16 x i32>* %vp 1566 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 12, i32 6, i32 9, i32 13, i32 12, i32 10, i32 0, i32 2> 1567 %cmp = icmp eq <8 x i32> %mask, zeroinitializer 1568 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer 1569 ret <8 x i32> %res 1570} 1571 1572define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask3(<16 x i32>* %vp) { 1573; CHECK-LABEL: test_16xi32_to_8xi32_perm_mem_mask3: 1574; CHECK: # %bb.0: 1575; CHECK-NEXT: vmovdqa (%rdi), %ymm1 1576; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [8,4,1,13,15,4,6,12] 1577; CHECK-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm0 1578; CHECK-NEXT: retq 1579 %vec 
= load <16 x i32>, <16 x i32>* %vp 1580 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12> 1581 ret <8 x i32> %res 1582} 1583define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask3(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) { 1584; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask3: 1585; CHECK: # %bb.0: 1586; CHECK-NEXT: vmovdqa (%rdi), %ymm2 1587; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [8,4,1,13,15,4,6,12] 1588; CHECK-NEXT: vpermi2d 32(%rdi), %ymm2, %ymm3 1589; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 1590; CHECK-NEXT: vmovdqa32 %ymm3, %ymm0 {%k1} 1591; CHECK-NEXT: retq 1592 %vec = load <16 x i32>, <16 x i32>* %vp 1593 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12> 1594 %cmp = icmp eq <8 x i32> %mask, zeroinitializer 1595 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 1596 ret <8 x i32> %res 1597} 1598 1599define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask3(<16 x i32>* %vp, <8 x i32> %mask) { 1600; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask3: 1601; CHECK: # %bb.0: 1602; CHECK-NEXT: vmovdqa (%rdi), %ymm2 1603; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [8,4,1,13,15,4,6,12] 1604; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 1605; CHECK-NEXT: vpermi2d 32(%rdi), %ymm2, %ymm1 {%k1} {z} 1606; CHECK-NEXT: vmovdqa %ymm1, %ymm0 1607; CHECK-NEXT: retq 1608 %vec = load <16 x i32>, <16 x i32>* %vp 1609 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12> 1610 %cmp = icmp eq <8 x i32> %mask, zeroinitializer 1611 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer 1612 ret <8 x i32> %res 1613} 1614 1615define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp) { 1616; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask0: 1617; CHECK: # %bb.0: 1618; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = 
[13,0,0,6] 1619; CHECK-NEXT: vmovdqa (%rdi), %ymm0 1620; CHECK-NEXT: vpermt2d 32(%rdi), %ymm1, %ymm0 1621; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1622; CHECK-NEXT: vzeroupper 1623; CHECK-NEXT: retq 1624 %vec = load <16 x i32>, <16 x i32>* %vp 1625 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6> 1626 ret <4 x i32> %res 1627} 1628define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) { 1629; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask0: 1630; CHECK: # %bb.0: 1631; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [13,0,0,6] 1632; CHECK-NEXT: vmovdqa (%rdi), %ymm3 1633; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3 1634; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 1635; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} 1636; CHECK-NEXT: vzeroupper 1637; CHECK-NEXT: retq 1638 %vec = load <16 x i32>, <16 x i32>* %vp 1639 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6> 1640 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1641 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 1642 ret <4 x i32> %res 1643} 1644 1645define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp, <4 x i32> %mask) { 1646; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask0: 1647; CHECK: # %bb.0: 1648; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [13,0,0,6] 1649; CHECK-NEXT: vmovdqa (%rdi), %ymm1 1650; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 1651; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm1 {%k1} {z} 1652; CHECK-NEXT: vmovdqa %xmm1, %xmm0 1653; CHECK-NEXT: vzeroupper 1654; CHECK-NEXT: retq 1655 %vec = load <16 x i32>, <16 x i32>* %vp 1656 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6> 1657 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1658 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer 1659 ret <4 x i32> %res 1660} 1661 1662define <4 x i32> 
@test_masked_16xi32_to_4xi32_perm_mem_mask1(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) { 1663; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask1: 1664; CHECK: # %bb.0: 1665; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 1666; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <15,5,3,2,u,u,u,u> 1667; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3 1668; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 1669; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} 1670; CHECK-NEXT: vzeroupper 1671; CHECK-NEXT: retq 1672 %vec = load <16 x i32>, <16 x i32>* %vp 1673 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 7, i32 13, i32 11, i32 10> 1674 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1675 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 1676 ret <4 x i32> %res 1677} 1678 1679define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask1(<16 x i32>* %vp, <4 x i32> %mask) { 1680; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask1: 1681; CHECK: # %bb.0: 1682; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 1683; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = <15,5,3,2,u,u,u,u> 1684; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 1685; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z} 1686; CHECK-NEXT: vmovdqa %xmm1, %xmm0 1687; CHECK-NEXT: vzeroupper 1688; CHECK-NEXT: retq 1689 %vec = load <16 x i32>, <16 x i32>* %vp 1690 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 7, i32 13, i32 11, i32 10> 1691 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1692 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer 1693 ret <4 x i32> %res 1694} 1695 1696define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask2(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) { 1697; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask2: 1698; CHECK: # %bb.0: 1699; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [2,15,6,9] 1700; CHECK-NEXT: vmovdqa (%rdi), %ymm3 1701; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3 1702; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 1703; CHECK-NEXT: 
vmovdqa32 %xmm3, %xmm0 {%k1} 1704; CHECK-NEXT: vzeroupper 1705; CHECK-NEXT: retq 1706 %vec = load <16 x i32>, <16 x i32>* %vp 1707 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 2, i32 15, i32 6, i32 9> 1708 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1709 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 1710 ret <4 x i32> %res 1711} 1712 1713define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask2(<16 x i32>* %vp, <4 x i32> %mask) { 1714; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask2: 1715; CHECK: # %bb.0: 1716; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [2,15,6,9] 1717; CHECK-NEXT: vmovdqa (%rdi), %ymm1 1718; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 1719; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm1 {%k1} {z} 1720; CHECK-NEXT: vmovdqa %xmm1, %xmm0 1721; CHECK-NEXT: vzeroupper 1722; CHECK-NEXT: retq 1723 %vec = load <16 x i32>, <16 x i32>* %vp 1724 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 2, i32 15, i32 6, i32 9> 1725 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1726 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer 1727 ret <4 x i32> %res 1728} 1729 1730define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp) { 1731; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask3: 1732; CHECK: # %bb.0: 1733; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1 1734; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [2,4,3,6] 1735; CHECK-NEXT: vpermi2d (%rdi), %xmm1, %xmm0 1736; CHECK-NEXT: retq 1737 %vec = load <16 x i32>, <16 x i32>* %vp 1738 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2> 1739 ret <4 x i32> %res 1740} 1741define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) { 1742; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask3: 1743; CHECK: # %bb.0: 1744; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 1745; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [2,4,3,6] 1746; CHECK-NEXT: vpermi2d 
(%rdi), %xmm2, %xmm3 1747; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 1748; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} 1749; CHECK-NEXT: retq 1750 %vec = load <16 x i32>, <16 x i32>* %vp 1751 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2> 1752 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1753 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 1754 ret <4 x i32> %res 1755} 1756 1757define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp, <4 x i32> %mask) { 1758; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask3: 1759; CHECK: # %bb.0: 1760; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 1761; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [2,4,3,6] 1762; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 1763; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm1 {%k1} {z} 1764; CHECK-NEXT: vmovdqa %xmm1, %xmm0 1765; CHECK-NEXT: retq 1766 %vec = load <16 x i32>, <16 x i32>* %vp 1767 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2> 1768 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1769 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer 1770 ret <4 x i32> %res 1771} 1772 1773define <4 x i32> @test_16xi32_to_4xi32_perm_mask9(<16 x i32> %vec) { 1774; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask9: 1775; CHECK: # %bb.0: 1776; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [12,9,4,10] 1777; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 1778; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1779; CHECK-NEXT: vzeroupper 1780; CHECK-NEXT: retq 1781 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 12, i32 9, i32 4, i32 10> 1782 ret <4 x i32> %res 1783} 1784 1785define <2 x i64> @test_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec) { 1786; CHECK-LABEL: test_4xi64_to_2xi64_perm_mask0: 1787; CHECK: # %bb.0: 1788; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,2,3] 1789; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1790; CHECK-NEXT: vzeroupper 1791; 
CHECK-NEXT: retq 1792 %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0> 1793 ret <2 x i64> %res 1794} 1795define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) { 1796; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mask0: 1797; CHECK: # %bb.0: 1798; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,0,2,3] 1799; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1 1800; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1} 1801; CHECK-NEXT: vzeroupper 1802; CHECK-NEXT: retq 1803 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0> 1804 %cmp = icmp eq <2 x i64> %mask, zeroinitializer 1805 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2 1806 ret <2 x i64> %res 1807} 1808 1809define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec, <2 x i64> %mask) { 1810; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mask0: 1811; CHECK: # %bb.0: 1812; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 1813; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,2,3] 1814; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1815; CHECK-NEXT: vzeroupper 1816; CHECK-NEXT: retq 1817 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0> 1818 %cmp = icmp eq <2 x i64> %mask, zeroinitializer 1819 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer 1820 ret <2 x i64> %res 1821} 1822define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) { 1823; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mask1: 1824; CHECK: # %bb.0: 1825; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] 1826; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1 1827; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1} 1828; CHECK-NEXT: vzeroupper 1829; CHECK-NEXT: retq 1830 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1> 1831 %cmp = icmp eq <2 x i64> %mask, zeroinitializer 1832 %res = select <2 x i1> %cmp, <2 x i64> 
%shuf, <2 x i64> %vec2 1833 ret <2 x i64> %res 1834} 1835 1836define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec, <2 x i64> %mask) { 1837; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mask1: 1838; CHECK: # %bb.0: 1839; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 1840; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,2,3] 1841; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1842; CHECK-NEXT: vzeroupper 1843; CHECK-NEXT: retq 1844 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1> 1845 %cmp = icmp eq <2 x i64> %mask, zeroinitializer 1846 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer 1847 ret <2 x i64> %res 1848} 1849define <2 x i64> @test_4xi64_to_2xi64_perm_mem_mask0(<4 x i64>* %vp) { 1850; CHECK-LABEL: test_4xi64_to_2xi64_perm_mem_mask0: 1851; CHECK: # %bb.0: 1852; CHECK-NEXT: vmovaps (%rdi), %xmm0 1853; CHECK-NEXT: vunpckhpd 16(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[1],mem[1] 1854; CHECK-NEXT: retq 1855 %vec = load <4 x i64>, <4 x i64>* %vp 1856 %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3> 1857 ret <2 x i64> %res 1858} 1859define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask0(<4 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) { 1860; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mem_mask0: 1861; CHECK: # %bb.0: 1862; CHECK-NEXT: vmovdqa (%rdi), %xmm2 1863; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 1864; CHECK-NEXT: vpunpckhqdq 16(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[1],mem[1] 1865; CHECK-NEXT: retq 1866 %vec = load <4 x i64>, <4 x i64>* %vp 1867 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3> 1868 %cmp = icmp eq <2 x i64> %mask, zeroinitializer 1869 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2 1870 ret <2 x i64> %res 1871} 1872 1873define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask0(<4 x i64>* %vp, <2 x i64> %mask) { 1874; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mem_mask0: 
1875; CHECK: # %bb.0: 1876; CHECK-NEXT: vmovdqa (%rdi), %xmm1 1877; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1 1878; CHECK-NEXT: vpunpckhqdq 16(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[1],mem[1] 1879; CHECK-NEXT: retq 1880 %vec = load <4 x i64>, <4 x i64>* %vp 1881 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3> 1882 %cmp = icmp eq <2 x i64> %mask, zeroinitializer 1883 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer 1884 ret <2 x i64> %res 1885} 1886 1887define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask1(<4 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) { 1888; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mem_mask1: 1889; CHECK: # %bb.0: 1890; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 1891; CHECK-NEXT: vpblendd $12, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[2,3] 1892; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 1893; CHECK-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1} 1894; CHECK-NEXT: retq 1895 %vec = load <4 x i64>, <4 x i64>* %vp 1896 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1> 1897 %cmp = icmp eq <2 x i64> %mask, zeroinitializer 1898 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2 1899 ret <2 x i64> %res 1900} 1901 1902define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask1(<4 x i64>* %vp, <2 x i64> %mask) { 1903; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mem_mask1: 1904; CHECK: # %bb.0: 1905; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1 1906; CHECK-NEXT: vpblendd $12, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[0,1],mem[2,3] 1907; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1 1908; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} 1909; CHECK-NEXT: retq 1910 %vec = load <4 x i64>, <4 x i64>* %vp 1911 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1> 1912 %cmp = icmp eq <2 x i64> %mask, zeroinitializer 1913 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer 1914 ret <2 x i64> %res 1915} 1916 1917define <4 x i64> 
@test_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec) { 1918; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask0: 1919; CHECK: # %bb.0: 1920; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0 1921; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,1] 1922; CHECK-NEXT: retq 1923 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5> 1924 ret <4 x i64> %res 1925} 1926define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { 1927; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask0: 1928; CHECK: # %bb.0: 1929; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 1930; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 1931; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,1] 1932; CHECK-NEXT: vmovdqa %ymm1, %ymm0 1933; CHECK-NEXT: retq 1934 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5> 1935 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 1936 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 1937 ret <4 x i64> %res 1938} 1939 1940define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i64> %mask) { 1941; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask0: 1942; CHECK: # %bb.0: 1943; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 1944; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 1945; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,1] 1946; CHECK-NEXT: retq 1947 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5> 1948 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 1949 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer 1950 ret <4 x i64> %res 1951} 1952define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { 1953; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask1: 1954; CHECK: # %bb.0: 1955; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [6,4,6,1,6,4,6,1] 1956; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] 1957; CHECK-NEXT: vpermq 
%zmm0, %zmm3, %zmm0 1958; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 1959; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} 1960; CHECK-NEXT: retq 1961 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 4, i32 6, i32 1> 1962 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 1963 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 1964 ret <4 x i64> %res 1965} 1966 1967define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %mask) { 1968; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask1: 1969; CHECK: # %bb.0: 1970; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [6,4,6,1] 1971; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 1972; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} 1973; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 1974; CHECK-NEXT: retq 1975 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 4, i32 6, i32 1> 1976 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 1977 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer 1978 ret <4 x i64> %res 1979} 1980define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { 1981; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask2: 1982; CHECK: # %bb.0: 1983; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,3,6,3,6,3,6,3] 1984; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 1985; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0 1986; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 1987; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} 1988; CHECK-NEXT: retq 1989 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 3, i32 6, i32 3> 1990 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 1991 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 1992 ret <4 x i64> %res 1993} 1994 1995define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %mask) { 1996; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask2: 1997; CHECK: # %bb.0: 
1998; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,3,6,3] 1999; CHECK-NEXT: # ymm2 = mem[0,1,0,1] 2000; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 2001; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} 2002; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 2003; CHECK-NEXT: retq 2004 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 3, i32 6, i32 3> 2005 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2006 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer 2007 ret <4 x i64> %res 2008} 2009define <4 x i64> @test_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec) { 2010; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask3: 2011; CHECK: # %bb.0: 2012; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [6,0,0,7,6,0,0,7] 2013; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 2014; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm0 2015; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 2016; CHECK-NEXT: retq 2017 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7> 2018 ret <4 x i64> %res 2019} 2020define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { 2021; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask3: 2022; CHECK: # %bb.0: 2023; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [6,0,0,7,6,0,0,7] 2024; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] 2025; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0 2026; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 2027; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} 2028; CHECK-NEXT: retq 2029 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7> 2030 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2031 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 2032 ret <4 x i64> %res 2033} 2034 2035define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64> %mask) { 2036; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask3: 2037; CHECK: # %bb.0: 2038; CHECK-NEXT: vmovdqa 
{{.*#+}} ymm2 = [6,0,0,7] 2039; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 2040; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} 2041; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 2042; CHECK-NEXT: retq 2043 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7> 2044 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2045 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer 2046 ret <4 x i64> %res 2047} 2048define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { 2049; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask4: 2050; CHECK: # %bb.0: 2051; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [3,7,7,5,3,7,7,5] 2052; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] 2053; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0 2054; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 2055; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} 2056; CHECK-NEXT: retq 2057 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5> 2058 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2059 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 2060 ret <4 x i64> %res 2061} 2062 2063define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %mask) { 2064; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask4: 2065; CHECK: # %bb.0: 2066; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,7,7,5] 2067; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 2068; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} 2069; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 2070; CHECK-NEXT: retq 2071 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5> 2072 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2073 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer 2074 ret <4 x i64> %res 2075} 2076define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { 2077; 
CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask5: 2078; CHECK: # %bb.0: 2079; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,1,0,6,4,1,0,6] 2080; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] 2081; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0 2082; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 2083; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} 2084; CHECK-NEXT: retq 2085 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 4, i32 1, i32 0, i32 6> 2086 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2087 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 2088 ret <4 x i64> %res 2089} 2090 2091define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64> %mask) { 2092; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask5: 2093; CHECK: # %bb.0: 2094; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [4,1,0,6] 2095; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 2096; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} 2097; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 2098; CHECK-NEXT: retq 2099 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 4, i32 1, i32 0, i32 6> 2100 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2101 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer 2102 ret <4 x i64> %res 2103} 2104define <4 x i64> @test_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec) { 2105; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask6: 2106; CHECK: # %bb.0: 2107; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [7,6,5,3,7,6,5,3] 2108; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 2109; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm0 2110; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 2111; CHECK-NEXT: retq 2112 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3> 2113 ret <4 x i64> %res 2114} 2115define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { 2116; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask6: 2117; CHECK: # %bb.0: 
2118; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [7,6,5,3,7,6,5,3] 2119; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] 2120; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0 2121; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 2122; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} 2123; CHECK-NEXT: retq 2124 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3> 2125 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2126 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 2127 ret <4 x i64> %res 2128} 2129 2130define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %mask) { 2131; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask6: 2132; CHECK: # %bb.0: 2133; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,3] 2134; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 2135; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} 2136; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 2137; CHECK-NEXT: retq 2138 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3> 2139 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2140 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer 2141 ret <4 x i64> %res 2142} 2143define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { 2144; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask7: 2145; CHECK: # %bb.0: 2146; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm3 2147; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,0,3,4] 2148; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm4 2149; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 2150; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1} 2151; CHECK-NEXT: retq 2152 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4> 2153 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2154 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 2155 ret <4 x i64> %res 2156} 2157 2158define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask7(<8 x 
i64> %vec, <4 x i64> %mask) { 2159; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask7: 2160; CHECK: # %bb.0: 2161; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm3 2162; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,0,3,4] 2163; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 2164; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm2 {%k1} {z} 2165; CHECK-NEXT: vmovdqa %ymm2, %ymm0 2166; CHECK-NEXT: retq 2167 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4> 2168 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2169 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer 2170 ret <4 x i64> %res 2171} 2172define <2 x i64> @test_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec) { 2173; CHECK-LABEL: test_8xi64_to_2xi64_perm_mask0: 2174; CHECK: # %bb.0: 2175; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 2176; CHECK-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] 2177; CHECK-NEXT: vzeroupper 2178; CHECK-NEXT: retq 2179 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0> 2180 ret <2 x i64> %res 2181} 2182define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) { 2183; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mask0: 2184; CHECK: # %bb.0: 2185; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3 2186; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1 2187; CHECK-NEXT: valignq {{.*#+}} xmm1 {%k1} = xmm3[1],xmm0[0] 2188; CHECK-NEXT: vmovdqa %xmm1, %xmm0 2189; CHECK-NEXT: vzeroupper 2190; CHECK-NEXT: retq 2191 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0> 2192 %cmp = icmp eq <2 x i64> %mask, zeroinitializer 2193 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2 2194 ret <2 x i64> %res 2195} 2196 2197define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64> %mask) { 2198; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask0: 2199; CHECK: # %bb.0: 2200; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2 
2201; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 2202; CHECK-NEXT: valignq {{.*#+}} xmm0 {%k1} {z} = xmm2[1],xmm0[0] 2203; CHECK-NEXT: vzeroupper 2204; CHECK-NEXT: retq 2205 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0> 2206 %cmp = icmp eq <2 x i64> %mask, zeroinitializer 2207 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer 2208 ret <2 x i64> %res 2209} 2210define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) { 2211; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mask1: 2212; CHECK: # %bb.0: 2213; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 2214; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] 2215; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1 2216; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1} 2217; CHECK-NEXT: vzeroupper 2218; CHECK-NEXT: retq 2219 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 5> 2220 %cmp = icmp eq <2 x i64> %mask, zeroinitializer 2221 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2 2222 ret <2 x i64> %res 2223} 2224 2225define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i64> %mask) { 2226; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask1: 2227; CHECK: # %bb.0: 2228; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 2229; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 2230; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,2,3] 2231; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2232; CHECK-NEXT: vzeroupper 2233; CHECK-NEXT: retq 2234 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 5> 2235 %cmp = icmp eq <2 x i64> %mask, zeroinitializer 2236 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer 2237 ret <2 x i64> %res 2238} 2239define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask0(<8 x i64>* %vp) { 2240; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask0: 2241; CHECK: # %bb.0: 2242; CHECK-NEXT: vpermpd $136, (%rdi), %ymm0 # 
ymm0 = mem[0,2,0,2] 2243; CHECK-NEXT: retq 2244 %vec = load <8 x i64>, <8 x i64>* %vp 2245 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2> 2246 ret <4 x i64> %res 2247} 2248define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask0(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) { 2249; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask0: 2250; CHECK: # %bb.0: 2251; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 2252; CHECK-NEXT: vpermq $136, (%rdi), %ymm0 {%k1} # ymm0 {%k1} = mem[0,2,0,2] 2253; CHECK-NEXT: retq 2254 %vec = load <8 x i64>, <8 x i64>* %vp 2255 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2> 2256 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2257 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 2258 ret <4 x i64> %res 2259} 2260 2261define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask0(<8 x i64>* %vp, <4 x i64> %mask) { 2262; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask0: 2263; CHECK: # %bb.0: 2264; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 2265; CHECK-NEXT: vpermq $136, (%rdi), %ymm0 {%k1} {z} # ymm0 {%k1} {z} = mem[0,2,0,2] 2266; CHECK-NEXT: retq 2267 %vec = load <8 x i64>, <8 x i64>* %vp 2268 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2> 2269 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2270 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer 2271 ret <4 x i64> %res 2272} 2273 2274define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask1(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) { 2275; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask1: 2276; CHECK: # %bb.0: 2277; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 2278; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,4] 2279; CHECK-NEXT: vpermi2q (%rdi), %ymm2, %ymm3 2280; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 2281; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} 2282; CHECK-NEXT: retq 2283 %vec = load <8 x i64>, <8 x 
i64>* %vp 2284 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 7, i32 6, i32 0> 2285 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2286 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 2287 ret <4 x i64> %res 2288} 2289 2290define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask1(<8 x i64>* %vp, <4 x i64> %mask) { 2291; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask1: 2292; CHECK: # %bb.0: 2293; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 2294; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,4] 2295; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 2296; CHECK-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z} 2297; CHECK-NEXT: vmovdqa %ymm1, %ymm0 2298; CHECK-NEXT: retq 2299 %vec = load <8 x i64>, <8 x i64>* %vp 2300 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 7, i32 6, i32 0> 2301 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2302 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer 2303 ret <4 x i64> %res 2304} 2305 2306define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) { 2307; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask2: 2308; CHECK: # %bb.0: 2309; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 2310; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [3,5,5,1] 2311; CHECK-NEXT: vpermi2q (%rdi), %ymm2, %ymm3 2312; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 2313; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} 2314; CHECK-NEXT: retq 2315 %vec = load <8 x i64>, <8 x i64>* %vp 2316 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 1, i32 1, i32 5> 2317 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2318 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 2319 ret <4 x i64> %res 2320} 2321 2322define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 x i64> %mask) { 2323; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2: 2324; CHECK: # %bb.0: 2325; CHECK-NEXT: vmovdqa 32(%rdi), 
%ymm2 2326; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,5,5,1] 2327; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 2328; CHECK-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z} 2329; CHECK-NEXT: vmovdqa %ymm1, %ymm0 2330; CHECK-NEXT: retq 2331 %vec = load <8 x i64>, <8 x i64>* %vp 2332 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 1, i32 1, i32 5> 2333 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2334 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer 2335 ret <4 x i64> %res 2336} 2337 2338define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp) { 2339; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask3: 2340; CHECK: # %bb.0: 2341; CHECK-NEXT: vmovdqa (%rdi), %ymm1 2342; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [7,0,0,2] 2343; CHECK-NEXT: vpermi2q 32(%rdi), %ymm1, %ymm0 2344; CHECK-NEXT: retq 2345 %vec = load <8 x i64>, <8 x i64>* %vp 2346 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2> 2347 ret <4 x i64> %res 2348} 2349define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) { 2350; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask3: 2351; CHECK: # %bb.0: 2352; CHECK-NEXT: vmovdqa (%rdi), %ymm2 2353; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [7,0,0,2] 2354; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3 2355; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 2356; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} 2357; CHECK-NEXT: retq 2358 %vec = load <8 x i64>, <8 x i64>* %vp 2359 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2> 2360 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2361 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 2362 ret <4 x i64> %res 2363} 2364 2365define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp, <4 x i64> %mask) { 2366; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask3: 2367; CHECK: # %bb.0: 2368; CHECK-NEXT: vmovdqa (%rdi), 
%ymm2 2369; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,0,0,2] 2370; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 2371; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z} 2372; CHECK-NEXT: vmovdqa %ymm1, %ymm0 2373; CHECK-NEXT: retq 2374 %vec = load <8 x i64>, <8 x i64>* %vp 2375 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2> 2376 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2377 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer 2378 ret <4 x i64> %res 2379} 2380 2381define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask4(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) { 2382; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask4: 2383; CHECK: # %bb.0: 2384; CHECK-NEXT: vmovdqa (%rdi), %ymm2 2385; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,4,6,1] 2386; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3 2387; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 2388; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} 2389; CHECK-NEXT: retq 2390 %vec = load <8 x i64>, <8 x i64>* %vp 2391 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 4, i32 6, i32 1> 2392 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2393 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 2394 ret <4 x i64> %res 2395} 2396 2397define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask4(<8 x i64>* %vp, <4 x i64> %mask) { 2398; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask4: 2399; CHECK: # %bb.0: 2400; CHECK-NEXT: vmovdqa (%rdi), %ymm2 2401; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,6,1] 2402; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 2403; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z} 2404; CHECK-NEXT: vmovdqa %ymm1, %ymm0 2405; CHECK-NEXT: retq 2406 %vec = load <8 x i64>, <8 x i64>* %vp 2407 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 4, i32 6, i32 1> 2408 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2409 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> 
zeroinitializer 2410 ret <4 x i64> %res 2411} 2412 2413define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask5(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) { 2414; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask5: 2415; CHECK: # %bb.0: 2416; CHECK-NEXT: vmovdqa (%rdi), %ymm2 2417; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,7,1] 2418; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3 2419; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 2420; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} 2421; CHECK-NEXT: retq 2422 %vec = load <8 x i64>, <8 x i64>* %vp 2423 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 7, i32 1> 2424 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2425 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 2426 ret <4 x i64> %res 2427} 2428 2429define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(<8 x i64>* %vp, <4 x i64> %mask) { 2430; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask5: 2431; CHECK: # %bb.0: 2432; CHECK-NEXT: vmovdqa (%rdi), %ymm2 2433; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,7,1] 2434; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 2435; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z} 2436; CHECK-NEXT: vmovdqa %ymm1, %ymm0 2437; CHECK-NEXT: retq 2438 %vec = load <8 x i64>, <8 x i64>* %vp 2439 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 7, i32 1> 2440 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2441 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer 2442 ret <4 x i64> %res 2443} 2444 2445define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp) { 2446; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask6: 2447; CHECK: # %bb.0: 2448; CHECK-NEXT: vmovdqa (%rdi), %ymm1 2449; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [7,2,3,2] 2450; CHECK-NEXT: vpermi2q 32(%rdi), %ymm1, %ymm0 2451; CHECK-NEXT: retq 2452 %vec = load <8 x i64>, <8 x i64>* %vp 2453 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, 
i32 2, i32 3, i32 2> 2454 ret <4 x i64> %res 2455} 2456define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) { 2457; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask6: 2458; CHECK: # %bb.0: 2459; CHECK-NEXT: vmovdqa (%rdi), %ymm2 2460; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [7,2,3,2] 2461; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3 2462; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 2463; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} 2464; CHECK-NEXT: retq 2465 %vec = load <8 x i64>, <8 x i64>* %vp 2466 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2> 2467 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2468 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 2469 ret <4 x i64> %res 2470} 2471 2472define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp, <4 x i64> %mask) { 2473; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask6: 2474; CHECK: # %bb.0: 2475; CHECK-NEXT: vmovdqa (%rdi), %ymm2 2476; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,2,3,2] 2477; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 2478; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z} 2479; CHECK-NEXT: vmovdqa %ymm1, %ymm0 2480; CHECK-NEXT: retq 2481 %vec = load <8 x i64>, <8 x i64>* %vp 2482 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2> 2483 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2484 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer 2485 ret <4 x i64> %res 2486} 2487 2488define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask7(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) { 2489; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask7: 2490; CHECK: # %bb.0: 2491; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 2492; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [3,3,1,5] 2493; CHECK-NEXT: vpermi2q (%rdi), %ymm2, %ymm3 2494; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 2495; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} 
2496; CHECK-NEXT: retq 2497 %vec = load <8 x i64>, <8 x i64>* %vp 2498 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 7, i32 5, i32 1> 2499 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2500 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 2501 ret <4 x i64> %res 2502} 2503 2504define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask7(<8 x i64>* %vp, <4 x i64> %mask) { 2505; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask7: 2506; CHECK: # %bb.0: 2507; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 2508; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,1,5] 2509; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 2510; CHECK-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z} 2511; CHECK-NEXT: vmovdqa %ymm1, %ymm0 2512; CHECK-NEXT: retq 2513 %vec = load <8 x i64>, <8 x i64>* %vp 2514 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 7, i32 5, i32 1> 2515 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2516 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer 2517 ret <4 x i64> %res 2518} 2519 2520define <2 x i64> @test_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp) { 2521; CHECK-LABEL: test_8xi64_to_2xi64_perm_mem_mask0: 2522; CHECK: # %bb.0: 2523; CHECK-NEXT: vmovaps 32(%rdi), %xmm0 2524; CHECK-NEXT: vblendps $12, (%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0,1],mem[2,3] 2525; CHECK-NEXT: retq 2526 %vec = load <8 x i64>, <8 x i64>* %vp 2527 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1> 2528 ret <2 x i64> %res 2529} 2530define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) { 2531; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0: 2532; CHECK: # %bb.0: 2533; CHECK-NEXT: vmovdqa 32(%rdi), %xmm2 2534; CHECK-NEXT: vpblendd $12, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[2,3] 2535; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 2536; CHECK-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1} 2537; CHECK-NEXT: retq 2538 %vec = load <8 x i64>, <8 
x i64>* %vp 2539 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1> 2540 %cmp = icmp eq <2 x i64> %mask, zeroinitializer 2541 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2 2542 ret <2 x i64> %res 2543} 2544 2545define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp, <2 x i64> %mask) { 2546; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0: 2547; CHECK: # %bb.0: 2548; CHECK-NEXT: vmovdqa 32(%rdi), %xmm1 2549; CHECK-NEXT: vpblendd $12, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[0,1],mem[2,3] 2550; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1 2551; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} 2552; CHECK-NEXT: retq 2553 %vec = load <8 x i64>, <8 x i64>* %vp 2554 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1> 2555 %cmp = icmp eq <2 x i64> %mask, zeroinitializer 2556 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer 2557 ret <2 x i64> %res 2558} 2559 2560define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask1(<8 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) { 2561; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask1: 2562; CHECK: # %bb.0: 2563; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 2564; CHECK-NEXT: vpunpcklqdq (%rdi), %ymm2, %ymm2 # ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] 2565; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2 2566; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 2567; CHECK-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1} 2568; CHECK-NEXT: vzeroupper 2569; CHECK-NEXT: retq 2570 %vec = load <8 x i64>, <8 x i64>* %vp 2571 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 2> 2572 %cmp = icmp eq <2 x i64> %mask, zeroinitializer 2573 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2 2574 ret <2 x i64> %res 2575} 2576 2577define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask1(<8 x i64>* %vp, <2 x i64> %mask) { 2578; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask1: 2579; CHECK: # %bb.0: 2580; CHECK-NEXT: vmovdqa 
32(%rdi), %ymm1 2581; CHECK-NEXT: vpunpcklqdq (%rdi), %ymm1, %ymm1 # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] 2582; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1 2583; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1 2584; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} 2585; CHECK-NEXT: vzeroupper 2586; CHECK-NEXT: retq 2587 %vec = load <8 x i64>, <8 x i64>* %vp 2588 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 2> 2589 %cmp = icmp eq <2 x i64> %mask, zeroinitializer 2590 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer 2591 ret <2 x i64> %res 2592} 2593 2594define <4 x float> @test_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec) { 2595; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mask0: 2596; CHECK: # %bb.0: 2597; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 2598; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,1] 2599; CHECK-NEXT: vzeroupper 2600; CHECK-NEXT: retq 2601 %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5> 2602 ret <4 x float> %res 2603} 2604define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) { 2605; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask0: 2606; CHECK: # %bb.0: 2607; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3 2608; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 2609; CHECK-NEXT: vcmpeqps %xmm4, %xmm2, %k1 2610; CHECK-NEXT: vshufps {{.*#+}} xmm1 {%k1} = xmm0[0,3],xmm3[0,1] 2611; CHECK-NEXT: vmovaps %xmm1, %xmm0 2612; CHECK-NEXT: vzeroupper 2613; CHECK-NEXT: retq 2614 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5> 2615 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 2616 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 2617 ret <4 x float> %res 2618} 2619 2620define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec, <4 x float> %mask) { 2621; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask0: 2622; 
CHECK: # %bb.0: 2623; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 2624; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 2625; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 2626; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3],xmm2[0,1] 2627; CHECK-NEXT: vzeroupper 2628; CHECK-NEXT: retq 2629 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5> 2630 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 2631 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer 2632 ret <4 x float> %res 2633} 2634define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) { 2635; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask1: 2636; CHECK: # %bb.0: 2637; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [1,3,5,0] 2638; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm0 2639; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 2640; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1 2641; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} 2642; CHECK-NEXT: vzeroupper 2643; CHECK-NEXT: retq 2644 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 0> 2645 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 2646 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 2647 ret <4 x float> %res 2648} 2649 2650define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, <4 x float> %mask) { 2651; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask1: 2652; CHECK: # %bb.0: 2653; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [1,3,5,0] 2654; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 2655; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 2656; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} 2657; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2658; CHECK-NEXT: vzeroupper 2659; CHECK-NEXT: retq 2660 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 0> 2661 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 2662 %res = 
select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer 2663 ret <4 x float> %res 2664} 2665define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) { 2666; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask2: 2667; CHECK: # %bb.0: 2668; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [3,2,7,0] 2669; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm0 2670; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 2671; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1 2672; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} 2673; CHECK-NEXT: vzeroupper 2674; CHECK-NEXT: retq 2675 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 2, i32 7, i32 0> 2676 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 2677 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 2678 ret <4 x float> %res 2679} 2680 2681define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, <4 x float> %mask) { 2682; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask2: 2683; CHECK: # %bb.0: 2684; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,2,7,0] 2685; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 2686; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 2687; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} 2688; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2689; CHECK-NEXT: vzeroupper 2690; CHECK-NEXT: retq 2691 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 2, i32 7, i32 0> 2692 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 2693 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer 2694 ret <4 x float> %res 2695} 2696define <4 x float> @test_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec) { 2697; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mask3: 2698; CHECK: # %bb.0: 2699; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,3,5,2] 2700; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 2701; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2702; CHECK-NEXT: vzeroupper 2703; 
CHECK-NEXT: retq 2704 %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 3, i32 5, i32 2> 2705 ret <4 x float> %res 2706} 2707define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) { 2708; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask3: 2709; CHECK: # %bb.0: 2710; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [3,3,5,2] 2711; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm0 2712; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 2713; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1 2714; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} 2715; CHECK-NEXT: vzeroupper 2716; CHECK-NEXT: retq 2717 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 3, i32 5, i32 2> 2718 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 2719 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 2720 ret <4 x float> %res 2721} 2722 2723define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec, <4 x float> %mask) { 2724; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask3: 2725; CHECK: # %bb.0: 2726; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,3,5,2] 2727; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 2728; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 2729; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} 2730; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2731; CHECK-NEXT: vzeroupper 2732; CHECK-NEXT: retq 2733 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 3, i32 5, i32 2> 2734 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 2735 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer 2736 ret <4 x float> %res 2737} 2738define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp) { 2739; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask0: 2740; CHECK: # %bb.0: 2741; CHECK-NEXT: vmovaps 16(%rdi), %xmm1 2742; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [2,6,0,1] 2743; CHECK-NEXT: vpermi2ps (%rdi), %xmm1, 
%xmm0 2744; CHECK-NEXT: retq 2745 %vec = load <8 x float>, <8 x float>* %vp 2746 %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5> 2747 ret <4 x float> %res 2748} 2749define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) { 2750; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask0: 2751; CHECK: # %bb.0: 2752; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 2753; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [2,6,0,1] 2754; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm3 2755; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 2756; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 2757; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} 2758; CHECK-NEXT: retq 2759 %vec = load <8 x float>, <8 x float>* %vp 2760 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5> 2761 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 2762 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 2763 ret <4 x float> %res 2764} 2765 2766define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp, <4 x float> %mask) { 2767; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0: 2768; CHECK: # %bb.0: 2769; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 2770; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [2,6,0,1] 2771; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 2772; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 2773; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z} 2774; CHECK-NEXT: vmovaps %xmm1, %xmm0 2775; CHECK-NEXT: retq 2776 %vec = load <8 x float>, <8 x float>* %vp 2777 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5> 2778 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 2779 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer 2780 ret <4 x float> %res 2781} 2782 2783define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask1(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) { 
2784; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask1: 2785; CHECK: # %bb.0: 2786; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 2787; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [2,7,7,2] 2788; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm3 2789; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 2790; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 2791; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} 2792; CHECK-NEXT: retq 2793 %vec = load <8 x float>, <8 x float>* %vp 2794 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 3, i32 3, i32 6> 2795 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 2796 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 2797 ret <4 x float> %res 2798} 2799 2800define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1(<8 x float>* %vp, <4 x float> %mask) { 2801; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1: 2802; CHECK: # %bb.0: 2803; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 2804; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [2,7,7,2] 2805; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 2806; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 2807; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z} 2808; CHECK-NEXT: vmovaps %xmm1, %xmm0 2809; CHECK-NEXT: retq 2810 %vec = load <8 x float>, <8 x float>* %vp 2811 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 3, i32 3, i32 6> 2812 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 2813 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer 2814 ret <4 x float> %res 2815} 2816 2817define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask2(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) { 2818; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask2: 2819; CHECK: # %bb.0: 2820; CHECK-NEXT: vmovaps (%rdi), %xmm2 2821; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [3,1,3,7] 2822; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3 2823; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 2824; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 2825; CHECK-NEXT: 
vmovaps %xmm3, %xmm0 {%k1} 2826; CHECK-NEXT: retq 2827 %vec = load <8 x float>, <8 x float>* %vp 2828 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 7> 2829 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 2830 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 2831 ret <4 x float> %res 2832} 2833 2834define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2(<8 x float>* %vp, <4 x float> %mask) { 2835; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2: 2836; CHECK: # %bb.0: 2837; CHECK-NEXT: vmovaps (%rdi), %xmm2 2838; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,1,3,7] 2839; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 2840; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 2841; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z} 2842; CHECK-NEXT: vmovaps %xmm1, %xmm0 2843; CHECK-NEXT: retq 2844 %vec = load <8 x float>, <8 x float>* %vp 2845 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 7> 2846 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 2847 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer 2848 ret <4 x float> %res 2849} 2850 2851define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %vp) { 2852; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask3: 2853; CHECK: # %bb.0: 2854; CHECK-NEXT: vmovaps (%rdi), %xmm1 2855; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [1,3,5,3] 2856; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm1, %xmm0 2857; CHECK-NEXT: retq 2858 %vec = load <8 x float>, <8 x float>* %vp 2859 %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3> 2860 ret <4 x float> %res 2861} 2862define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) { 2863; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask3: 2864; CHECK: # %bb.0: 2865; CHECK-NEXT: vmovaps (%rdi), %xmm2 2866; CHECK-NEXT: vmovaps {{.*#+}} 
xmm3 = [1,3,5,3] 2867; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3 2868; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 2869; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 2870; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} 2871; CHECK-NEXT: retq 2872 %vec = load <8 x float>, <8 x float>* %vp 2873 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3> 2874 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 2875 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 2876 ret <4 x float> %res 2877} 2878 2879define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %vp, <4 x float> %mask) { 2880; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3: 2881; CHECK: # %bb.0: 2882; CHECK-NEXT: vmovaps (%rdi), %xmm2 2883; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [1,3,5,3] 2884; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 2885; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 2886; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z} 2887; CHECK-NEXT: vmovaps %xmm1, %xmm0 2888; CHECK-NEXT: retq 2889 %vec = load <8 x float>, <8 x float>* %vp 2890 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3> 2891 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 2892 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer 2893 ret <4 x float> %res 2894} 2895 2896define <8 x float> @test_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec) { 2897; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mask0: 2898; CHECK: # %bb.0: 2899; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,4,12,10,8,2,11,7,0,4,12,10,8,2,11,7] 2900; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 2901; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0 2902; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 2903; CHECK-NEXT: retq 2904 %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 12, i32 10, i32 8, i32 2, i32 11, i32 7> 2905 ret <8 x float> %res 2906} 2907define <8 x float> 
@test_masked_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) { 2908; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask0: 2909; CHECK: # %bb.0: 2910; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,4,12,10,8,2,11,7,0,4,12,10,8,2,11,7] 2911; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] 2912; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0 2913; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 2914; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 2915; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} 2916; CHECK-NEXT: retq 2917 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 12, i32 10, i32 8, i32 2, i32 11, i32 7> 2918 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 2919 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 2920 ret <8 x float> %res 2921} 2922 2923define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec, <8 x float> %mask) { 2924; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask0: 2925; CHECK: # %bb.0: 2926; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,4,12,10,8,2,11,7] 2927; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 2928; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 2929; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} 2930; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 2931; CHECK-NEXT: retq 2932 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 12, i32 10, i32 8, i32 2, i32 11, i32 7> 2933 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 2934 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer 2935 ret <8 x float> %res 2936} 2937define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask1(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) { 2938; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask1: 2939; CHECK: # %bb.0: 2940; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [10,12,3,12,4,15,1,14,10,12,3,12,4,15,1,14] 2941; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] 2942; 
CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0 2943; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 2944; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 2945; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} 2946; CHECK-NEXT: retq 2947 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14> 2948 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 2949 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 2950 ret <8 x float> %res 2951} 2952 2953define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask1(<16 x float> %vec, <8 x float> %mask) { 2954; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask1: 2955; CHECK: # %bb.0: 2956; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [10,12,3,12,4,15,1,14] 2957; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 2958; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 2959; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} 2960; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 2961; CHECK-NEXT: retq 2962 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14> 2963 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 2964 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer 2965 ret <8 x float> %res 2966} 2967define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) { 2968; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask2: 2969; CHECK: # %bb.0: 2970; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [0,4,8,9,6,1,4,4] 2971; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0 2972; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 2973; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 2974; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} 2975; CHECK-NEXT: retq 2976 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4> 2977 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 2978 %res = select 
<8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 2979 ret <8 x float> %res 2980} 2981 2982define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %mask) { 2983; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask2: 2984; CHECK: # %bb.0: 2985; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,4,8,9,6,1,4,4] 2986; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 2987; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 2988; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} 2989; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 2990; CHECK-NEXT: retq 2991 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4> 2992 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 2993 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer 2994 ret <8 x float> %res 2995} 2996define <8 x float> @test_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec) { 2997; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mask3: 2998; CHECK: # %bb.0: 2999; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [12,14,9,0,12,4,5,8,12,14,9,0,12,4,5,8] 3000; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 3001; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0 3002; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 3003; CHECK-NEXT: retq 3004 %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 12, i32 14, i32 9, i32 0, i32 12, i32 4, i32 5, i32 8> 3005 ret <8 x float> %res 3006} 3007define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) { 3008; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask3: 3009; CHECK: # %bb.0: 3010; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [12,14,9,0,12,4,5,8,12,14,9,0,12,4,5,8] 3011; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] 3012; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0 3013; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 3014; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 3015; CHECK-NEXT: vblendmps %ymm0, %ymm1, 
%ymm0 {%k1} 3016; CHECK-NEXT: retq 3017 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 12, i32 14, i32 9, i32 0, i32 12, i32 4, i32 5, i32 8> 3018 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 3019 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 3020 ret <8 x float> %res 3021} 3022 3023define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec, <8 x float> %mask) { 3024; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask3: 3025; CHECK: # %bb.0: 3026; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [12,14,9,0,12,4,5,8] 3027; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 3028; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 3029; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} 3030; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 3031; CHECK-NEXT: retq 3032 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 12, i32 14, i32 9, i32 0, i32 12, i32 4, i32 5, i32 8> 3033 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 3034 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer 3035 ret <8 x float> %res 3036} 3037define <4 x float> @test_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec) { 3038; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mask0: 3039; CHECK: # %bb.0: 3040; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4,8,9,10] 3041; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 3042; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 3043; CHECK-NEXT: vzeroupper 3044; CHECK-NEXT: retq 3045 %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 4, i32 8, i32 9, i32 10> 3046 ret <4 x float> %res 3047} 3048define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) { 3049; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask0: 3050; CHECK: # %bb.0: 3051; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [4,8,9,10] 3052; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0 3053; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 3054; 
CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1 3055; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} 3056; CHECK-NEXT: vzeroupper 3057; CHECK-NEXT: retq 3058 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 4, i32 8, i32 9, i32 10> 3059 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 3060 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 3061 ret <4 x float> %res 3062} 3063 3064define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec, <4 x float> %mask) { 3065; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask0: 3066; CHECK: # %bb.0: 3067; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [4,8,9,10] 3068; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 3069; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 3070; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} 3071; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 3072; CHECK-NEXT: vzeroupper 3073; CHECK-NEXT: retq 3074 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 4, i32 8, i32 9, i32 10> 3075 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 3076 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer 3077 ret <4 x float> %res 3078} 3079define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) { 3080; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask1: 3081; CHECK: # %bb.0: 3082; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [8,6,10,6] 3083; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0 3084; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 3085; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1 3086; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} 3087; CHECK-NEXT: vzeroupper 3088; CHECK-NEXT: retq 3089 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 6, i32 10, i32 6> 3090 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 3091 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 3092 ret <4 x float> %res 3093} 3094 3095define <4 x float> 
@test_masked_z_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %mask) { 3096; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1: 3097; CHECK: # %bb.0: 3098; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [8,6,10,6] 3099; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 3100; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 3101; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} 3102; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 3103; CHECK-NEXT: vzeroupper 3104; CHECK-NEXT: retq 3105 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 6, i32 10, i32 6> 3106 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 3107 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer 3108 ret <4 x float> %res 3109} 3110define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask2(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) { 3111; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask2: 3112; CHECK: # %bb.0: 3113; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3 3114; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,0],ymm0[0,1],ymm3[4,4],ymm0[4,5] 3115; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 3116; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1 3117; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm1 {%k1} 3118; CHECK-NEXT: vmovaps %xmm1, %xmm0 3119; CHECK-NEXT: vzeroupper 3120; CHECK-NEXT: retq 3121 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 12, i32 4, i32 5> 3122 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 3123 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 3124 ret <4 x float> %res 3125} 3126 3127define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask2(<16 x float> %vec, <4 x float> %mask) { 3128; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask2: 3129; CHECK: # %bb.0: 3130; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2 3131; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,0],ymm0[0,1],ymm2[4,4],ymm0[4,5] 3132; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 3133; CHECK-NEXT: 
vcmpeqps %xmm2, %xmm1, %k1 3134; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm0 {%k1} {z} 3135; CHECK-NEXT: vzeroupper 3136; CHECK-NEXT: retq 3137 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 12, i32 4, i32 5> 3138 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 3139 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer 3140 ret <4 x float> %res 3141} 3142define <4 x float> @test_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec) { 3143; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mask3: 3144; CHECK: # %bb.0: 3145; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [10,2,11,6] 3146; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 3147; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 3148; CHECK-NEXT: vzeroupper 3149; CHECK-NEXT: retq 3150 %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6> 3151 ret <4 x float> %res 3152} 3153define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) { 3154; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask3: 3155; CHECK: # %bb.0: 3156; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [10,2,11,6] 3157; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0 3158; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 3159; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1 3160; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} 3161; CHECK-NEXT: vzeroupper 3162; CHECK-NEXT: retq 3163 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6> 3164 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 3165 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 3166 ret <4 x float> %res 3167} 3168 3169define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %mask) { 3170; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask3: 3171; CHECK: # %bb.0: 3172; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [10,2,11,6] 3173; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 
3174; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 3175; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} 3176; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 3177; CHECK-NEXT: vzeroupper 3178; CHECK-NEXT: retq 3179 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6> 3180 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 3181 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer 3182 ret <4 x float> %res 3183} 3184define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask0(<16 x float>* %vp) { 3185; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mem_mask0: 3186; CHECK: # %bb.0: 3187; CHECK-NEXT: vmovaps (%rdi), %ymm1 3188; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [7,6,7,11,5,10,0,4] 3189; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm1, %ymm0 3190; CHECK-NEXT: retq 3191 %vec = load <16 x float>, <16 x float>* %vp 3192 %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 7, i32 6, i32 7, i32 11, i32 5, i32 10, i32 0, i32 4> 3193 ret <8 x float> %res 3194} 3195define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask0(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) { 3196; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask0: 3197; CHECK: # %bb.0: 3198; CHECK-NEXT: vmovaps (%rdi), %ymm2 3199; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [7,6,7,11,5,10,0,4] 3200; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm3 3201; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 3202; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 3203; CHECK-NEXT: vmovaps %ymm3, %ymm0 {%k1} 3204; CHECK-NEXT: retq 3205 %vec = load <16 x float>, <16 x float>* %vp 3206 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 7, i32 6, i32 7, i32 11, i32 5, i32 10, i32 0, i32 4> 3207 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 3208 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 3209 ret <8 x float> %res 3210} 3211 3212define <8 x float> 
@test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0(<16 x float>* %vp, <8 x float> %mask) { 3213; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0: 3214; CHECK: # %bb.0: 3215; CHECK-NEXT: vmovaps (%rdi), %ymm2 3216; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,7,11,5,10,0,4] 3217; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 3218; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1 3219; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z} 3220; CHECK-NEXT: vmovaps %ymm1, %ymm0 3221; CHECK-NEXT: retq 3222 %vec = load <16 x float>, <16 x float>* %vp 3223 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 7, i32 6, i32 7, i32 11, i32 5, i32 10, i32 0, i32 4> 3224 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 3225 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer 3226 ret <8 x float> %res 3227} 3228 3229define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask1(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) { 3230; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask1: 3231; CHECK: # %bb.0: 3232; CHECK-NEXT: vmovaps (%rdi), %ymm2 3233; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [11,0,9,0,7,14,0,8] 3234; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm3 3235; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 3236; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 3237; CHECK-NEXT: vmovaps %ymm3, %ymm0 {%k1} 3238; CHECK-NEXT: retq 3239 %vec = load <16 x float>, <16 x float>* %vp 3240 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 11, i32 0, i32 9, i32 0, i32 7, i32 14, i32 0, i32 8> 3241 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 3242 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 3243 ret <8 x float> %res 3244} 3245 3246define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1(<16 x float>* %vp, <8 x float> %mask) { 3247; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1: 3248; CHECK: # %bb.0: 3249; CHECK-NEXT: vmovaps (%rdi), %ymm2 3250; CHECK-NEXT: 
vmovaps {{.*#+}} ymm1 = [11,0,9,0,7,14,0,8] 3251; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 3252; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1 3253; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z} 3254; CHECK-NEXT: vmovaps %ymm1, %ymm0 3255; CHECK-NEXT: retq 3256 %vec = load <16 x float>, <16 x float>* %vp 3257 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 11, i32 0, i32 9, i32 0, i32 7, i32 14, i32 0, i32 8> 3258 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 3259 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer 3260 ret <8 x float> %res 3261} 3262 3263define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask2(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) { 3264; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2: 3265; CHECK: # %bb.0: 3266; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 3267; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [9,5,2,3,2,8,8,1] 3268; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3 3269; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 3270; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 3271; CHECK-NEXT: vmovaps %ymm3, %ymm0 {%k1} 3272; CHECK-NEXT: retq 3273 %vec = load <16 x float>, <16 x float>* %vp 3274 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 1, i32 13, i32 10, i32 11, i32 10, i32 0, i32 0, i32 9> 3275 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 3276 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 3277 ret <8 x float> %res 3278} 3279 3280define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2(<16 x float>* %vp, <8 x float> %mask) { 3281; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2: 3282; CHECK: # %bb.0: 3283; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 3284; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [9,5,2,3,2,8,8,1] 3285; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 3286; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1 3287; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z} 3288; CHECK-NEXT: vmovaps %ymm1, %ymm0 3289; 
CHECK-NEXT: retq 3290 %vec = load <16 x float>, <16 x float>* %vp 3291 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 1, i32 13, i32 10, i32 11, i32 10, i32 0, i32 0, i32 9> 3292 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 3293 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer 3294 ret <8 x float> %res 3295} 3296 3297define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask3(<16 x float>* %vp) { 3298; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mem_mask3: 3299; CHECK: # %bb.0: 3300; CHECK-NEXT: vmovaps 32(%rdi), %ymm1 3301; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [7,5,3,3,11,4,12,9] 3302; CHECK-NEXT: vpermi2ps (%rdi), %ymm1, %ymm0 3303; CHECK-NEXT: retq 3304 %vec = load <16 x float>, <16 x float>* %vp 3305 %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1> 3306 ret <8 x float> %res 3307} 3308define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask3(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) { 3309; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask3: 3310; CHECK: # %bb.0: 3311; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 3312; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [7,5,3,3,11,4,12,9] 3313; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3 3314; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 3315; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 3316; CHECK-NEXT: vmovaps %ymm3, %ymm0 {%k1} 3317; CHECK-NEXT: retq 3318 %vec = load <16 x float>, <16 x float>* %vp 3319 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1> 3320 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 3321 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 3322 ret <8 x float> %res 3323} 3324 3325define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3(<16 x float>* %vp, <8 x float> %mask) { 3326; CHECK-LABEL: 
test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3: 3327; CHECK: # %bb.0: 3328; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 3329; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [7,5,3,3,11,4,12,9] 3330; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 3331; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1 3332; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z} 3333; CHECK-NEXT: vmovaps %ymm1, %ymm0 3334; CHECK-NEXT: retq 3335 %vec = load <16 x float>, <16 x float>* %vp 3336 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1> 3337 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 3338 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer 3339 ret <8 x float> %res 3340} 3341 3342define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp) { 3343; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask0: 3344; CHECK: # %bb.0: 3345; CHECK-NEXT: vpermpd $231, 32(%rdi), %ymm1 # ymm1 = mem[3,1,2,3] 3346; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [0,6,7,3] 3347; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm1, %xmm0 3348; CHECK-NEXT: vzeroupper 3349; CHECK-NEXT: retq 3350 %vec = load <16 x float>, <16 x float>* %vp 3351 %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 14, i32 6, i32 7, i32 11> 3352 ret <4 x float> %res 3353} 3354define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) { 3355; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask0: 3356; CHECK: # %bb.0: 3357; CHECK-NEXT: vpermpd $231, 32(%rdi), %ymm2 # ymm2 = mem[3,1,2,3] 3358; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [0,6,7,3] 3359; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3 3360; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 3361; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 3362; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} 3363; CHECK-NEXT: vzeroupper 3364; CHECK-NEXT: retq 3365 %vec = load <16 x float>, <16 x float>* %vp 3366 %shuf = shufflevector <16 x float> 
%vec, <16 x float> undef, <4 x i32> <i32 14, i32 6, i32 7, i32 11> 3367 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 3368 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 3369 ret <4 x float> %res 3370} 3371 3372define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp, <4 x float> %mask) { 3373; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0: 3374; CHECK: # %bb.0: 3375; CHECK-NEXT: vpermpd $231, 32(%rdi), %ymm2 # ymm2 = mem[3,1,2,3] 3376; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [0,6,7,3] 3377; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 3378; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 3379; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z} 3380; CHECK-NEXT: vmovaps %xmm1, %xmm0 3381; CHECK-NEXT: vzeroupper 3382; CHECK-NEXT: retq 3383 %vec = load <16 x float>, <16 x float>* %vp 3384 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 14, i32 6, i32 7, i32 11> 3385 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 3386 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer 3387 ret <4 x float> %res 3388} 3389 3390define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask1(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) { 3391; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask1: 3392; CHECK: # %bb.0: 3393; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 3394; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = <0,10,6,15,u,u,u,u> 3395; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3 3396; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 3397; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 3398; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} 3399; CHECK-NEXT: vzeroupper 3400; CHECK-NEXT: retq 3401 %vec = load <16 x float>, <16 x float>* %vp 3402 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 2, i32 14, i32 7> 3403 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 3404 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 3405 ret <4 x float> %res 
3406} 3407 3408define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1(<16 x float>* %vp, <4 x float> %mask) { 3409; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1: 3410; CHECK: # %bb.0: 3411; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 3412; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = <0,10,6,15,u,u,u,u> 3413; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 3414; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 3415; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z} 3416; CHECK-NEXT: vmovaps %xmm1, %xmm0 3417; CHECK-NEXT: vzeroupper 3418; CHECK-NEXT: retq 3419 %vec = load <16 x float>, <16 x float>* %vp 3420 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 2, i32 14, i32 7> 3421 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 3422 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer 3423 ret <4 x float> %res 3424} 3425 3426define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) { 3427; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask2: 3428; CHECK: # %bb.0: 3429; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 3430; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm3 = [60129542148,60129542148,60129542148,60129542148] 3431; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3 3432; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 3433; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 3434; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} 3435; CHECK-NEXT: vzeroupper 3436; CHECK-NEXT: retq 3437 %vec = load <16 x float>, <16 x float>* %vp 3438 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 6, i32 12, i32 6> 3439 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 3440 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 3441 ret <4 x float> %res 3442} 3443 3444define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float>* %vp, <4 x float> %mask) { 3445; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2: 3446; CHECK: # 
%bb.0: 3447; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 3448; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm1 = [60129542148,60129542148,60129542148,60129542148] 3449; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 3450; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 3451; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z} 3452; CHECK-NEXT: vmovaps %xmm1, %xmm0 3453; CHECK-NEXT: vzeroupper 3454; CHECK-NEXT: retq 3455 %vec = load <16 x float>, <16 x float>* %vp 3456 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 6, i32 12, i32 6> 3457 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 3458 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer 3459 ret <4 x float> %res 3460} 3461 3462define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp) { 3463; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask3: 3464; CHECK: # %bb.0: 3465; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,3,15,9] 3466; CHECK-NEXT: vmovaps (%rdi), %ymm0 3467; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm1, %ymm0 3468; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3469; CHECK-NEXT: vzeroupper 3470; CHECK-NEXT: retq 3471 %vec = load <16 x float>, <16 x float>* %vp 3472 %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 3, i32 3, i32 15, i32 9> 3473 ret <4 x float> %res 3474} 3475define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) { 3476; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask3: 3477; CHECK: # %bb.0: 3478; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,3,15,9] 3479; CHECK-NEXT: vmovaps (%rdi), %ymm3 3480; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm2, %ymm3 3481; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 3482; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 3483; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} 3484; CHECK-NEXT: vzeroupper 3485; CHECK-NEXT: retq 3486 %vec = load <16 x float>, <16 x float>* %vp 3487 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x 
i32> <i32 3, i32 3, i32 15, i32 9> 3488 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 3489 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 3490 ret <4 x float> %res 3491} 3492 3493define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp, <4 x float> %mask) { 3494; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3: 3495; CHECK: # %bb.0: 3496; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,3,15,9] 3497; CHECK-NEXT: vmovaps (%rdi), %ymm1 3498; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 3499; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 3500; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z} 3501; CHECK-NEXT: vmovaps %xmm1, %xmm0 3502; CHECK-NEXT: vzeroupper 3503; CHECK-NEXT: retq 3504 %vec = load <16 x float>, <16 x float>* %vp 3505 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 3, i32 3, i32 15, i32 9> 3506 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 3507 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer 3508 ret <4 x float> %res 3509} 3510 3511define <2 x double> @test_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec) { 3512; CHECK-LABEL: test_4xdouble_to_2xdouble_perm_mask0: 3513; CHECK: # %bb.0: 3514; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,2,3] 3515; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3516; CHECK-NEXT: vzeroupper 3517; CHECK-NEXT: retq 3518 %res = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0> 3519 ret <2 x double> %res 3520} 3521define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec, <2 x double> %vec2, <2 x double> %mask) { 3522; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mask0: 3523; CHECK: # %bb.0: 3524; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,2,3] 3525; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 3526; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1 3527; CHECK-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1} 3528; CHECK-NEXT: vzeroupper 3529; CHECK-NEXT: retq 3530 
%shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0> 3531 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer 3532 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2 3533 ret <2 x double> %res 3534} 3535 3536define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec, <2 x double> %mask) { 3537; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mask0: 3538; CHECK: # %bb.0: 3539; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 3540; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1 3541; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,2,3] 3542; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3543; CHECK-NEXT: vzeroupper 3544; CHECK-NEXT: retq 3545 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0> 3546 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer 3547 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer 3548 ret <2 x double> %res 3549} 3550define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mask1(<4 x double> %vec, <2 x double> %vec2, <2 x double> %mask) { 3551; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mask1: 3552; CHECK: # %bb.0: 3553; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,3,2,3] 3554; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 3555; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1 3556; CHECK-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1} 3557; CHECK-NEXT: vzeroupper 3558; CHECK-NEXT: retq 3559 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 1, i32 3> 3560 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer 3561 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2 3562 ret <2 x double> %res 3563} 3564 3565define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mask1(<4 x double> %vec, <2 x double> %mask) { 3566; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mask1: 3567; CHECK: # %bb.0: 3568; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 3569; CHECK-NEXT: vcmpeqpd %xmm2, 
%xmm1, %k1 3570; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,2,3] 3571; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3572; CHECK-NEXT: vzeroupper 3573; CHECK-NEXT: retq 3574 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 1, i32 3> 3575 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer 3576 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer 3577 ret <2 x double> %res 3578} 3579define <2 x double> @test_4xdouble_to_2xdouble_perm_mem_mask0(<4 x double>* %vp) { 3580; CHECK-LABEL: test_4xdouble_to_2xdouble_perm_mem_mask0: 3581; CHECK: # %bb.0: 3582; CHECK-NEXT: vmovaps (%rdi), %xmm0 3583; CHECK-NEXT: vblendps $3, 16(%rdi), %xmm0, %xmm0 # xmm0 = mem[0,1],xmm0[2,3] 3584; CHECK-NEXT: retq 3585 %vec = load <4 x double>, <4 x double>* %vp 3586 %res = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1> 3587 ret <2 x double> %res 3588} 3589define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask0(<4 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) { 3590; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mem_mask0: 3591; CHECK: # %bb.0: 3592; CHECK-NEXT: vmovapd (%rdi), %xmm2 3593; CHECK-NEXT: vblendpd $1, 16(%rdi), %xmm2, %xmm2 # xmm2 = mem[0],xmm2[1] 3594; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 3595; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1 3596; CHECK-NEXT: vmovapd %xmm2, %xmm0 {%k1} 3597; CHECK-NEXT: retq 3598 %vec = load <4 x double>, <4 x double>* %vp 3599 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1> 3600 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer 3601 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2 3602 ret <2 x double> %res 3603} 3604 3605define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0(<4 x double>* %vp, <2 x double> %mask) { 3606; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0: 3607; CHECK: # %bb.0: 3608; CHECK-NEXT: vmovapd (%rdi), %xmm1 
3609; CHECK-NEXT: vblendpd $1, 16(%rdi), %xmm1, %xmm1 # xmm1 = mem[0],xmm1[1] 3610; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 3611; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1 3612; CHECK-NEXT: vmovapd %xmm1, %xmm0 {%k1} {z} 3613; CHECK-NEXT: retq 3614 %vec = load <4 x double>, <4 x double>* %vp 3615 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1> 3616 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer 3617 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer 3618 ret <2 x double> %res 3619} 3620 3621define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask1(<4 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) { 3622; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mem_mask1: 3623; CHECK: # %bb.0: 3624; CHECK-NEXT: vmovapd 16(%rdi), %xmm2 3625; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 3626; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1 3627; CHECK-NEXT: vunpcklpd (%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0] 3628; CHECK-NEXT: retq 3629 %vec = load <4 x double>, <4 x double>* %vp 3630 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0> 3631 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer 3632 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2 3633 ret <2 x double> %res 3634} 3635 3636define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1(<4 x double>* %vp, <2 x double> %mask) { 3637; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1: 3638; CHECK: # %bb.0: 3639; CHECK-NEXT: vmovapd 16(%rdi), %xmm1 3640; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 3641; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1 3642; CHECK-NEXT: vunpcklpd (%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0] 3643; CHECK-NEXT: retq 3644 %vec = load <4 x double>, <4 x double>* %vp 3645 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0> 3646 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer 3647 %res = 
select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer 3648 ret <2 x double> %res 3649} 3650 3651define <4 x double> @test_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec) { 3652; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask0: 3653; CHECK: # %bb.0: 3654; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,3,7,3,7,3,7,3] 3655; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3656; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm0 3657; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 3658; CHECK-NEXT: retq 3659 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3> 3660 ret <4 x double> %res 3661} 3662define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) { 3663; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask0: 3664; CHECK: # %bb.0: 3665; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [7,3,7,3,7,3,7,3] 3666; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 3667; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0 3668; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 3669; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 3670; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} 3671; CHECK-NEXT: retq 3672 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3> 3673 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 3674 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2 3675 ret <4 x double> %res 3676} 3677 3678define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %mask) { 3679; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask0: 3680; CHECK: # %bb.0: 3681; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,3,7,3] 3682; CHECK-NEXT: # ymm2 = mem[0,1,0,1] 3683; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 3684; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 3685; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} 3686; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 
killed $zmm0 3687; CHECK-NEXT: retq 3688 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3> 3689 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 3690 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer 3691 ret <4 x double> %res 3692} 3693define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask1(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) { 3694; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask1: 3695; CHECK: # %bb.0: 3696; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,0,7,6,2,0,7,6] 3697; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] 3698; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0 3699; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 3700; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 3701; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} 3702; CHECK-NEXT: retq 3703 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 0, i32 7, i32 6> 3704 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 3705 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2 3706 ret <4 x double> %res 3707} 3708 3709define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask1(<8 x double> %vec, <4 x double> %mask) { 3710; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask1: 3711; CHECK: # %bb.0: 3712; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [2,0,7,6] 3713; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 3714; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 3715; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} 3716; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 3717; CHECK-NEXT: retq 3718 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 0, i32 7, i32 6> 3719 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 3720 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer 3721 ret <4 x double> %res 3722} 3723define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask2(<8 x double> %vec, <4 x double> %vec2, 
<4 x double> %mask) { 3724; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask2: 3725; CHECK: # %bb.0: 3726; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 3727; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 3728; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,0] 3729; CHECK-NEXT: vmovapd %ymm1, %ymm0 3730; CHECK-NEXT: retq 3731 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 0> 3732 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 3733 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2 3734 ret <4 x double> %res 3735} 3736 3737define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask2(<8 x double> %vec, <4 x double> %mask) { 3738; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask2: 3739; CHECK: # %bb.0: 3740; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 3741; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 3742; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,0] 3743; CHECK-NEXT: retq 3744 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 0> 3745 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 3746 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer 3747 ret <4 x double> %res 3748} 3749define <4 x double> @test_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec) { 3750; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask3: 3751; CHECK: # %bb.0: 3752; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,2,1,4,0,2,1,4] 3753; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 3754; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm0 3755; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 3756; CHECK-NEXT: retq 3757 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 4> 3758 ret <4 x double> %res 3759} 3760define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) { 3761; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask3: 3762; 
CHECK: # %bb.0: 3763; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,2,1,4,0,2,1,4] 3764; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] 3765; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0 3766; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 3767; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 3768; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} 3769; CHECK-NEXT: retq 3770 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 4> 3771 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 3772 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2 3773 ret <4 x double> %res 3774} 3775 3776define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec, <4 x double> %mask) { 3777; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask3: 3778; CHECK: # %bb.0: 3779; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [0,2,1,4] 3780; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 3781; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 3782; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} 3783; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 3784; CHECK-NEXT: retq 3785 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 4> 3786 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 3787 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer 3788 ret <4 x double> %res 3789} 3790define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) { 3791; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask4: 3792; CHECK: # %bb.0: 3793; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm3 3794; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [1,1,5,5] 3795; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm4 3796; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0 3797; CHECK-NEXT: vcmpeqpd %ymm0, %ymm2, %k1 3798; CHECK-NEXT: vblendmpd %ymm4, %ymm1, %ymm0 {%k1} 3799; CHECK-NEXT: retq 3800 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 1, 
i32 5, i32 5> 3801 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 3802 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2 3803 ret <4 x double> %res 3804} 3805 3806define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %mask) { 3807; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4: 3808; CHECK: # %bb.0: 3809; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm3 3810; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [1,1,5,5] 3811; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4 3812; CHECK-NEXT: vcmpeqpd %ymm4, %ymm1, %k1 3813; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm2 {%k1} {z} 3814; CHECK-NEXT: vmovapd %ymm2, %ymm0 3815; CHECK-NEXT: retq 3816 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 1, i32 5, i32 5> 3817 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 3818 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer 3819 ret <4 x double> %res 3820} 3821define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask5(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) { 3822; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask5: 3823; CHECK: # %bb.0: 3824; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,6,2,2,2,6,2,2] 3825; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] 3826; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0 3827; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 3828; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 3829; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} 3830; CHECK-NEXT: retq 3831 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 6, i32 2, i32 2> 3832 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 3833 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2 3834 ret <4 x double> %res 3835} 3836 3837define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask5(<8 x double> %vec, <4 x double> %mask) { 3838; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask5: 3839; CHECK: # %bb.0: 3840; 
CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [2,6,2,2]
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 6, i32 2, i32 2>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
  ret <4 x double> %res
}
; Unmasked: selects elements <5,0,7,0> of the <8 x double> argument into a <4 x double> result.
define <4 x double> @test_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec) {
; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [5,8,7,8,5,8,7,8]
; CHECK-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
; CHECK-NEXT:    vpermt2pd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT:    retq
  %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
  ret <4 x double> %res
}
; Merge-masked: lanes whose %mask element compares oeq to zero take shuffle <5,0,7,0>; other lanes take %vec2.
define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [5,8,7,8,5,8,7,8]
; CHECK-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
; CHECK-NEXT:    vpermi2pd %zmm0, %zmm0, %zmm3
; CHECK-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    vcmpeqpd %ymm0, %ymm2, %k1
; CHECK-NEXT:    vblendmpd %ymm3, %ymm1, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
  ret <4 x double> %res
}

; Zero-masked: lanes whose %mask element compares oeq to zero take shuffle <5,0,7,0>; other lanes are zero.
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd {{.*#+}} ymm2 = [5,8,7,8]
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
; CHECK-NEXT:    vpermt2pd %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
  ret <4 x double> %res
}
; Merge-masked: lanes whose %mask element compares oeq to zero take shuffle <3,5,0,6>; other lanes take %vec2.
define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask7:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [3,5,0,6,3,5,0,6]
; CHECK-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm0
; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
; CHECK-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 5, i32 0, i32 6>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
  ret <4 x double> %res
}

; Zero-masked: lanes whose %mask element compares oeq to zero take shuffle <3,5,0,6>; other lanes are zero.
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask7:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd {{.*#+}} ymm2 = [3,5,0,6]
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 5, i32 0, i32 6>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
  ret <4 x double> %res
}
; Unmasked: selects elements <0,6> of the <8 x double> argument into a <2 x double> result.
define <2 x double> @test_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec) {
; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm1
; CHECK-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
  ret <2 x double> %res
}
; Merge-masked: lanes whose %mask element compares oeq to zero take shuffle <0,6>; other lanes take %vec2.
define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm3
; CHECK-NEXT:    vextractf128 $1, %ymm3, %xmm3
; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
; CHECK-NEXT:    vcmpeqpd %xmm4, %xmm2, %k1
; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm3[0]
; CHECK-NEXT:    vmovapd %xmm1, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
  ret <2 x double> %res
}

; Zero-masked: lanes whose %mask element compares oeq to zero take shuffle <0,6>; other lanes are zero.
define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
; CHECK-NEXT:    vextractf128 $1, %ymm2, %xmm2
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm2[0]
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
  ret <2 x double> %res
}
; Merge-masked: lanes whose %mask element compares oeq to zero take shuffle <3,7>; other lanes take %vec2.
define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask1(<8 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd {{.*#+}} xmm3 = [3,7]
; CHECK-NEXT:    vpermpd %zmm0, %zmm3, %zmm0
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm2, %k1
; CHECK-NEXT:    vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 3, i32 7>
  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
  ret <2 x double> %res
}

; Zero-masked: lanes whose %mask element compares oeq to zero take shuffle <3,7>; other lanes are zero.
define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask1(<8 x double> %vec, <2 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd {{.*#+}} xmm2 = [3,7]
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 3, i32 7>
  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
  ret <2 x double> %res
}
; Unmasked, memory source: loads <8 x double> from %vp and selects elements <1,6,7,2>.
define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask0(<8 x double>* %vp) {
; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd (%rdi), %ymm1
; CHECK-NEXT:    vmovapd {{.*#+}} ymm0 = [1,6,7,2]
; CHECK-NEXT:    vpermi2pd 32(%rdi), %ymm1, %ymm0
; CHECK-NEXT:    retq
  %vec = load <8 x double>, <8 x double>* %vp
  %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 6, i32 7, i32 2>
  ret <4 x double> %res
}
; Merge-masked, memory source: lanes whose %mask element compares oeq to zero take shuffle <1,6,7,2> of the loaded vector; other lanes take %vec2.
define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask0(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd (%rdi), %ymm2
; CHECK-NEXT:    vmovapd {{.*#+}} ymm3 = [1,6,7,2]
; CHECK-NEXT:    vpermi2pd 32(%rdi), %ymm2, %ymm3
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
; CHECK-NEXT:    vmovapd %ymm3, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <8 x double>, <8 x double>* %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 6, i32 7, i32 2>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
  ret <4 x double> %res
}

; Zero-masked, memory source: lanes whose %mask element compares oeq to zero take shuffle <1,6,7,2> of the loaded vector; other lanes are zero.
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0(<8 x double>* %vp, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd (%rdi), %ymm2
; CHECK-NEXT:    vmovapd {{.*#+}} ymm1 = [1,6,7,2]
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
; CHECK-NEXT:    vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
; CHECK-NEXT:    vmovapd %ymm1, %ymm0
; CHECK-NEXT:    retq
  %vec = load <8 x double>, <8 x double>* %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 6, i32 7, i32 2>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
  ret <4 x double> %res
}

; Merge-masked, memory source: lanes whose %mask element compares oeq to zero take shuffle <3,4,2,4> of the loaded vector; other lanes take %vec2.
define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd (%rdi), %ymm2
; CHECK-NEXT:    vmovapd {{.*#+}} ymm3 = [3,4,2,6]
; CHECK-NEXT:    vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm3
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
; CHECK-NEXT:    vmovapd %ymm3, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <8 x double>, <8 x double>* %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 4, i32 2, i32 4>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
  ret <4 x double> %res
}

; Zero-masked, memory source: lanes whose %mask element compares oeq to zero take shuffle <3,4,2,4> of the loaded vector; other lanes are zero.
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(<8 x double>* %vp, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd (%rdi), %ymm2
; CHECK-NEXT:    vmovapd {{.*#+}} ymm1 = [3,4,2,6]
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
; CHECK-NEXT:    vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm1 {%k1} {z}
; CHECK-NEXT:    vmovapd %ymm1, %ymm0
; CHECK-NEXT:    retq
  %vec = load <8 x double>, <8 x double>* %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 4, i32 2, i32 4>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
  ret <4 x double> %res
}

; Merge-masked, memory source: lanes whose %mask element compares oeq to zero take shuffle <1,2,3,4> of the loaded vector; other lanes take %vec2.
define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask2(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd (%rdi), %ymm2
; CHECK-NEXT:    vmovapd {{.*#+}} ymm3 = [1,2,3,4]
; CHECK-NEXT:    vpermi2pd 32(%rdi), %ymm2, %ymm3
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
; CHECK-NEXT:    vmovapd %ymm3, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <8 x double>, <8 x double>* %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
  ret <4 x double> %res
}

; Zero-masked, memory source: lanes whose %mask element compares oeq to zero take shuffle <1,2,3,4> of the loaded vector; other lanes are zero.
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2(<8 x double>* %vp, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd (%rdi), %ymm2
; CHECK-NEXT:    vmovapd {{.*#+}} ymm1 = [1,2,3,4]
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
; CHECK-NEXT:    vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
; CHECK-NEXT:    vmovapd %ymm1, %ymm0
; CHECK-NEXT:    retq
  %vec = load <8 x double>, <8 x double>* %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
  ret <4 x double> %res
}

; Unmasked, memory source: loads <8 x double> from %vp and selects elements <4,2,1,0>.
define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask3(<8 x double>* %vp) {
; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd (%rdi), %ymm1
; CHECK-NEXT:    vmovapd {{.*#+}} ymm0 = [4,2,1,0]
; CHECK-NEXT:    vpermi2pd 32(%rdi), %ymm1, %ymm0
; CHECK-NEXT:    retq
  %vec = load <8 x double>, <8 x double>* %vp
  %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
  ret <4 x double> %res
}
; Merge-masked, memory source: lanes whose %mask element compares oeq to zero take shuffle <4,2,1,0> of the loaded vector; other lanes take %vec2.
define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask3(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd (%rdi), %ymm2
; CHECK-NEXT:    vmovapd {{.*#+}} ymm3 = [4,2,1,0]
; CHECK-NEXT:    vpermi2pd 32(%rdi), %ymm2, %ymm3
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
; CHECK-NEXT:    vmovapd %ymm3, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <8 x double>, <8 x double>* %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
  ret <4 x double> %res
}

; Zero-masked, memory source: lanes whose %mask element compares oeq to zero take shuffle <4,2,1,0> of the loaded vector; other lanes are zero.
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3(<8 x double>* %vp, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd (%rdi), %ymm2
; CHECK-NEXT:    vmovapd {{.*#+}} ymm1 = [4,2,1,0]
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
; CHECK-NEXT:    vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
; CHECK-NEXT:    vmovapd %ymm1, %ymm0
; CHECK-NEXT:    retq
  %vec = load <8 x double>, <8 x double>* %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
  ret <4 x double> %res
}

; Merge-masked, memory source: lanes whose %mask element compares oeq to zero take shuffle <6,0,5,1> of the loaded vector; other lanes take %vec2.
define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask4(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd 32(%rdi), %ymm2
; CHECK-NEXT:    vmovapd {{.*#+}} ymm3 = [2,4,1,5]
; CHECK-NEXT:    vpermi2pd (%rdi), %ymm2, %ymm3
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
; CHECK-NEXT:    vmovapd %ymm3, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <8 x double>, <8 x double>* %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 6, i32 0, i32 5, i32 1>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
  ret <4 x double> %res
}

; Zero-masked, memory source: lanes whose %mask element compares oeq to zero take shuffle <6,0,5,1> of the loaded vector; other lanes are zero.
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4(<8 x double>* %vp, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd 32(%rdi), %ymm2
; CHECK-NEXT:    vmovapd {{.*#+}} ymm1 = [2,4,1,5]
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
; CHECK-NEXT:    vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
; CHECK-NEXT:    vmovapd %ymm1, %ymm0
; CHECK-NEXT:    retq
  %vec = load <8 x double>, <8 x double>* %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 6, i32 0, i32 5, i32 1>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
  ret <4 x double> %res
}

; Merge-masked, memory source: lanes whose %mask element compares oeq to zero take shuffle <2,5,5,5> of the loaded vector; other lanes take %vec2.
define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask5(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd (%rdi), %ymm2
; CHECK-NEXT:    vperm2f128 $33, 32(%rdi), %ymm2, %ymm2 # ymm2 = ymm2[2,3],mem[0,1]
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
; CHECK-NEXT:    vshufpd $14, 40(%rdi){1to4}, %ymm2, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <8 x double>, <8 x double>* %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
  ret <4 x double> %res
}

; Zero-masked, memory source: lanes whose %mask element compares oeq to zero take shuffle <2,5,5,5> of the loaded vector; other lanes are zero.
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5(<8 x double>* %vp, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd (%rdi), %ymm1
; CHECK-NEXT:    vperm2f128 $33, 32(%rdi), %ymm1, %ymm1 # ymm1 = ymm1[2,3],mem[0,1]
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm0, %k1
; CHECK-NEXT:    vshufpd $14, 40(%rdi){1to4}, %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %vec = load <8 x double>, <8 x double>* %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
  ret <4 x double> %res
}

; Unmasked, memory source: loads <8 x double> from %vp and selects elements <4,6,0,5>.
define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask6(<8 x double>* %vp) {
; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd 32(%rdi), %ymm1
; CHECK-NEXT:    vmovapd {{.*#+}} ymm0 = [0,2,4,1]
; CHECK-NEXT:    vpermi2pd (%rdi), %ymm1, %ymm0
; CHECK-NEXT:    retq
  %vec = load <8 x double>, <8 x double>* %vp
  %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
  ret <4 x double> %res
}
; Merge-masked, memory source: lanes whose %mask element compares oeq to zero take shuffle <4,6,0,5> of the loaded vector; other lanes take %vec2.
define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask6(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd 32(%rdi), %ymm2
; CHECK-NEXT:    vmovapd {{.*#+}} ymm3 = [0,2,4,1]
; CHECK-NEXT:    vpermi2pd (%rdi), %ymm2, %ymm3
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
; CHECK-NEXT:    vmovapd %ymm3, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <8 x double>, <8 x double>* %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
  ret <4 x double> %res
}

; Zero-masked, memory source: lanes whose %mask element compares oeq to zero take shuffle <4,6,0,5> of the loaded vector; other lanes are zero.
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6(<8 x double>* %vp, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd 32(%rdi), %ymm2
; CHECK-NEXT:    vmovapd {{.*#+}} ymm1 = [0,2,4,1]
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
; CHECK-NEXT:    vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
; CHECK-NEXT:    vmovapd %ymm1, %ymm0
; CHECK-NEXT:    retq
  %vec = load <8 x double>, <8 x double>* %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
  ret <4 x double> %res
}

; Merge-masked, memory source: lanes whose %mask element compares oeq to zero take shuffle <0,5,2,5> of the loaded vector; other lanes take %vec2.
define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask7(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask7:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastsd 40(%rdi), %ymm2
; CHECK-NEXT:    vblendpd $5, (%rdi), %ymm2, %ymm2 # ymm2 = mem[0],ymm2[1],mem[2],ymm2[3]
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
; CHECK-NEXT:    vmovapd %ymm2, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <8 x double>, <8 x double>* %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 5>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
  ret <4 x double> %res
}

; Zero-masked, memory source: lanes whose %mask element compares oeq to zero take shuffle <0,5,2,5> of the loaded vector; other lanes are zero.
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7(<8 x double>* %vp, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastsd 40(%rdi), %ymm1
; CHECK-NEXT:    vblendpd $5, (%rdi), %ymm1, %ymm1 # ymm1 = mem[0],ymm1[1],mem[2],ymm1[3]
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm0, %k1
; CHECK-NEXT:    vmovapd %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %vec = load <8 x double>, <8 x double>* %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 5>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
  ret <4 x double> %res
}

; Unmasked, memory source: loads <8 x double> from %vp and selects elements <1,6> into a <2 x double> result.
define <2 x double> @test_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp) {
; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd (%rdi), %xmm0
; CHECK-NEXT:    vshufpd $1, 48(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[1],mem[0]
; CHECK-NEXT:    retq
  %vec = load <8 x double>, <8 x double>* %vp
  %res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
  ret <2 x double> %res
}
; Merge-masked, memory source: lanes whose %mask element compares oeq to zero take shuffle <1,6> of the loaded vector; other lanes take %vec2.
define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd (%rdi), %xmm2
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
; CHECK-NEXT:    vshufpd $1, 48(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[1],mem[0]
; CHECK-NEXT:    retq
  %vec = load <8 x double>, <8 x double>* %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
  ret <2 x double> %res
}

; Zero-masked, memory source: lanes whose %mask element compares oeq to zero take shuffle <1,6> of the loaded vector; other lanes are zero.
define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp, <2 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd (%rdi), %xmm1
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm0, %k1
; CHECK-NEXT:    vshufpd $1, 48(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[1],mem[0]
; CHECK-NEXT:    retq
  %vec = load <8 x double>, <8 x double>* %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
  ret <2 x double> %res
}

; Merge-masked, memory source: lanes whose %mask element compares oeq to zero take shuffle <1,4> of the loaded vector; other lanes take %vec2.
define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask1(<8 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup 8(%rdi), %xmm2 # xmm2 = mem[0,0]
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
; CHECK-NEXT:    vunpcklpd 32(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0]
; CHECK-NEXT:    retq
  %vec = load <8 x double>, <8 x double>* %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4>
  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
  ret <2 x double> %res
}

; Zero-masked, memory source: lanes whose %mask element compares oeq to zero take shuffle <1,4> of the loaded vector; other lanes are zero.
define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1(<8 x double>* %vp, <2 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup 8(%rdi), %xmm1 # xmm1 = mem[0,0]
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm0, %k1
; CHECK-NEXT:    vunpcklpd 32(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0]
; CHECK-NEXT:    retq
  %vec = load <8 x double>, <8 x double>* %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4>
  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
  ret <2 x double> %res
}

; PR35977
; Loads <8 x i8> from %arg, zero-extends each lane to i16 through a scalar
; extractelement/zext/insertelement chain, shifts every lane left by 8 and
; stores the <8 x i16> result to %arg1.
define void @test_zext_v8i8_to_v8i16(<8 x i8>* %arg, <8 x i16>* %arg1) {
; CHECK-LABEL: test_zext_v8i8_to_v8i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxbw (%rdi), %xmm0 # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; CHECK-NEXT:    vpsllw $8, %xmm0, %xmm0
; CHECK-NEXT:    vmovdqa %xmm0, (%rsi)
; CHECK-NEXT:    retq
  %tmp = getelementptr <8 x i8>, <8 x i8>* %arg, i32 0
  %tmp2 = load <8 x i8>, <8 x i8>* %tmp
  %tmp3 = extractelement <8 x i8> %tmp2, i32 0
  %tmp4 = zext i8 %tmp3 to i16
  %tmp5 = insertelement <8 x i16> undef, i16 %tmp4, i32 0
  %tmp6 = extractelement <8 x i8> %tmp2, i32 1
  %tmp7 = zext i8 %tmp6 to i16
  %tmp8 = insertelement <8 x i16> %tmp5, i16 %tmp7, i32 1
  %tmp9 = extractelement <8 x i8> %tmp2, i32 2
  %tmp10 = zext i8 %tmp9 to i16
  %tmp11 = insertelement <8 x i16> %tmp8, i16 %tmp10, i32 2
  %tmp12 = extractelement <8 x i8> %tmp2, i32 3
  %tmp13 = zext i8 %tmp12 to i16
  %tmp14 = insertelement <8 x i16> %tmp11, i16 %tmp13, i32 3
  %tmp15 = extractelement <8 x i8> %tmp2, i32 4
  %tmp16 = zext i8 %tmp15 to i16
  %tmp17 = insertelement <8 x i16> %tmp14, i16 %tmp16, i32 4
  %tmp18 = extractelement <8 x i8> %tmp2, i32 5
  %tmp19 = zext i8 %tmp18 to i16
  %tmp20 = insertelement <8 x i16> %tmp17, i16 %tmp19, i32 5
  %tmp21 = extractelement <8 x i8> %tmp2, i32 6
  %tmp22 = zext i8 %tmp21 to i16
  %tmp23 = insertelement <8 x i16> %tmp20, i16 %tmp22, i32 6
  %tmp24 = extractelement <8 x i8> %tmp2, i32 7
  %tmp25 = zext i8 %tmp24 to i16
  %tmp26 = insertelement <8 x i16> %tmp23, i16 %tmp25, i32 7
  %tmp27 = shl <8 x i16> %tmp26, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %tmp28 = getelementptr <8 x i16>, <8 x i16>* %arg1, i32 0
  store <8 x i16> %tmp27, <8 x i16>* %tmp28
  ret void
}