; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl %s -o - | FileCheck %s

; Lowering tests for the "duplicate odd elements" shuffle (mask <1,1,3,3,...>),
; which is expected to be selected as vmovshdup.
;
; Every vector width is exercised in three flavours:
;   * plain      - the shuffle result is returned directly;
;   * masked     - select between the shuffle and %vec2 (merge-masking);
;   * masked_z   - select between the shuffle and zeroinitializer
;                  (zero-masking, printed as {z} in the expected assembly).
; The select condition is always `fcmp oeq %mask, zeroinitializer`, which the
; expected output materialises as vxorps + vcmpeqps into %k1.
; Each flavour is repeated for a register source and for a source loaded
; through %vp (printed as `mem[...]` in the expected assembly).
;
; The mask0..mask4 variants below differ only in their names; their IR bodies
; and expected assembly are otherwise identical.

;------------------------------------------------------------------------------
; <4 x float>, register source
;------------------------------------------------------------------------------

define <4 x float> @test_4xfloat_dup_high(<4 x float> %vec) {
; CHECK-LABEL: test_4xfloat_dup_high:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; CHECK-NEXT:    retq
  %res = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  ret <4 x float> %res
}

define <4 x float> @test_masked_4xfloat_dup_high_mask0(<4 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
; CHECK-LABEL: test_masked_4xfloat_dup_high_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} xmm1 {%k1} = xmm0[1,1,3,3]
; CHECK-NEXT:    vmovaps %xmm1, %xmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
  ret <4 x float> %res
}

define <4 x float> @test_masked_z_4xfloat_dup_high_mask0(<4 x float> %vec, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_4xfloat_dup_high_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
  ret <4 x float> %res
}

define <4 x float> @test_masked_4xfloat_dup_high_mask1(<4 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
; CHECK-LABEL: test_masked_4xfloat_dup_high_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} xmm1 {%k1} = xmm0[1,1,3,3]
; CHECK-NEXT:    vmovaps %xmm1, %xmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
  ret <4 x float> %res
}

define <4 x float> @test_masked_z_4xfloat_dup_high_mask1(<4 x float> %vec, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_4xfloat_dup_high_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
  ret <4 x float> %res
}

define <4 x float> @test_masked_4xfloat_dup_high_mask2(<4 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
; CHECK-LABEL: test_masked_4xfloat_dup_high_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} xmm1 {%k1} = xmm0[1,1,3,3]
; CHECK-NEXT:    vmovaps %xmm1, %xmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
  ret <4 x float> %res
}

define <4 x float> @test_masked_z_4xfloat_dup_high_mask2(<4 x float> %vec, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_4xfloat_dup_high_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
  ret <4 x float> %res
}

define <4 x float> @test_masked_4xfloat_dup_high_mask3(<4 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
; CHECK-LABEL: test_masked_4xfloat_dup_high_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} xmm1 {%k1} = xmm0[1,1,3,3]
; CHECK-NEXT:    vmovaps %xmm1, %xmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
  ret <4 x float> %res
}

define <4 x float> @test_masked_z_4xfloat_dup_high_mask3(<4 x float> %vec, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_4xfloat_dup_high_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
  ret <4 x float> %res
}

define <4 x float> @test_masked_4xfloat_dup_high_mask4(<4 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
; CHECK-LABEL: test_masked_4xfloat_dup_high_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} xmm1 {%k1} = xmm0[1,1,3,3]
; CHECK-NEXT:    vmovaps %xmm1, %xmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
  ret <4 x float> %res
}

define <4 x float> @test_masked_z_4xfloat_dup_high_mask4(<4 x float> %vec, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_4xfloat_dup_high_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
  ret <4 x float> %res
}

;------------------------------------------------------------------------------
; <4 x float>, source loaded through %vp
;------------------------------------------------------------------------------

define <4 x float> @test_4xfloat_dup_high_mem(<4 x float>* %vp) {
; CHECK-LABEL: test_4xfloat_dup_high_mem:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 = mem[1,1,3,3]
; CHECK-NEXT:    retq
  %vec = load <4 x float>, <4 x float>* %vp
  %res = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  ret <4 x float> %res
}

define <4 x float> @test_masked_4xfloat_dup_high_mem_mask0(<4 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
; CHECK-LABEL: test_masked_4xfloat_dup_high_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} = mem[1,1,3,3]
; CHECK-NEXT:    retq
  %vec = load <4 x float>, <4 x float>* %vp
  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
  ret <4 x float> %res
}

define <4 x float> @test_masked_z_4xfloat_dup_high_mem_mask0(<4 x float>* %vp, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_4xfloat_dup_high_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vcmpeqps %xmm1, %xmm0, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = mem[1,1,3,3]
; CHECK-NEXT:    retq
  %vec = load <4 x float>, <4 x float>* %vp
  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
  ret <4 x float> %res
}

define <4 x float> @test_masked_4xfloat_dup_high_mem_mask1(<4 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
; CHECK-LABEL: test_masked_4xfloat_dup_high_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} = mem[1,1,3,3]
; CHECK-NEXT:    retq
  %vec = load <4 x float>, <4 x float>* %vp
  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
  ret <4 x float> %res
}

define <4 x float> @test_masked_z_4xfloat_dup_high_mem_mask1(<4 x float>* %vp, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_4xfloat_dup_high_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vcmpeqps %xmm1, %xmm0, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = mem[1,1,3,3]
; CHECK-NEXT:    retq
  %vec = load <4 x float>, <4 x float>* %vp
  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
  ret <4 x float> %res
}

define <4 x float> @test_masked_4xfloat_dup_high_mem_mask2(<4 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
; CHECK-LABEL: test_masked_4xfloat_dup_high_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} = mem[1,1,3,3]
; CHECK-NEXT:    retq
  %vec = load <4 x float>, <4 x float>* %vp
  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
  ret <4 x float> %res
}

define <4 x float> @test_masked_z_4xfloat_dup_high_mem_mask2(<4 x float>* %vp, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_4xfloat_dup_high_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vcmpeqps %xmm1, %xmm0, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = mem[1,1,3,3]
; CHECK-NEXT:    retq
  %vec = load <4 x float>, <4 x float>* %vp
  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
  ret <4 x float> %res
}

define <4 x float> @test_masked_4xfloat_dup_high_mem_mask3(<4 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
; CHECK-LABEL: test_masked_4xfloat_dup_high_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} = mem[1,1,3,3]
; CHECK-NEXT:    retq
  %vec = load <4 x float>, <4 x float>* %vp
  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
  ret <4 x float> %res
}

define <4 x float> @test_masked_z_4xfloat_dup_high_mem_mask3(<4 x float>* %vp, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_4xfloat_dup_high_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vcmpeqps %xmm1, %xmm0, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = mem[1,1,3,3]
; CHECK-NEXT:    retq
  %vec = load <4 x float>, <4 x float>* %vp
  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
  ret <4 x float> %res
}

define <4 x float> @test_masked_4xfloat_dup_high_mem_mask4(<4 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
; CHECK-LABEL: test_masked_4xfloat_dup_high_mem_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} = mem[1,1,3,3]
; CHECK-NEXT:    retq
  %vec = load <4 x float>, <4 x float>* %vp
  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
  ret <4 x float> %res
}

define <4 x float> @test_masked_z_4xfloat_dup_high_mem_mask4(<4 x float>* %vp, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_4xfloat_dup_high_mem_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vcmpeqps %xmm1, %xmm0, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = mem[1,1,3,3]
; CHECK-NEXT:    retq
  %vec = load <4 x float>, <4 x float>* %vp
  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
  ret <4 x float> %res
}

;------------------------------------------------------------------------------
; <8 x float>, register source
;------------------------------------------------------------------------------

define <8 x float> @test_8xfloat_dup_high(<8 x float> %vec) {
; CHECK-LABEL: test_8xfloat_dup_high:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; CHECK-NEXT:    retq
  %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  ret <8 x float> %res
}

define <8 x float> @test_masked_8xfloat_dup_high_mask0(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
; CHECK-LABEL: test_masked_8xfloat_dup_high_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} ymm1 {%k1} = ymm0[1,1,3,3,5,5,7,7]
; CHECK-NEXT:    vmovaps %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
  ret <8 x float> %res
}

define <8 x float> @test_masked_z_8xfloat_dup_high_mask0(<8 x float> %vec, <8 x float> %mask) {
; CHECK-LABEL: test_masked_z_8xfloat_dup_high_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
  ret <8 x float> %res
}

define <8 x float> @test_masked_8xfloat_dup_high_mask1(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
; CHECK-LABEL: test_masked_8xfloat_dup_high_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} ymm1 {%k1} = ymm0[1,1,3,3,5,5,7,7]
; CHECK-NEXT:    vmovaps %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
  ret <8 x float> %res
}

define <8 x float> @test_masked_z_8xfloat_dup_high_mask1(<8 x float> %vec, <8 x float> %mask) {
; CHECK-LABEL: test_masked_z_8xfloat_dup_high_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
  ret <8 x float> %res
}

define <8 x float> @test_masked_8xfloat_dup_high_mask2(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
; CHECK-LABEL: test_masked_8xfloat_dup_high_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} ymm1 {%k1} = ymm0[1,1,3,3,5,5,7,7]
; CHECK-NEXT:    vmovaps %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
  ret <8 x float> %res
}

define <8 x float> @test_masked_z_8xfloat_dup_high_mask2(<8 x float> %vec, <8 x float> %mask) {
; CHECK-LABEL: test_masked_z_8xfloat_dup_high_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
  ret <8 x float> %res
}

define <8 x float> @test_masked_8xfloat_dup_high_mask3(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
; CHECK-LABEL: test_masked_8xfloat_dup_high_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} ymm1 {%k1} = ymm0[1,1,3,3,5,5,7,7]
; CHECK-NEXT:    vmovaps %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
  ret <8 x float> %res
}

define <8 x float> @test_masked_z_8xfloat_dup_high_mask3(<8 x float> %vec, <8 x float> %mask) {
; CHECK-LABEL: test_masked_z_8xfloat_dup_high_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
  ret <8 x float> %res
}

define <8 x float> @test_masked_8xfloat_dup_high_mask4(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
; CHECK-LABEL: test_masked_8xfloat_dup_high_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} ymm1 {%k1} = ymm0[1,1,3,3,5,5,7,7]
; CHECK-NEXT:    vmovaps %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
  ret <8 x float> %res
}

define <8 x float> @test_masked_z_8xfloat_dup_high_mask4(<8 x float> %vec, <8 x float> %mask) {
; CHECK-LABEL: test_masked_z_8xfloat_dup_high_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
  ret <8 x float> %res
}

;------------------------------------------------------------------------------
; <8 x float>, source loaded through %vp
;------------------------------------------------------------------------------

define <8 x float> @test_8xfloat_dup_high_mem(<8 x float>* %vp) {
; CHECK-LABEL: test_8xfloat_dup_high_mem:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 = mem[1,1,3,3,5,5,7,7]
; CHECK-NEXT:    retq
  %vec = load <8 x float>, <8 x float>* %vp
  %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  ret <8 x float> %res
}

define <8 x float> @test_masked_8xfloat_dup_high_mem_mask0(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
; CHECK-LABEL: test_masked_8xfloat_dup_high_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} = mem[1,1,3,3,5,5,7,7]
; CHECK-NEXT:    retq
  %vec = load <8 x float>, <8 x float>* %vp
  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
  ret <8 x float> %res
}

define <8 x float> @test_masked_z_8xfloat_dup_high_mem_mask0(<8 x float>* %vp, <8 x float> %mask) {
; CHECK-LABEL: test_masked_z_8xfloat_dup_high_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vcmpeqps %ymm1, %ymm0, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = mem[1,1,3,3,5,5,7,7]
; CHECK-NEXT:    retq
  %vec = load <8 x float>, <8 x float>* %vp
  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
  ret <8 x float> %res
}

define <8 x float> @test_masked_8xfloat_dup_high_mem_mask1(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
; CHECK-LABEL: test_masked_8xfloat_dup_high_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} = mem[1,1,3,3,5,5,7,7]
; CHECK-NEXT:    retq
  %vec = load <8 x float>, <8 x float>* %vp
  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
  ret <8 x float> %res
}

define <8 x float> @test_masked_z_8xfloat_dup_high_mem_mask1(<8 x float>* %vp, <8 x float> %mask) {
; CHECK-LABEL: test_masked_z_8xfloat_dup_high_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vcmpeqps %ymm1, %ymm0, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = mem[1,1,3,3,5,5,7,7]
; CHECK-NEXT:    retq
  %vec = load <8 x float>, <8 x float>* %vp
  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
  ret <8 x float> %res
}

define <8 x float> @test_masked_8xfloat_dup_high_mem_mask2(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
; CHECK-LABEL: test_masked_8xfloat_dup_high_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} = mem[1,1,3,3,5,5,7,7]
; CHECK-NEXT:    retq
  %vec = load <8 x float>, <8 x float>* %vp
  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
  ret <8 x float> %res
}

define <8 x float> @test_masked_z_8xfloat_dup_high_mem_mask2(<8 x float>* %vp, <8 x float> %mask) {
; CHECK-LABEL: test_masked_z_8xfloat_dup_high_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vcmpeqps %ymm1, %ymm0, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = mem[1,1,3,3,5,5,7,7]
; CHECK-NEXT:    retq
  %vec = load <8 x float>, <8 x float>* %vp
  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
  ret <8 x float> %res
}

define <8 x float> @test_masked_8xfloat_dup_high_mem_mask3(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
; CHECK-LABEL: test_masked_8xfloat_dup_high_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} = mem[1,1,3,3,5,5,7,7]
; CHECK-NEXT:    retq
  %vec = load <8 x float>, <8 x float>* %vp
  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
  ret <8 x float> %res
}

define <8 x float> @test_masked_z_8xfloat_dup_high_mem_mask3(<8 x float>* %vp, <8 x float> %mask) {
; CHECK-LABEL: test_masked_z_8xfloat_dup_high_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vcmpeqps %ymm1, %ymm0, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = mem[1,1,3,3,5,5,7,7]
; CHECK-NEXT:    retq
  %vec = load <8 x float>, <8 x float>* %vp
  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
  ret <8 x float> %res
}

define <8 x float> @test_masked_8xfloat_dup_high_mem_mask4(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
; CHECK-LABEL: test_masked_8xfloat_dup_high_mem_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} = mem[1,1,3,3,5,5,7,7]
; CHECK-NEXT:    retq
  %vec = load <8 x float>, <8 x float>* %vp
  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
  ret <8 x float> %res
}

; Zero-masked, memory-source form of the <8 x float> odd-element duplication.
; The vector is loaded from %vp and shuffled with <1,1,3,3,5,5,7,7>; lanes where
; %mask compares ordered-equal to zero keep the shuffled value, all other lanes
; become zero.
define <8 x float> @test_masked_z_8xfloat_dup_high_mem_mask4(<8 x float>* %vp, <8 x float> %mask) {
; CHECK-LABEL: test_masked_z_8xfloat_dup_high_mem_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vcmpeqps %ymm1, %ymm0, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = mem[1,1,3,3,5,5,7,7]
; CHECK-NEXT:    retq
  %vec = load <8 x float>, <8 x float>* %vp
  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
  ret <8 x float> %res
}

; ---- <16 x float>, register source ----
; Every function in this group applies the shuffle mask <1,1,3,3,...,15,15>,
; which places each odd-indexed source element in both slots of its pair.
; The expected assembly is a single vmovshdup on zmm registers.
;   * "masked" variants pick, per lane, the shuffled value where %mask compares
;     ordered-equal to zero and the %vec2 value elsewhere.
;   * "masked_z" variants pick the shuffled value where %mask compares
;     ordered-equal to zero and zero elsewhere.
; The mask0..mask4 variants have identical IR bodies here; only the function
; names differ.
define <16 x float> @test_16xfloat_dup_high(<16 x float> %vec) {
; CHECK-LABEL: test_16xfloat_dup_high:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT:    retq
  %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  ret <16 x float> %res
}
define <16 x float> @test_masked_16xfloat_dup_high_mask0(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_dup_high_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
  ret <16 x float> %res
}

define <16 x float> @test_masked_z_16xfloat_dup_high_mask0(<16 x float> %vec, <16 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_dup_high_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
  ret <16 x float> %res
}
define <16 x float> @test_masked_16xfloat_dup_high_mask1(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_dup_high_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
  ret <16 x float> %res
}

define <16 x float> @test_masked_z_16xfloat_dup_high_mask1(<16 x float> %vec, <16 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_dup_high_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
  ret <16 x float> %res
}
define <16 x float> @test_masked_16xfloat_dup_high_mask2(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_dup_high_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
  ret <16 x float> %res
}

define <16 x float> @test_masked_z_16xfloat_dup_high_mask2(<16 x float> %vec, <16 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_dup_high_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
  ret <16 x float> %res
}
define <16 x float> @test_masked_16xfloat_dup_high_mask3(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_dup_high_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
  ret <16 x float> %res
}

define <16 x float> @test_masked_z_16xfloat_dup_high_mask3(<16 x float> %vec, <16 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_dup_high_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
  ret <16 x float> %res
}
define <16 x float> @test_masked_16xfloat_dup_high_mask4(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_dup_high_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
  ret <16 x float> %res
}

define <16 x float> @test_masked_z_16xfloat_dup_high_mask4(<16 x float> %vec, <16 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_dup_high_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
  ret <16 x float> %res
}

; ---- <16 x float>, memory source ----
; Same shuffle mask <1,1,3,3,...,15,15>, but the input vector is loaded from
; %vp. The expected assembly has vmovshdup reading a `mem` operand, with no
; separate load instruction listed before it. The "masked" / "masked_z" and
; mask0..mask4 naming follows the same scheme as the register-source functions:
; merge with %vec2 or with zero, per lane, where %mask compares ordered-equal
; to zero.
define <16 x float> @test_16xfloat_dup_high_mem(<16 x float>* %vp) {
; CHECK-LABEL: test_16xfloat_dup_high_mem:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT:    retq
  %vec = load <16 x float>, <16 x float>* %vp
  %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  ret <16 x float> %res
}
define <16 x float> @test_masked_16xfloat_dup_high_mem_mask0(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_dup_high_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT:    retq
  %vec = load <16 x float>, <16 x float>* %vp
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
  ret <16 x float> %res
}

define <16 x float> @test_masked_z_16xfloat_dup_high_mem_mask0(<16 x float>* %vp, <16 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_dup_high_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT:    retq
  %vec = load <16 x float>, <16 x float>* %vp
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
  ret <16 x float> %res
}
define <16 x float> @test_masked_16xfloat_dup_high_mem_mask1(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_dup_high_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT:    retq
  %vec = load <16 x float>, <16 x float>* %vp
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
  ret <16 x float> %res
}

define <16 x float> @test_masked_z_16xfloat_dup_high_mem_mask1(<16 x float>* %vp, <16 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_dup_high_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT:    retq
  %vec = load <16 x float>, <16 x float>* %vp
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
  ret <16 x float> %res
}
define <16 x float> @test_masked_16xfloat_dup_high_mem_mask2(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_dup_high_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT:    retq
  %vec = load <16 x float>, <16 x float>* %vp
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
  ret <16 x float> %res
}

define <16 x float> @test_masked_z_16xfloat_dup_high_mem_mask2(<16 x float>* %vp, <16 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_dup_high_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT:    retq
  %vec = load <16 x float>, <16 x float>* %vp
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
  ret <16 x float> %res
}
define <16 x float> @test_masked_16xfloat_dup_high_mem_mask3(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_dup_high_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT:    retq
  %vec = load <16 x float>, <16 x float>* %vp
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
  ret <16 x float> %res
}

define <16 x float> @test_masked_z_16xfloat_dup_high_mem_mask3(<16 x float>* %vp, <16 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_dup_high_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT:    retq
  %vec = load <16 x float>, <16 x float>* %vp
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
  ret <16 x float> %res
}
define <16 x float> @test_masked_16xfloat_dup_high_mem_mask4(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_dup_high_mem_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT:    retq
  %vec = load <16 x float>, <16 x float>* %vp
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
  ret <16 x float> %res
}

define <16 x float> @test_masked_z_16xfloat_dup_high_mem_mask4(<16 x float>* %vp, <16 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_dup_high_mem_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT:    retq
  %vec = load <16 x float>, <16 x float>* %vp
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
  ret <16 x float> %res
}