; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl %s -o - | FileCheck %s

; Each function below builds a "duplicate low element of every pair" shuffle
; (shufflevector mask <0,0>, <0,0,2,2> or <0,0,2,2,4,4,6,6>) and the
; assertions expect it to be selected as vmovddup (double) or vmovsldup
; (float).
;
; Naming used in this file:
;   test_<type>_dup_low[_mem]               plain shuffle; _mem loads the
;                                           source vector through %vp.
;   test_masked_<type>_dup_low[_mem]_maskN  lanes where %mask compares
;                                           'fcmp oeq' to zero take the
;                                           shuffle result, the other lanes
;                                           take %vec2.
;   test_masked_z_<type>_..._maskN          same selection, but the other
;                                           lanes take zeroinitializer.
;
; NOTE(review): within each group the maskN variants are textually identical
; apart from their names; presumably they once used distinct mask values.
; Confirm before de-duplicating.

; ---- <2 x double>, register source ----
define <2 x double> @test_2xdouble_dup_low(<2 x double> %vec) {
; CHECK-LABEL: test_2xdouble_dup_low:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; CHECK-NEXT:    retq
  %res = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x double> %res
}
define <2 x double> @test_masked_2xdouble_dup_low_mask0(<2 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
; CHECK-LABEL: test_masked_2xdouble_dup_low_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm2, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} xmm1 {%k1} = xmm0[0,0]
; CHECK-NEXT:    vmovapd %xmm1, %xmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
  ret <2 x double> %res
}

define <2 x double> @test_masked_z_2xdouble_dup_low_mask0(<2 x double> %vec, <2 x double> %mask) {
; CHECK-LABEL: test_masked_z_2xdouble_dup_low_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm1, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
; CHECK-NEXT:    retq
  %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
  ret <2 x double> %res
}
define <2 x double> @test_masked_2xdouble_dup_low_mask1(<2 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
; CHECK-LABEL: test_masked_2xdouble_dup_low_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm2, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} xmm1 {%k1} = xmm0[0,0]
; CHECK-NEXT:    vmovapd %xmm1, %xmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
  ret <2 x double> %res
}

define <2 x double> @test_masked_z_2xdouble_dup_low_mask1(<2 x double> %vec, <2 x double> %mask) {
; CHECK-LABEL: test_masked_z_2xdouble_dup_low_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm1, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
; CHECK-NEXT:    retq
  %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
  ret <2 x double> %res
}
; ---- <2 x double>, source loaded from memory ----
define <2 x double> @test_2xdouble_dup_low_mem(<2 x double>* %vp) {
; CHECK-LABEL: test_2xdouble_dup_low_mem:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; CHECK-NEXT:    retq
  %vec = load <2 x double>, <2 x double>* %vp
  %res = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x double> %res
}
define <2 x double> @test_masked_2xdouble_dup_low_mem_mask0(<2 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
; CHECK-LABEL: test_masked_2xdouble_dup_low_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm1, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} = mem[0,0]
; CHECK-NEXT:    retq
  %vec = load <2 x double>, <2 x double>* %vp
  %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
  ret <2 x double> %res
}

define <2 x double> @test_masked_z_2xdouble_dup_low_mem_mask0(<2 x double>* %vp, <2 x double> %mask) {
; CHECK-LABEL: test_masked_z_2xdouble_dup_low_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vcmpeqpd %xmm1, %xmm0, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} {z} = mem[0,0]
; CHECK-NEXT:    retq
  %vec = load <2 x double>, <2 x double>* %vp
  %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
  ret <2 x double> %res
}
define <2 x double> @test_masked_2xdouble_dup_low_mem_mask1(<2 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
; CHECK-LABEL: test_masked_2xdouble_dup_low_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm1, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} = mem[0,0]
; CHECK-NEXT:    retq
  %vec = load <2 x double>, <2 x double>* %vp
  %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
  ret <2 x double> %res
}

define <2 x double> @test_masked_z_2xdouble_dup_low_mem_mask1(<2 x double>* %vp, <2 x double> %mask) {
; CHECK-LABEL: test_masked_z_2xdouble_dup_low_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vcmpeqpd %xmm1, %xmm0, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} {z} = mem[0,0]
; CHECK-NEXT:    retq
  %vec = load <2 x double>, <2 x double>* %vp
  %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
  ret <2 x double> %res
}
; ---- <4 x double>, register source ----
define <4 x double> @test_4xdouble_dup_low(<4 x double> %vec) {
; CHECK-LABEL: test_4xdouble_dup_low:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; CHECK-NEXT:    retq
  %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x double> %res
}
define <4 x double> @test_masked_4xdouble_dup_low_mask0(<4 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_4xdouble_dup_low_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2]
; CHECK-NEXT:    vmovapd %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
  ret <4 x double> %res
}

define <4 x double> @test_masked_z_4xdouble_dup_low_mask0(<4 x double> %vec, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
  ret <4 x double> %res
}
define <4 x double> @test_masked_4xdouble_dup_low_mask1(<4 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_4xdouble_dup_low_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2]
; CHECK-NEXT:    vmovapd %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
  ret <4 x double> %res
}

define <4 x double> @test_masked_z_4xdouble_dup_low_mask1(<4 x double> %vec, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
  ret <4 x double> %res
}
define <4 x double> @test_masked_4xdouble_dup_low_mask2(<4 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_4xdouble_dup_low_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2]
; CHECK-NEXT:    vmovapd %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
  ret <4 x double> %res
}

define <4 x double> @test_masked_z_4xdouble_dup_low_mask2(<4 x double> %vec, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
  ret <4 x double> %res
}
define <4 x double> @test_masked_4xdouble_dup_low_mask3(<4 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_4xdouble_dup_low_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2]
; CHECK-NEXT:    vmovapd %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
  ret <4 x double> %res
}

define <4 x double> @test_masked_z_4xdouble_dup_low_mask3(<4 x double> %vec, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
  ret <4 x double> %res
}
define <4 x double> @test_masked_4xdouble_dup_low_mask4(<4 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_4xdouble_dup_low_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2]
; CHECK-NEXT:    vmovapd %ymm1, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
  ret <4 x double> %res
}

define <4 x double> @test_masked_z_4xdouble_dup_low_mask4(<4 x double> %vec, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
  ret <4 x double> %res
}
; ---- <4 x double>, source loaded from memory ----
define <4 x double> @test_4xdouble_dup_low_mem(<4 x double>* %vp) {
; CHECK-LABEL: test_4xdouble_dup_low_mem:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 = mem[0,0,2,2]
; CHECK-NEXT:    retq
  %vec = load <4 x double>, <4 x double>* %vp
  %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x double> %res
}
define <4 x double> @test_masked_4xdouble_dup_low_mem_mask0(<4 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_4xdouble_dup_low_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2]
; CHECK-NEXT:    retq
  %vec = load <4 x double>, <4 x double>* %vp
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
  ret <4 x double> %res
}

define <4 x double> @test_masked_z_4xdouble_dup_low_mem_mask0(<4 x double>* %vp, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vcmpeqpd %ymm1, %ymm0, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2]
; CHECK-NEXT:    retq
  %vec = load <4 x double>, <4 x double>* %vp
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
  ret <4 x double> %res
}
define <4 x double> @test_masked_4xdouble_dup_low_mem_mask1(<4 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_4xdouble_dup_low_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2]
; CHECK-NEXT:    retq
  %vec = load <4 x double>, <4 x double>* %vp
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
  ret <4 x double> %res
}

define <4 x double> @test_masked_z_4xdouble_dup_low_mem_mask1(<4 x double>* %vp, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vcmpeqpd %ymm1, %ymm0, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2]
; CHECK-NEXT:    retq
  %vec = load <4 x double>, <4 x double>* %vp
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
  ret <4 x double> %res
}
define <4 x double> @test_masked_4xdouble_dup_low_mem_mask2(<4 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_4xdouble_dup_low_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2]
; CHECK-NEXT:    retq
  %vec = load <4 x double>, <4 x double>* %vp
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
  ret <4 x double> %res
}

define <4 x double> @test_masked_z_4xdouble_dup_low_mem_mask2(<4 x double>* %vp, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vcmpeqpd %ymm1, %ymm0, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2]
; CHECK-NEXT:    retq
  %vec = load <4 x double>, <4 x double>* %vp
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
  ret <4 x double> %res
}
define <4 x double> @test_masked_4xdouble_dup_low_mem_mask3(<4 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_4xdouble_dup_low_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2]
; CHECK-NEXT:    retq
  %vec = load <4 x double>, <4 x double>* %vp
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
  ret <4 x double> %res
}

define <4 x double> @test_masked_z_4xdouble_dup_low_mem_mask3(<4 x double>* %vp, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vcmpeqpd %ymm1, %ymm0, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2]
; CHECK-NEXT:    retq
  %vec = load <4 x double>, <4 x double>* %vp
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
  ret <4 x double> %res
}
define <4 x double> @test_masked_4xdouble_dup_low_mem_mask4(<4 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_4xdouble_dup_low_mem_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2]
; CHECK-NEXT:    retq
  %vec = load <4 x double>, <4 x double>* %vp
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
  ret <4 x double> %res
}

define <4 x double> @test_masked_z_4xdouble_dup_low_mem_mask4(<4 x double>* %vp, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mem_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vcmpeqpd %ymm1, %ymm0, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2]
; CHECK-NEXT:    retq
  %vec = load <4 x double>, <4 x double>* %vp
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
  ret <4 x double> %res
}
; ---- <8 x double>, register source ----
define <8 x double> @test_8xdouble_dup_low(<8 x double> %vec) {
; CHECK-LABEL: test_8xdouble_dup_low:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    retq
  %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  ret <8 x double> %res
}
define <8 x double> @test_masked_8xdouble_dup_low_mask0(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_dup_low_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    vmovapd %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
  ret <8 x double> %res
}

define <8 x double> @test_masked_z_8xdouble_dup_low_mask0(<8 x double> %vec, <8 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
  ret <8 x double> %res
}
define <8 x double> @test_masked_8xdouble_dup_low_mask1(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_dup_low_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    vmovapd %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
  ret <8 x double> %res
}

define <8 x double> @test_masked_z_8xdouble_dup_low_mask1(<8 x double> %vec, <8 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
  ret <8 x double> %res
}
define <8 x double> @test_masked_8xdouble_dup_low_mask2(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_dup_low_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    vmovapd %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
  ret <8 x double> %res
}

define <8 x double> @test_masked_z_8xdouble_dup_low_mask2(<8 x double> %vec, <8 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
  ret <8 x double> %res
}
define <8 x double> @test_masked_8xdouble_dup_low_mask3(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_dup_low_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    vmovapd %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
  ret <8 x double> %res
}

define <8 x double> @test_masked_z_8xdouble_dup_low_mask3(<8 x double> %vec, <8 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
  ret <8 x double> %res
}
define <8 x double> @test_masked_8xdouble_dup_low_mask4(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_dup_low_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    vmovapd %zmm1, %zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
  ret <8 x double> %res
}

define <8 x double> @test_masked_z_8xdouble_dup_low_mask4(<8 x double> %vec, <8 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
  ret <8 x double> %res
}
; ---- <8 x double>, source loaded from memory ----
define <8 x double> @test_8xdouble_dup_low_mem(<8 x double>* %vp) {
; CHECK-LABEL: test_8xdouble_dup_low_mem:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    retq
  %vec = load <8 x double>, <8 x double>* %vp
  %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  ret <8 x double> %res
}
define <8 x double> @test_masked_8xdouble_dup_low_mem_mask0(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_dup_low_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    retq
  %vec = load <8 x double>, <8 x double>* %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
  ret <8 x double> %res
}

define <8 x double> @test_masked_z_8xdouble_dup_low_mem_mask0(<8 x double>* %vp, <8 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    retq
  %vec = load <8 x double>, <8 x double>* %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
  ret <8 x double> %res
}
define <8 x double> @test_masked_8xdouble_dup_low_mem_mask1(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_dup_low_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    retq
  %vec = load <8 x double>, <8 x double>* %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
  ret <8 x double> %res
}

define <8 x double> @test_masked_z_8xdouble_dup_low_mem_mask1(<8 x double>* %vp, <8 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    retq
  %vec = load <8 x double>, <8 x double>* %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
  ret <8 x double> %res
}
define <8 x double> @test_masked_8xdouble_dup_low_mem_mask2(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_dup_low_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    retq
  %vec = load <8 x double>, <8 x double>* %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
  ret <8 x double> %res
}

define <8 x double> @test_masked_z_8xdouble_dup_low_mem_mask2(<8 x double>* %vp, <8 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    retq
  %vec = load <8 x double>, <8 x double>* %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
  ret <8 x double> %res
}
define <8 x double> @test_masked_8xdouble_dup_low_mem_mask3(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_dup_low_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    retq
  %vec = load <8 x double>, <8 x double>* %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
  ret <8 x double> %res
}

define <8 x double> @test_masked_z_8xdouble_dup_low_mem_mask3(<8 x double>* %vp, <8 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    retq
  %vec = load <8 x double>, <8 x double>* %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
  ret <8 x double> %res
}
define <8 x double> @test_masked_8xdouble_dup_low_mem_mask4(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_dup_low_mem_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    retq
  %vec = load <8 x double>, <8 x double>* %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
  ret <8 x double> %res
}

define <8 x double> @test_masked_z_8xdouble_dup_low_mem_mask4(<8 x double>* %vp, <8 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mem_mask4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    retq
  %vec = load <8 x double>, <8 x double>* %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
  ret <8 x double> %res
}
; ---- <4 x float>, register source ----
define <4 x float> @test_4xfloat_dup_low(<4 x float> %vec) {
; CHECK-LABEL: test_4xfloat_dup_low:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; CHECK-NEXT:    retq
  %res = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x float> %res
}
define <4 x float> @test_masked_4xfloat_dup_low_mask0(<4 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
; CHECK-LABEL: test_masked_4xfloat_dup_low_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
; CHECK-NEXT:    vmovsldup {{.*#+}} xmm1 {%k1} = xmm0[0,0,2,2]
; CHECK-NEXT:    vmovaps %xmm1, %xmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
  ret <4 x float> %res
}

define <4 x float> @test_masked_z_4xfloat_dup_low_mask0(<4 x float> %vec, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
  ret <4 x float> %res
}
define <4 x float> @test_masked_4xfloat_dup_low_mask1(<4 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
; CHECK-LABEL: test_masked_4xfloat_dup_low_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
; CHECK-NEXT:    vmovsldup {{.*#+}} xmm1 {%k1} = xmm0[0,0,2,2]
; CHECK-NEXT:    vmovaps %xmm1, %xmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
  ret <4 x float> %res
}

define <4 x float> @test_masked_z_4xfloat_dup_low_mask1(<4 x float> %vec, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
; CHECK-NEXT:    retq
  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
  ret <4 x float> %res
}
define <4 x float>
@test_masked_4xfloat_dup_low_mask2(<4 x float> %vec, <4 x float> %vec2, <4 x float> %mask) { 752; CHECK-LABEL: test_masked_4xfloat_dup_low_mask2: 753; CHECK: # %bb.0: 754; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 755; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1 756; CHECK-NEXT: vmovsldup {{.*#+}} xmm1 {%k1} = xmm0[0,0,2,2] 757; CHECK-NEXT: vmovaps %xmm1, %xmm0 758; CHECK-NEXT: retq 759 %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> 760 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 761 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 762 ret <4 x float> %res 763} 764 765define <4 x float> @test_masked_z_4xfloat_dup_low_mask2(<4 x float> %vec, <4 x float> %mask) { 766; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mask2: 767; CHECK: # %bb.0: 768; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 769; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 770; CHECK-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2] 771; CHECK-NEXT: retq 772 %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> 773 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 774 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer 775 ret <4 x float> %res 776} 777define <4 x float> @test_masked_4xfloat_dup_low_mask3(<4 x float> %vec, <4 x float> %vec2, <4 x float> %mask) { 778; CHECK-LABEL: test_masked_4xfloat_dup_low_mask3: 779; CHECK: # %bb.0: 780; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 781; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1 782; CHECK-NEXT: vmovsldup {{.*#+}} xmm1 {%k1} = xmm0[0,0,2,2] 783; CHECK-NEXT: vmovaps %xmm1, %xmm0 784; CHECK-NEXT: retq 785 %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> 786 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 787 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 788 ret <4 x float> %res 789} 790 791define <4 x float> @test_masked_z_4xfloat_dup_low_mask3(<4 x float> %vec, <4 x 
float> %mask) { 792; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mask3: 793; CHECK: # %bb.0: 794; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 795; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 796; CHECK-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2] 797; CHECK-NEXT: retq 798 %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> 799 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 800 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer 801 ret <4 x float> %res 802} 803define <4 x float> @test_masked_4xfloat_dup_low_mask4(<4 x float> %vec, <4 x float> %vec2, <4 x float> %mask) { 804; CHECK-LABEL: test_masked_4xfloat_dup_low_mask4: 805; CHECK: # %bb.0: 806; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 807; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1 808; CHECK-NEXT: vmovsldup {{.*#+}} xmm1 {%k1} = xmm0[0,0,2,2] 809; CHECK-NEXT: vmovaps %xmm1, %xmm0 810; CHECK-NEXT: retq 811 %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> 812 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 813 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 814 ret <4 x float> %res 815} 816 817define <4 x float> @test_masked_z_4xfloat_dup_low_mask4(<4 x float> %vec, <4 x float> %mask) { 818; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mask4: 819; CHECK: # %bb.0: 820; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 821; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 822; CHECK-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2] 823; CHECK-NEXT: retq 824 %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> 825 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 826 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer 827 ret <4 x float> %res 828} 829define <4 x float> @test_4xfloat_dup_low_mem(<4 x float>* %vp) { 830; CHECK-LABEL: test_4xfloat_dup_low_mem: 831; CHECK: # %bb.0: 832; CHECK-NEXT: vmovsldup {{.*#+}} xmm0 = 
mem[0,0,2,2] 833; CHECK-NEXT: retq 834 %vec = load <4 x float>, <4 x float>* %vp 835 %res = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> 836 ret <4 x float> %res 837} 838define <4 x float> @test_masked_4xfloat_dup_low_mem_mask0(<4 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) { 839; CHECK-LABEL: test_masked_4xfloat_dup_low_mem_mask0: 840; CHECK: # %bb.0: 841; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 842; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 843; CHECK-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} = mem[0,0,2,2] 844; CHECK-NEXT: retq 845 %vec = load <4 x float>, <4 x float>* %vp 846 %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> 847 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 848 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 849 ret <4 x float> %res 850} 851 852define <4 x float> @test_masked_z_4xfloat_dup_low_mem_mask0(<4 x float>* %vp, <4 x float> %mask) { 853; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mem_mask0: 854; CHECK: # %bb.0: 855; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 856; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1 857; CHECK-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = mem[0,0,2,2] 858; CHECK-NEXT: retq 859 %vec = load <4 x float>, <4 x float>* %vp 860 %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> 861 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 862 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer 863 ret <4 x float> %res 864} 865define <4 x float> @test_masked_4xfloat_dup_low_mem_mask1(<4 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) { 866; CHECK-LABEL: test_masked_4xfloat_dup_low_mem_mask1: 867; CHECK: # %bb.0: 868; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 869; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 870; CHECK-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} = mem[0,0,2,2] 871; CHECK-NEXT: retq 872 %vec = load <4 x float>, <4 x float>* %vp 873 %shuf = shufflevector 
<4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> 874 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 875 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 876 ret <4 x float> %res 877} 878 879define <4 x float> @test_masked_z_4xfloat_dup_low_mem_mask1(<4 x float>* %vp, <4 x float> %mask) { 880; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mem_mask1: 881; CHECK: # %bb.0: 882; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 883; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1 884; CHECK-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = mem[0,0,2,2] 885; CHECK-NEXT: retq 886 %vec = load <4 x float>, <4 x float>* %vp 887 %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> 888 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 889 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer 890 ret <4 x float> %res 891} 892define <4 x float> @test_masked_4xfloat_dup_low_mem_mask2(<4 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) { 893; CHECK-LABEL: test_masked_4xfloat_dup_low_mem_mask2: 894; CHECK: # %bb.0: 895; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 896; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 897; CHECK-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} = mem[0,0,2,2] 898; CHECK-NEXT: retq 899 %vec = load <4 x float>, <4 x float>* %vp 900 %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> 901 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 902 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 903 ret <4 x float> %res 904} 905 906define <4 x float> @test_masked_z_4xfloat_dup_low_mem_mask2(<4 x float>* %vp, <4 x float> %mask) { 907; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mem_mask2: 908; CHECK: # %bb.0: 909; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 910; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1 911; CHECK-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = mem[0,0,2,2] 912; CHECK-NEXT: retq 913 %vec = load <4 x float>, <4 x float>* %vp 914 %shuf = 
shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> 915 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 916 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer 917 ret <4 x float> %res 918} 919define <4 x float> @test_masked_4xfloat_dup_low_mem_mask3(<4 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) { 920; CHECK-LABEL: test_masked_4xfloat_dup_low_mem_mask3: 921; CHECK: # %bb.0: 922; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 923; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 924; CHECK-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} = mem[0,0,2,2] 925; CHECK-NEXT: retq 926 %vec = load <4 x float>, <4 x float>* %vp 927 %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> 928 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 929 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 930 ret <4 x float> %res 931} 932 933define <4 x float> @test_masked_z_4xfloat_dup_low_mem_mask3(<4 x float>* %vp, <4 x float> %mask) { 934; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mem_mask3: 935; CHECK: # %bb.0: 936; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 937; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1 938; CHECK-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = mem[0,0,2,2] 939; CHECK-NEXT: retq 940 %vec = load <4 x float>, <4 x float>* %vp 941 %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> 942 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 943 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer 944 ret <4 x float> %res 945} 946define <4 x float> @test_masked_4xfloat_dup_low_mem_mask4(<4 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) { 947; CHECK-LABEL: test_masked_4xfloat_dup_low_mem_mask4: 948; CHECK: # %bb.0: 949; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 950; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 951; CHECK-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} = mem[0,0,2,2] 952; CHECK-NEXT: retq 953 %vec = load <4 x float>, 
<4 x float>* %vp 954 %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> 955 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 956 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 957 ret <4 x float> %res 958} 959 960define <4 x float> @test_masked_z_4xfloat_dup_low_mem_mask4(<4 x float>* %vp, <4 x float> %mask) { 961; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mem_mask4: 962; CHECK: # %bb.0: 963; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 964; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1 965; CHECK-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = mem[0,0,2,2] 966; CHECK-NEXT: retq 967 %vec = load <4 x float>, <4 x float>* %vp 968 %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> 969 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 970 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer 971 ret <4 x float> %res 972} 973define <8 x float> @test_8xfloat_dup_low(<8 x float> %vec) { 974; CHECK-LABEL: test_8xfloat_dup_low: 975; CHECK: # %bb.0: 976; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] 977; CHECK-NEXT: retq 978 %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> 979 ret <8 x float> %res 980} 981define <8 x float> @test_masked_8xfloat_dup_low_mask0(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) { 982; CHECK-LABEL: test_masked_8xfloat_dup_low_mask0: 983; CHECK: # %bb.0: 984; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 985; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 986; CHECK-NEXT: vmovsldup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6] 987; CHECK-NEXT: vmovaps %ymm1, %ymm0 988; CHECK-NEXT: retq 989 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> 990 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 991 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 992 ret <8 x 
float> %res 993} 994 995define <8 x float> @test_masked_z_8xfloat_dup_low_mask0(<8 x float> %vec, <8 x float> %mask) { 996; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mask0: 997; CHECK: # %bb.0: 998; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 999; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 1000; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6] 1001; CHECK-NEXT: retq 1002 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> 1003 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 1004 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer 1005 ret <8 x float> %res 1006} 1007define <8 x float> @test_masked_8xfloat_dup_low_mask1(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) { 1008; CHECK-LABEL: test_masked_8xfloat_dup_low_mask1: 1009; CHECK: # %bb.0: 1010; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 1011; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 1012; CHECK-NEXT: vmovsldup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6] 1013; CHECK-NEXT: vmovaps %ymm1, %ymm0 1014; CHECK-NEXT: retq 1015 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> 1016 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 1017 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 1018 ret <8 x float> %res 1019} 1020 1021define <8 x float> @test_masked_z_8xfloat_dup_low_mask1(<8 x float> %vec, <8 x float> %mask) { 1022; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mask1: 1023; CHECK: # %bb.0: 1024; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 1025; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 1026; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6] 1027; CHECK-NEXT: retq 1028 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> 1029 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 1030 %res = select <8 x i1> %cmp, <8 x float> %shuf, 
<8 x float> zeroinitializer 1031 ret <8 x float> %res 1032} 1033define <8 x float> @test_masked_8xfloat_dup_low_mask2(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) { 1034; CHECK-LABEL: test_masked_8xfloat_dup_low_mask2: 1035; CHECK: # %bb.0: 1036; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 1037; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 1038; CHECK-NEXT: vmovsldup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6] 1039; CHECK-NEXT: vmovaps %ymm1, %ymm0 1040; CHECK-NEXT: retq 1041 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> 1042 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 1043 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 1044 ret <8 x float> %res 1045} 1046 1047define <8 x float> @test_masked_z_8xfloat_dup_low_mask2(<8 x float> %vec, <8 x float> %mask) { 1048; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mask2: 1049; CHECK: # %bb.0: 1050; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 1051; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 1052; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6] 1053; CHECK-NEXT: retq 1054 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> 1055 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 1056 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer 1057 ret <8 x float> %res 1058} 1059define <8 x float> @test_masked_8xfloat_dup_low_mask3(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) { 1060; CHECK-LABEL: test_masked_8xfloat_dup_low_mask3: 1061; CHECK: # %bb.0: 1062; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 1063; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 1064; CHECK-NEXT: vmovsldup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6] 1065; CHECK-NEXT: vmovaps %ymm1, %ymm0 1066; CHECK-NEXT: retq 1067 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> 1068 %cmp = 
fcmp oeq <8 x float> %mask, zeroinitializer 1069 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 1070 ret <8 x float> %res 1071} 1072 1073define <8 x float> @test_masked_z_8xfloat_dup_low_mask3(<8 x float> %vec, <8 x float> %mask) { 1074; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mask3: 1075; CHECK: # %bb.0: 1076; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 1077; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 1078; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6] 1079; CHECK-NEXT: retq 1080 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> 1081 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 1082 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer 1083 ret <8 x float> %res 1084} 1085define <8 x float> @test_masked_8xfloat_dup_low_mask4(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) { 1086; CHECK-LABEL: test_masked_8xfloat_dup_low_mask4: 1087; CHECK: # %bb.0: 1088; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 1089; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 1090; CHECK-NEXT: vmovsldup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6] 1091; CHECK-NEXT: vmovaps %ymm1, %ymm0 1092; CHECK-NEXT: retq 1093 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> 1094 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 1095 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 1096 ret <8 x float> %res 1097} 1098 1099define <8 x float> @test_masked_z_8xfloat_dup_low_mask4(<8 x float> %vec, <8 x float> %mask) { 1100; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mask4: 1101; CHECK: # %bb.0: 1102; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 1103; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 1104; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6] 1105; CHECK-NEXT: retq 1106 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, 
i32 4, i32 4, i32 6, i32 6> 1107 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 1108 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer 1109 ret <8 x float> %res 1110} 1111define <8 x float> @test_8xfloat_dup_low_mem(<8 x float>* %vp) { 1112; CHECK-LABEL: test_8xfloat_dup_low_mem: 1113; CHECK: # %bb.0: 1114; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 = mem[0,0,2,2,4,4,6,6] 1115; CHECK-NEXT: retq 1116 %vec = load <8 x float>, <8 x float>* %vp 1117 %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> 1118 ret <8 x float> %res 1119} 1120define <8 x float> @test_masked_8xfloat_dup_low_mem_mask0(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) { 1121; CHECK-LABEL: test_masked_8xfloat_dup_low_mem_mask0: 1122; CHECK: # %bb.0: 1123; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 1124; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 1125; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2,4,4,6,6] 1126; CHECK-NEXT: retq 1127 %vec = load <8 x float>, <8 x float>* %vp 1128 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> 1129 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 1130 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 1131 ret <8 x float> %res 1132} 1133 1134define <8 x float> @test_masked_z_8xfloat_dup_low_mem_mask0(<8 x float>* %vp, <8 x float> %mask) { 1135; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mem_mask0: 1136; CHECK: # %bb.0: 1137; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 1138; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k1 1139; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6] 1140; CHECK-NEXT: retq 1141 %vec = load <8 x float>, <8 x float>* %vp 1142 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> 1143 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 1144 %res = select <8 x i1> %cmp, <8 
x float> %shuf, <8 x float> zeroinitializer 1145 ret <8 x float> %res 1146} 1147define <8 x float> @test_masked_8xfloat_dup_low_mem_mask1(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) { 1148; CHECK-LABEL: test_masked_8xfloat_dup_low_mem_mask1: 1149; CHECK: # %bb.0: 1150; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 1151; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 1152; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2,4,4,6,6] 1153; CHECK-NEXT: retq 1154 %vec = load <8 x float>, <8 x float>* %vp 1155 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> 1156 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 1157 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 1158 ret <8 x float> %res 1159} 1160 1161define <8 x float> @test_masked_z_8xfloat_dup_low_mem_mask1(<8 x float>* %vp, <8 x float> %mask) { 1162; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mem_mask1: 1163; CHECK: # %bb.0: 1164; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 1165; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k1 1166; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6] 1167; CHECK-NEXT: retq 1168 %vec = load <8 x float>, <8 x float>* %vp 1169 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> 1170 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 1171 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer 1172 ret <8 x float> %res 1173} 1174define <8 x float> @test_masked_8xfloat_dup_low_mem_mask2(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) { 1175; CHECK-LABEL: test_masked_8xfloat_dup_low_mem_mask2: 1176; CHECK: # %bb.0: 1177; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 1178; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 1179; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2,4,4,6,6] 1180; CHECK-NEXT: retq 1181 %vec = load <8 x float>, <8 x float>* %vp 1182 %shuf = shufflevector <8 x float> %vec, 
<8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> 1183 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 1184 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 1185 ret <8 x float> %res 1186} 1187 1188define <8 x float> @test_masked_z_8xfloat_dup_low_mem_mask2(<8 x float>* %vp, <8 x float> %mask) { 1189; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mem_mask2: 1190; CHECK: # %bb.0: 1191; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 1192; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k1 1193; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6] 1194; CHECK-NEXT: retq 1195 %vec = load <8 x float>, <8 x float>* %vp 1196 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> 1197 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 1198 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer 1199 ret <8 x float> %res 1200} 1201define <8 x float> @test_masked_8xfloat_dup_low_mem_mask3(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) { 1202; CHECK-LABEL: test_masked_8xfloat_dup_low_mem_mask3: 1203; CHECK: # %bb.0: 1204; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 1205; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 1206; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2,4,4,6,6] 1207; CHECK-NEXT: retq 1208 %vec = load <8 x float>, <8 x float>* %vp 1209 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> 1210 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 1211 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 1212 ret <8 x float> %res 1213} 1214 1215define <8 x float> @test_masked_z_8xfloat_dup_low_mem_mask3(<8 x float>* %vp, <8 x float> %mask) { 1216; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mem_mask3: 1217; CHECK: # %bb.0: 1218; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 1219; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k1 1220; CHECK-NEXT: vmovsldup 
{{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6] 1221; CHECK-NEXT: retq 1222 %vec = load <8 x float>, <8 x float>* %vp 1223 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> 1224 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 1225 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer 1226 ret <8 x float> %res 1227} 1228define <8 x float> @test_masked_8xfloat_dup_low_mem_mask4(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) { 1229; CHECK-LABEL: test_masked_8xfloat_dup_low_mem_mask4: 1230; CHECK: # %bb.0: 1231; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 1232; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 1233; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2,4,4,6,6] 1234; CHECK-NEXT: retq 1235 %vec = load <8 x float>, <8 x float>* %vp 1236 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> 1237 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 1238 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 1239 ret <8 x float> %res 1240} 1241 1242define <8 x float> @test_masked_z_8xfloat_dup_low_mem_mask4(<8 x float>* %vp, <8 x float> %mask) { 1243; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mem_mask4: 1244; CHECK: # %bb.0: 1245; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 1246; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k1 1247; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6] 1248; CHECK-NEXT: retq 1249 %vec = load <8 x float>, <8 x float>* %vp 1250 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> 1251 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 1252 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer 1253 ret <8 x float> %res 1254} 1255define <16 x float> @test_16xfloat_dup_low(<16 x float> %vec) { 1256; CHECK-LABEL: test_16xfloat_dup_low: 1257; CHECK: # %bb.0: 
; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: retq
  %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  ret <16 x float> %res
}

; ---------------------------------------------------------------------------
; <16 x float>, register source, masked forms.
;
; Every test below applies the same shuffle: indices <0,0,2,2,...,14,14>, i.e.
; each even-indexed element is copied into the odd lane that follows it.
; The per-lane predicate is `fcmp oeq %mask, zeroinitializer`, so a lane takes
; the shuffled value where %mask compares equal to zero.
;   * test_masked_*   : other lanes take %vec2; the expected assembly writes
;                       zmm1 under {%k1} and then copies zmm1 into zmm0.
;   * test_masked_z_* : other lanes are zero; the expected assembly uses
;                       {%k1} {z} directly on zmm0.
; The _mask0 .. _mask4 variants have identical IR bodies and identical
; expected assembly; only the symbol name differs.
; ---------------------------------------------------------------------------

; Merge form: lanes not selected by the compare come from %vec2.
define <16 x float> @test_masked_16xfloat_dup_low_mask0(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_dup_low_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
; CHECK-NEXT: vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
  %dup = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  %is_zero = fcmp oeq <16 x float> %mask, zeroinitializer
  %blend = select <16 x i1> %is_zero, <16 x float> %dup, <16 x float> %vec2
  ret <16 x float> %blend
}

; Zeroing form: lanes not selected by the compare are zero.
define <16 x float> @test_masked_z_16xfloat_dup_low_mask0(<16 x float> %vec, <16 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: retq
  %dup = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  %is_zero = fcmp oeq <16 x float> %mask, zeroinitializer
  %blend = select <16 x i1> %is_zero, <16 x float> %dup, <16 x float> zeroinitializer
  ret <16 x float> %blend
}

; Merge form, variant 1 (same body as _mask0).
define <16 x float> @test_masked_16xfloat_dup_low_mask1(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_dup_low_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
; CHECK-NEXT: vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
  %dup = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  %is_zero = fcmp oeq <16 x float> %mask, zeroinitializer
  %blend = select <16 x i1> %is_zero, <16 x float> %dup, <16 x float> %vec2
  ret <16 x float> %blend
}

; Zeroing form, variant 1 (same body as _mask0).
define <16 x float> @test_masked_z_16xfloat_dup_low_mask1(<16 x float> %vec, <16 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: retq
  %dup = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  %is_zero = fcmp oeq <16 x float> %mask, zeroinitializer
  %blend = select <16 x i1> %is_zero, <16 x float> %dup, <16 x float> zeroinitializer
  ret <16 x float> %blend
}

; Merge form, variant 2 (same body as _mask0).
define <16 x float> @test_masked_16xfloat_dup_low_mask2(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_dup_low_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
; CHECK-NEXT: vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
  %dup = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  %is_zero = fcmp oeq <16 x float> %mask, zeroinitializer
  %blend = select <16 x i1> %is_zero, <16 x float> %dup, <16 x float> %vec2
  ret <16 x float> %blend
}

; Zeroing form, variant 2 (same body as _mask0).
define <16 x float> @test_masked_z_16xfloat_dup_low_mask2(<16 x float> %vec, <16 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: retq
  %dup = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  %is_zero = fcmp oeq <16 x float> %mask, zeroinitializer
  %blend = select <16 x i1> %is_zero, <16 x float> %dup, <16 x float> zeroinitializer
  ret <16 x float> %blend
}

; Merge form, variant 3 (same body as _mask0).
define <16 x float> @test_masked_16xfloat_dup_low_mask3(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_dup_low_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
; CHECK-NEXT: vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
  %dup = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  %is_zero = fcmp oeq <16 x float> %mask, zeroinitializer
  %blend = select <16 x i1> %is_zero, <16 x float> %dup, <16 x float> %vec2
  ret <16 x float> %blend
}

; Zeroing form, variant 3 (same body as _mask0).
define <16 x float> @test_masked_z_16xfloat_dup_low_mask3(<16 x float> %vec, <16 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: retq
  %dup = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  %is_zero = fcmp oeq <16 x float> %mask, zeroinitializer
  %blend = select <16 x i1> %is_zero, <16 x float> %dup, <16 x float> zeroinitializer
  ret <16 x float> %blend
}

; Merge form, variant 4 (same body as _mask0).
define <16 x float> @test_masked_16xfloat_dup_low_mask4(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_dup_low_mask4:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
; CHECK-NEXT: vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
  %dup = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  %is_zero = fcmp oeq <16 x float> %mask, zeroinitializer
  %blend = select <16 x i1> %is_zero, <16 x float> %dup, <16 x float> %vec2
  ret <16 x float> %blend
}

; Zeroing form, variant 4 (same body as _mask0).
define <16 x float> @test_masked_z_16xfloat_dup_low_mask4(<16 x float> %vec, <16 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mask4:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: retq
  %dup = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  %is_zero = fcmp oeq <16 x float> %mask, zeroinitializer
  %blend = select <16 x i1> %is_zero, <16 x float> %dup, <16 x float> zeroinitializer
  ret <16 x float> %blend
}

; ---------------------------------------------------------------------------
; <16 x float>, memory source.
;
; Same shuffle and same predicate as the register tests above, but the input
; vector is loaded through %vp first. The expected assembly shows the load as
; the `mem[...]` operand of vmovsldup, with no separate load instruction.
; In the merge form %vec2 arrives in zmm0, so the expected assembly writes the
; masked result straight into zmm0 and needs no trailing register copy.
; The _mem_mask0 .. _mem_mask4 variants again differ only by symbol name.
; ---------------------------------------------------------------------------

; Unmasked: load, then duplicate the even elements.
define <16 x float> @test_16xfloat_dup_low_mem(<16 x float>* %vp) {
; CHECK-LABEL: test_16xfloat_dup_low_mem:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: retq
  %src = load <16 x float>, <16 x float>* %vp
  %dup = shufflevector <16 x float> %src, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  ret <16 x float> %dup
}

; Merge form from memory: lanes not selected by the compare come from %vec2.
define <16 x float> @test_masked_16xfloat_dup_low_mem_mask0(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_dup_low_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: retq
  %src = load <16 x float>, <16 x float>* %vp
  %dup = shufflevector <16 x float> %src, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  %is_zero = fcmp oeq <16 x float> %mask, zeroinitializer
  %blend = select <16 x i1> %is_zero, <16 x float> %dup, <16 x float> %vec2
  ret <16 x float> %blend
}

; Zeroing form from memory: lanes not selected by the compare are zero.
define <16 x float> @test_masked_z_16xfloat_dup_low_mem_mask0(<16 x float>* %vp, <16 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: retq
  %src = load <16 x float>, <16 x float>* %vp
  %dup = shufflevector <16 x float> %src, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  %is_zero = fcmp oeq <16 x float> %mask, zeroinitializer
  %blend = select <16 x i1> %is_zero, <16 x float> %dup, <16 x float> zeroinitializer
  ret <16 x float> %blend
}

; Merge form from memory, variant 1 (same body as _mem_mask0).
define <16 x float> @test_masked_16xfloat_dup_low_mem_mask1(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_dup_low_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: retq
  %src = load <16 x float>, <16 x float>* %vp
  %dup = shufflevector <16 x float> %src, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  %is_zero = fcmp oeq <16 x float> %mask, zeroinitializer
  %blend = select <16 x i1> %is_zero, <16 x float> %dup, <16 x float> %vec2
  ret <16 x float> %blend
}

; Zeroing form from memory, variant 1 (same body as _mem_mask0).
define <16 x float> @test_masked_z_16xfloat_dup_low_mem_mask1(<16 x float>* %vp, <16 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: retq
  %src = load <16 x float>, <16 x float>* %vp
  %dup = shufflevector <16 x float> %src, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  %is_zero = fcmp oeq <16 x float> %mask, zeroinitializer
  %blend = select <16 x i1> %is_zero, <16 x float> %dup, <16 x float> zeroinitializer
  ret <16 x float> %blend
}

; Merge form from memory, variant 2 (same body as _mem_mask0).
define <16 x float> @test_masked_16xfloat_dup_low_mem_mask2(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_dup_low_mem_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: retq
  %src = load <16 x float>, <16 x float>* %vp
  %dup = shufflevector <16 x float> %src, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  %is_zero = fcmp oeq <16 x float> %mask, zeroinitializer
  %blend = select <16 x i1> %is_zero, <16 x float> %dup, <16 x float> %vec2
  ret <16 x float> %blend
}

; Zeroing form from memory, variant 2 (same body as _mem_mask0).
define <16 x float> @test_masked_z_16xfloat_dup_low_mem_mask2(<16 x float>* %vp, <16 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mem_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: retq
  %src = load <16 x float>, <16 x float>* %vp
  %dup = shufflevector <16 x float> %src, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  %is_zero = fcmp oeq <16 x float> %mask, zeroinitializer
  %blend = select <16 x i1> %is_zero, <16 x float> %dup, <16 x float> zeroinitializer
  ret <16 x float> %blend
}

; Merge form from memory, variant 3 (same body as _mem_mask0).
define <16 x float> @test_masked_16xfloat_dup_low_mem_mask3(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_dup_low_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: retq
  %src = load <16 x float>, <16 x float>* %vp
  %dup = shufflevector <16 x float> %src, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  %is_zero = fcmp oeq <16 x float> %mask, zeroinitializer
  %blend = select <16 x i1> %is_zero, <16 x float> %dup, <16 x float> %vec2
  ret <16 x float> %blend
}

; Zeroing form from memory, variant 3 (same body as _mem_mask0).
define <16 x float> @test_masked_z_16xfloat_dup_low_mem_mask3(<16 x float>* %vp, <16 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: retq
  %src = load <16 x float>, <16 x float>* %vp
  %dup = shufflevector <16 x float> %src, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  %is_zero = fcmp oeq <16 x float> %mask, zeroinitializer
  %blend = select <16 x i1> %is_zero, <16 x float> %dup, <16 x float> zeroinitializer
  ret <16 x float> %blend
}

; Merge form from memory, variant 4 (same body as _mem_mask0).
define <16 x float> @test_masked_16xfloat_dup_low_mem_mask4(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_dup_low_mem_mask4:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: retq
  %src = load <16 x float>, <16 x float>* %vp
  %dup = shufflevector <16 x float> %src, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  %is_zero = fcmp oeq <16 x float> %mask, zeroinitializer
  %blend = select <16 x i1> %is_zero, <16 x float> %dup, <16 x float> %vec2
  ret <16 x float> %blend
}

; Zeroing form from memory, variant 4 (same body as _mem_mask0).
define <16 x float> @test_masked_z_16xfloat_dup_low_mem_mask4(<16 x float>* %vp, <16 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mem_mask4:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: retq
  %src = load <16 x float>, <16 x float>* %vp
  %dup = shufflevector <16 x float> %src, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  %is_zero = fcmp oeq <16 x float> %mask, zeroinitializer
  %blend = select <16 x i1> %is_zero, <16 x float> %dup, <16 x float> zeroinitializer
  ret <16 x float> %blend
}