; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX
;
; 32-bit SSE tests to make sure we do reasonable things.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse | FileCheck %s --check-prefixes=X86-SSE,X86-SSE1
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=X86-SSE,X86-SSE41

; Function names in this file spell out the vector being built, one character
; per lane - a digit (hex for 16 lanes) is the index of the element loaded from
; %ptr into that lane, 'u' is a lane that is never written (undef) and 'z' is a
; lane holding zero.

; Builds <2 x double> from the adjacent elements ptr[2] and ptr[3].
define <2 x double> @merge_2f64_f64_23(double* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_2f64_f64_23:
; SSE:       # %bb.0:
; SSE-NEXT:    movups 16(%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_2f64_f64_23:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups 16(%rdi), %xmm0
; AVX-NEXT:    retq
;
; X86-SSE1-LABEL: merge_2f64_f64_23:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT:    fldl 16(%eax)
; X86-SSE1-NEXT:    fldl 24(%eax)
; X86-SSE1-NEXT:    fxch %st(1)
; X86-SSE1-NEXT:    retl
;
; X86-SSE41-LABEL: merge_2f64_f64_23:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT:    movups 16(%eax), %xmm0
; X86-SSE41-NEXT:    retl
  %ptr0 = getelementptr inbounds double, double* %ptr, i64 2
  %ptr1 = getelementptr inbounds double, double* %ptr, i64 3
  %val0 = load double, double* %ptr0
  %val1 = load double, double* %ptr1
  %res0 = insertelement <2 x double> undef, double %val0, i32 0
  %res1 = insertelement <2 x double> %res0, double %val1, i32 1
  ret <2 x double> %res1
}

; Builds <2 x i64> from the adjacent elements ptr[1] and ptr[2].
define <2 x i64> @merge_2i64_i64_12(i64* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_2i64_i64_12:
; SSE:       # %bb.0:
; SSE-NEXT:    movups 8(%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_2i64_i64_12:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups 8(%rdi), %xmm0
; AVX-NEXT:    retq
;
; X86-SSE1-LABEL: merge_2i64_i64_12:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    pushl %edi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE1-NEXT:    pushl %esi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
; X86-SSE1-NEXT:    .cfi_offset %esi, -12
; X86-SSE1-NEXT:    .cfi_offset %edi, -8
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE1-NEXT:    movl 8(%ecx), %edx
; X86-SSE1-NEXT:    movl 12(%ecx), %esi
; X86-SSE1-NEXT:    movl 16(%ecx), %edi
; X86-SSE1-NEXT:    movl 20(%ecx), %ecx
; X86-SSE1-NEXT:    movl %ecx, 12(%eax)
; X86-SSE1-NEXT:    movl %edi, 8(%eax)
; X86-SSE1-NEXT:    movl %esi, 4(%eax)
; X86-SSE1-NEXT:    movl %edx, (%eax)
; X86-SSE1-NEXT:    popl %esi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE1-NEXT:    popl %edi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
; X86-SSE1-NEXT:    retl $4
;
; X86-SSE41-LABEL: merge_2i64_i64_12:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT:    movups 8(%eax), %xmm0
; X86-SSE41-NEXT:    retl
  %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1
  %ptr1 = getelementptr inbounds i64, i64* %ptr, i64 2
  %val0 = load i64, i64* %ptr0
  %val1 = load i64, i64* %ptr1
  %res0 = insertelement <2 x i64> undef, i64 %val0, i32 0
  %res1 = insertelement <2 x i64> %res0, i64 %val1, i32 1
  ret <2 x i64> %res1
}

; Builds <4 x float> from the four adjacent elements ptr[2..5].
define <4 x float> @merge_4f32_f32_2345(float* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4f32_f32_2345:
; SSE:       # %bb.0:
; SSE-NEXT:    movups 8(%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_4f32_f32_2345:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups 8(%rdi), %xmm0
; AVX-NEXT:    retq
;
; X86-SSE-LABEL: merge_4f32_f32_2345:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movups 8(%eax), %xmm0
; X86-SSE-NEXT:    retl
  %ptr0 = getelementptr inbounds float, float* %ptr, i64 2
  %ptr1 = getelementptr inbounds float, float* %ptr, i64 3
  %ptr2 = getelementptr inbounds float, float* %ptr, i64 4
  %ptr3 = getelementptr inbounds float, float* %ptr, i64 5
  %val0 = load float, float* %ptr0
  %val1 = load float, float* %ptr1
  %val2 = load float, float* %ptr2
  %val3 = load float, float* %ptr3
  %res0 = insertelement <4 x float> undef, float %val0, i32 0
  %res1 = insertelement <4 x float> %res0, float %val1, i32 1
  %res2 = insertelement <4 x float> %res1, float %val2, i32 2
  %res3 = insertelement <4 x float> %res2, float %val3, i32 3
  ret <4 x float> %res3
}

; Lane 0 is ptr[3], lane 1 is 0.0, lanes 2 and 3 are left undef.
define <4 x float> @merge_4f32_f32_3zuu(float* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4f32_f32_3zuu:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_4f32_f32_3zuu:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    retq
;
; X86-SSE-LABEL: merge_4f32_f32_3zuu:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    retl
  %ptr0 = getelementptr inbounds float, float* %ptr, i64 3
  %val0 = load float, float* %ptr0
  %res0 = insertelement <4 x float> undef, float %val0, i32 0
  %res1 = insertelement <4 x float> %res0, float 0.0, i32 1
  ret <4 x float> %res1
}

; Lanes 0 and 1 are ptr[3] and ptr[4]; the upper two lanes are left undef.
define <4 x float> @merge_4f32_f32_34uu(float* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4f32_f32_34uu:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_4f32_f32_34uu:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    retq
;
; X86-SSE1-LABEL: merge_4f32_f32_34uu:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
; X86-SSE1-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; X86-SSE1-NEXT:    retl
;
; X86-SSE41-LABEL: merge_4f32_f32_34uu:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X86-SSE41-NEXT:    retl
  %ptr0 = getelementptr inbounds float, float* %ptr, i64 3
  %ptr1 = getelementptr inbounds float, float* %ptr, i64 4
  %val0 = load float, float* %ptr0
  %val1 = load float, float* %ptr1
  %res0 = insertelement <4 x float> undef, float %val0, i32 0
  %res1 = insertelement <4 x float> %res0, float %val1, i32 1
  ret <4 x float> %res1
}

; Lanes 0, 1 and 3 are ptr[3], ptr[4] and ptr[6]; lane 2 stays zero because the
; inserts start from zeroinitializer.
define <4 x float> @merge_4f32_f32_34z6(float* %ptr) nounwind uwtable noinline ssp {
; SSE2-LABEL: merge_4f32_f32_34z6:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movups 12(%rdi), %xmm0
; SSE2-NEXT:    xorps %xmm1, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: merge_4f32_f32_34z6:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movups 12(%rdi), %xmm1
; SSE41-NEXT:    xorps %xmm0, %xmm0
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: merge_4f32_f32_34z6:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2],mem[3]
; AVX-NEXT:    retq
;
; X86-SSE1-LABEL: merge_4f32_f32_34z6:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT:    movups 12(%eax), %xmm0
; X86-SSE1-NEXT:    xorps %xmm1, %xmm1
; X86-SSE1-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
; X86-SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; X86-SSE1-NEXT:    retl
;
; X86-SSE41-LABEL: merge_4f32_f32_34z6:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT:    movups 12(%eax), %xmm1
; X86-SSE41-NEXT:    xorps %xmm0, %xmm0
; X86-SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
; X86-SSE41-NEXT:    retl
  %ptr0 = getelementptr inbounds float, float* %ptr, i64 3
  %ptr1 = getelementptr inbounds float, float* %ptr, i64 4
  %ptr3 = getelementptr inbounds float, float* %ptr, i64 6
  %val0 = load float, float* %ptr0
  %val1 = load float, float* %ptr1
  %val3 = load float, float* %ptr3
  %res0 = insertelement <4 x float> zeroinitializer, float %val0, i32 0
  %res1 = insertelement <4 x float> %res0, float %val1, i32 1
  %res3 = insertelement <4 x float> %res1, float %val3, i32 3
  ret <4 x float> %res3
}

; Lanes 0 and 1 are ptr[4] and ptr[5]; the upper two lanes stay zero.
define <4 x float> @merge_4f32_f32_45zz(float* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4f32_f32_45zz:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_4f32_f32_45zz:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    retq
;
; X86-SSE1-LABEL: merge_4f32_f32_45zz:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
; X86-SSE1-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; X86-SSE1-NEXT:    retl
;
; X86-SSE41-LABEL: merge_4f32_f32_45zz:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X86-SSE41-NEXT:    retl
  %ptr0 = getelementptr inbounds float, float* %ptr, i64 4
  %ptr1 = getelementptr inbounds float, float* %ptr, i64 5
  %val0 = load float, float* %ptr0
  %val1 = load float, float* %ptr1
  %res0 = insertelement <4 x float> zeroinitializer, float %val0, i32 0
  %res1 = insertelement <4 x float> %res0, float %val1, i32 1
  ret <4 x float> %res1
}

; Lanes 0-2 are ptr[0..2]; lane 3 is explicitly written with undef.
define <4 x float> @merge_4f32_f32_012u(float* %ptr) nounwind uwtable noinline ssp {
; SSE2-LABEL: merge_4f32_f32_012u:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: merge_4f32_f32_012u:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: merge_4f32_f32_012u:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX-NEXT:    retq
;
; X86-SSE1-LABEL: merge_4f32_f32_012u:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
; X86-SSE1-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; X86-SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE1-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X86-SSE1-NEXT:    retl
;
; X86-SSE41-LABEL: merge_4f32_f32_012u:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X86-SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; X86-SSE41-NEXT:    retl
  %ptr0 = getelementptr inbounds float, float* %ptr, i64 0
  %ptr1 = getelementptr inbounds float, float* %ptr, i64 1
  %ptr2 = getelementptr inbounds float, float* %ptr, i64 2
  %val0 = load float, float* %ptr0
  %val1 = load float, float* %ptr1
  %val2 = load float, float* %ptr2
  %res0 = insertelement <4 x float> undef, float %val0, i32 0
  %res1 = insertelement <4 x float> %res0, float %val1, i32 1
  %res2 = insertelement <4 x float> %res1, float %val2, i32 2
  %res3 = insertelement <4 x float> %res2, float undef, i32 3
  ret <4 x float> %res3
}

; Same shape as merge_4f32_f32_012u, but lane 2 comes from the non-adjacent
; element ptr[9].
define <4 x float> @merge_4f32_f32_019u(float* %ptr) nounwind uwtable noinline ssp {
; SSE2-LABEL: merge_4f32_f32_019u:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: merge_4f32_f32_019u:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: merge_4f32_f32_019u:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX-NEXT:    retq
;
; X86-SSE1-LABEL: merge_4f32_f32_019u:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
; X86-SSE1-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; X86-SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE1-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X86-SSE1-NEXT:    retl
;
; X86-SSE41-LABEL: merge_4f32_f32_019u:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X86-SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; X86-SSE41-NEXT:    retl
  %ptr0 = getelementptr inbounds float, float* %ptr, i64 0
  %ptr1 = getelementptr inbounds float, float* %ptr, i64 1
  %ptr2 = getelementptr inbounds float, float* %ptr, i64 9
  %val0 = load float, float* %ptr0
  %val1 = load float, float* %ptr1
  %val2 = load float, float* %ptr2
  %res0 = insertelement <4 x float> undef, float %val0, i32 0
  %res1 = insertelement <4 x float> %res0, float %val1, i32 1
  %res2 = insertelement <4 x float> %res1, float %val2, i32 2
  %res3 = insertelement <4 x float> %res2, float undef, i32 3
  ret <4 x float> %res3
}

; Lanes 0, 1 and 3 are ptr[2], ptr[3] and ptr[5]; lane 2 is never written.
define <4 x i32> @merge_4i32_i32_23u5(i32* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4i32_i32_23u5:
; SSE:       # %bb.0:
; SSE-NEXT:    movups 8(%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_4i32_i32_23u5:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups 8(%rdi), %xmm0
; AVX-NEXT:    retq
;
; X86-SSE1-LABEL: merge_4i32_i32_23u5:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    pushl %esi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE1-NEXT:    .cfi_offset %esi, -8
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE1-NEXT:    movl 8(%ecx), %edx
; X86-SSE1-NEXT:    movl 12(%ecx), %esi
; X86-SSE1-NEXT:    movl 20(%ecx), %ecx
; X86-SSE1-NEXT:    movl %esi, 4(%eax)
; X86-SSE1-NEXT:    movl %edx, (%eax)
; X86-SSE1-NEXT:    movl %ecx, 12(%eax)
; X86-SSE1-NEXT:    popl %esi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
; X86-SSE1-NEXT:    retl $4
;
; X86-SSE41-LABEL: merge_4i32_i32_23u5:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT:    movups 8(%eax), %xmm0
; X86-SSE41-NEXT:    retl
  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 2
  %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 3
  %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 5
  %val0 = load i32, i32* %ptr0
  %val1 = load i32, i32* %ptr1
  %val3 = load i32, i32* %ptr3
  %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
  %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
  %res3 = insertelement <4 x i32> %res1, i32 %val3, i32 3
  ret <4 x i32> %res3
}

; As merge_4i32_i32_23u5, but ptr[2] is incremented and stored back between the
; loads; the vector uses the value read before the store.
define <4 x i32> @merge_4i32_i32_23u5_inc2(i32* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4i32_i32_23u5_inc2:
; SSE:       # %bb.0:
; SSE-NEXT:    movups 8(%rdi), %xmm0
; SSE-NEXT:    incl 8(%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_4i32_i32_23u5_inc2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups 8(%rdi), %xmm0
; AVX-NEXT:    incl 8(%rdi)
; AVX-NEXT:    retq
;
; X86-SSE1-LABEL: merge_4i32_i32_23u5_inc2:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    pushl %edi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE1-NEXT:    pushl %esi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
; X86-SSE1-NEXT:    .cfi_offset %esi, -12
; X86-SSE1-NEXT:    .cfi_offset %edi, -8
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE1-NEXT:    movl 8(%ecx), %edx
; X86-SSE1-NEXT:    movl 12(%ecx), %esi
; X86-SSE1-NEXT:    leal 1(%edx), %edi
; X86-SSE1-NEXT:    movl %edi, 8(%ecx)
; X86-SSE1-NEXT:    movl 20(%ecx), %ecx
; X86-SSE1-NEXT:    movl %esi, 4(%eax)
; X86-SSE1-NEXT:    movl %edx, (%eax)
; X86-SSE1-NEXT:    movl %ecx, 12(%eax)
; X86-SSE1-NEXT:    popl %esi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE1-NEXT:    popl %edi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
; X86-SSE1-NEXT:    retl $4
;
; X86-SSE41-LABEL: merge_4i32_i32_23u5_inc2:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT:    movups 8(%eax), %xmm0
; X86-SSE41-NEXT:    incl 8(%eax)
; X86-SSE41-NEXT:    retl
  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 2
  %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 3
  %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 5
  %val0 = load i32, i32* %ptr0
  %inc = add i32 %val0, 1
  store i32 %inc, i32* %ptr0
  %val1 = load i32, i32* %ptr1
  %val3 = load i32, i32* %ptr3
  %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
  %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
  %res3 = insertelement <4 x i32> %res1, i32 %val3, i32 3
  ret <4 x i32> %res3
}

; As merge_4i32_i32_23u5, but ptr[3] is incremented and stored back between the
; loads; the vector uses the value read before the store.
define <4 x i32> @merge_4i32_i32_23u5_inc3(i32* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4i32_i32_23u5_inc3:
; SSE:       # %bb.0:
; SSE-NEXT:    movups 8(%rdi), %xmm0
; SSE-NEXT:    incl 12(%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_4i32_i32_23u5_inc3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups 8(%rdi), %xmm0
; AVX-NEXT:    incl 12(%rdi)
; AVX-NEXT:    retq
;
; X86-SSE1-LABEL: merge_4i32_i32_23u5_inc3:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    pushl %edi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE1-NEXT:    pushl %esi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
; X86-SSE1-NEXT:    .cfi_offset %esi, -12
; X86-SSE1-NEXT:    .cfi_offset %edi, -8
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE1-NEXT:    movl 8(%ecx), %edx
; X86-SSE1-NEXT:    movl 12(%ecx), %esi
; X86-SSE1-NEXT:    leal 1(%esi), %edi
; X86-SSE1-NEXT:    movl %edi, 12(%ecx)
; X86-SSE1-NEXT:    movl 20(%ecx), %ecx
; X86-SSE1-NEXT:    movl %esi, 4(%eax)
; X86-SSE1-NEXT:    movl %edx, (%eax)
; X86-SSE1-NEXT:    movl %ecx, 12(%eax)
; X86-SSE1-NEXT:    popl %esi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE1-NEXT:    popl %edi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
; X86-SSE1-NEXT:    retl $4
;
; X86-SSE41-LABEL: merge_4i32_i32_23u5_inc3:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT:    movups 8(%eax), %xmm0
; X86-SSE41-NEXT:    incl 12(%eax)
; X86-SSE41-NEXT:    retl
  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 2
  %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 3
  %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 5
  %val0 = load i32, i32* %ptr0
  %val1 = load i32, i32* %ptr1
  %inc = add i32 %val1, 1
  store i32 %inc, i32* %ptr1
  %val3 = load i32, i32* %ptr3
  %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
  %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
  %res3 = insertelement <4 x i32> %res1, i32 %val3, i32 3
  ret <4 x i32> %res3
}

; Lane 0 is ptr[3], lane 1 is 0, lanes 2 and 3 are left undef.
define <4 x i32> @merge_4i32_i32_3zuu(i32* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4i32_i32_3zuu:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_4i32_i32_3zuu:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    retq
;
; X86-SSE1-LABEL: merge_4i32_i32_3zuu:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE1-NEXT:    movl 12(%ecx), %ecx
; X86-SSE1-NEXT:    movl %ecx, (%eax)
; X86-SSE1-NEXT:    movl $0, 4(%eax)
; X86-SSE1-NEXT:    retl $4
;
; X86-SSE41-LABEL: merge_4i32_i32_3zuu:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE41-NEXT:    retl
  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 3
  %val0 = load i32, i32* %ptr0
  %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
  %res1 = insertelement <4 x i32> %res0, i32 0, i32 1
  ret <4 x i32> %res1
}

; Lanes 0 and 1 are ptr[3] and ptr[4]; the upper two lanes are left undef.
define <4 x i32> @merge_4i32_i32_34uu(i32* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4i32_i32_34uu:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_4i32_i32_34uu:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    retq
;
; X86-SSE1-LABEL: merge_4i32_i32_34uu:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE1-NEXT:    movl 12(%ecx), %edx
; X86-SSE1-NEXT:    movl 16(%ecx), %ecx
; X86-SSE1-NEXT:    movl %ecx, 4(%eax)
; X86-SSE1-NEXT:    movl %edx, (%eax)
; X86-SSE1-NEXT:    retl $4
;
; X86-SSE41-LABEL: merge_4i32_i32_34uu:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X86-SSE41-NEXT:    retl
  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 3
  %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 4
  %val0 = load i32, i32* %ptr0
  %val1 = load i32, i32* %ptr1
  %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
  %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
  ret <4 x i32> %res1
}

; Lanes 0 and 1 are ptr[4] and ptr[5]; the upper two lanes stay zero.
define <4 x i32> @merge_4i32_i32_45zz(i32* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4i32_i32_45zz:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_4i32_i32_45zz:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    retq
;
; X86-SSE1-LABEL: merge_4i32_i32_45zz:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE1-NEXT:    movl 16(%ecx), %edx
; X86-SSE1-NEXT:    movl 20(%ecx), %ecx
; X86-SSE1-NEXT:    movl %ecx, 4(%eax)
; X86-SSE1-NEXT:    movl %edx, (%eax)
; X86-SSE1-NEXT:    movl $0, 12(%eax)
; X86-SSE1-NEXT:    movl $0, 8(%eax)
; X86-SSE1-NEXT:    retl $4
;
; X86-SSE41-LABEL: merge_4i32_i32_45zz:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X86-SSE41-NEXT:    retl
  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 4
  %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 5
  %val0 = load i32, i32* %ptr0
  %val1 = load i32, i32* %ptr1
  %res0 = insertelement <4 x i32> zeroinitializer, i32 %val0, i32 0
  %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
  ret <4 x i32> %res1
}

; As merge_4i32_i32_45zz, but ptr[4] is incremented and stored back between the
; loads; the vector uses the value read before the store.
define <4 x i32> @merge_4i32_i32_45zz_inc4(i32* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4i32_i32_45zz_inc4:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    incl 16(%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_4i32_i32_45zz_inc4:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    incl 16(%rdi)
; AVX-NEXT:    retq
;
; X86-SSE1-LABEL: merge_4i32_i32_45zz_inc4:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    pushl %edi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE1-NEXT:    pushl %esi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
; X86-SSE1-NEXT:    .cfi_offset %esi, -12
; X86-SSE1-NEXT:    .cfi_offset %edi, -8
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE1-NEXT:    movl 16(%ecx), %edx
; X86-SSE1-NEXT:    movl 20(%ecx), %esi
; X86-SSE1-NEXT:    leal 1(%edx), %edi
; X86-SSE1-NEXT:    movl %edi, 16(%ecx)
; X86-SSE1-NEXT:    movl %esi, 4(%eax)
; X86-SSE1-NEXT:    movl %edx, (%eax)
; X86-SSE1-NEXT:    movl $0, 12(%eax)
; X86-SSE1-NEXT:    movl $0, 8(%eax)
; X86-SSE1-NEXT:    popl %esi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE1-NEXT:    popl %edi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
; X86-SSE1-NEXT:    retl $4
;
; X86-SSE41-LABEL: merge_4i32_i32_45zz_inc4:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X86-SSE41-NEXT:    incl 16(%eax)
; X86-SSE41-NEXT:    retl
  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 4
  %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 5
  %val0 = load i32, i32* %ptr0
  %inc = add i32 %val0, 1
  store i32 %inc, i32* %ptr0
  %val1 = load i32, i32* %ptr1
  %res0 = insertelement <4 x i32> zeroinitializer, i32 %val0, i32 0
  %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
  ret <4 x i32> %res1
}

; As merge_4i32_i32_45zz, but ptr[5] is incremented and stored back after both
; loads; the vector uses the value read before the store.
define <4 x i32> @merge_4i32_i32_45zz_inc5(i32* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4i32_i32_45zz_inc5:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    incl 20(%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_4i32_i32_45zz_inc5:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    incl 20(%rdi)
; AVX-NEXT:    retq
;
; X86-SSE1-LABEL: merge_4i32_i32_45zz_inc5:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    pushl %edi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE1-NEXT:    pushl %esi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
; X86-SSE1-NEXT:    .cfi_offset %esi, -12
; X86-SSE1-NEXT:    .cfi_offset %edi, -8
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE1-NEXT:    movl 16(%ecx), %edx
; X86-SSE1-NEXT:    movl 20(%ecx), %esi
; X86-SSE1-NEXT:    leal 1(%esi), %edi
; X86-SSE1-NEXT:    movl %edi, 20(%ecx)
; X86-SSE1-NEXT:    movl %esi, 4(%eax)
; X86-SSE1-NEXT:    movl %edx, (%eax)
; X86-SSE1-NEXT:    movl $0, 12(%eax)
; X86-SSE1-NEXT:    movl $0, 8(%eax)
; X86-SSE1-NEXT:    popl %esi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE1-NEXT:    popl %edi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
; X86-SSE1-NEXT:    retl $4
;
; X86-SSE41-LABEL: merge_4i32_i32_45zz_inc5:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X86-SSE41-NEXT:    incl 20(%eax)
; X86-SSE41-NEXT:    retl
  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 4
  %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 5
  %val0 = load i32, i32* %ptr0
  %val1 = load i32, i32* %ptr1
  %inc = add i32 %val1, 1
  store i32 %inc, i32* %ptr1
  %res0 = insertelement <4 x i32> zeroinitializer, i32 %val0, i32 0
  %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
  ret <4 x i32> %res1
}

; Lanes 0, 1, 3, 4, 5 and 7 are ptr[2], ptr[3], ptr[5], ptr[6], ptr[7] and
; ptr[9]; lanes 2 and 6 are never written.
define <8 x i16> @merge_8i16_i16_23u567u9(i16* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_8i16_i16_23u567u9:
; SSE:       # %bb.0:
; SSE-NEXT:    movups 4(%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_8i16_i16_23u567u9:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups 4(%rdi), %xmm0
; AVX-NEXT:    retq
;
; X86-SSE1-LABEL: merge_8i16_i16_23u567u9:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    pushl %edi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE1-NEXT:    pushl %esi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
; X86-SSE1-NEXT:    .cfi_offset %esi, -12
; X86-SSE1-NEXT:    .cfi_offset %edi, -8
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE1-NEXT:    movl 4(%ecx), %edx
; X86-SSE1-NEXT:    movl 10(%ecx), %esi
; X86-SSE1-NEXT:    movzwl 14(%ecx), %edi
; X86-SSE1-NEXT:    movzwl 18(%ecx), %ecx
; X86-SSE1-NEXT:    movw %di, 10(%eax)
; X86-SSE1-NEXT:    movw %cx, 14(%eax)
; X86-SSE1-NEXT:    movl %esi, 6(%eax)
; X86-SSE1-NEXT:    movl %edx, (%eax)
; X86-SSE1-NEXT:    popl %esi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE1-NEXT:    popl %edi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
; X86-SSE1-NEXT:    retl $4
;
; X86-SSE41-LABEL: merge_8i16_i16_23u567u9:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT:    movups 4(%eax), %xmm0
; X86-SSE41-NEXT:    retl
  %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 2
  %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 3
  %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 5
  %ptr4 = getelementptr inbounds i16, i16* %ptr, i64 6
  %ptr5 = getelementptr inbounds i16, i16* %ptr, i64 7
  %ptr7 = getelementptr inbounds i16, i16* %ptr, i64 9
  %val0 = load i16, i16* %ptr0
  %val1 = load i16, i16* %ptr1
  %val3 = load i16, i16* %ptr3
  %val4 = load i16, i16* %ptr4
  %val5 = load i16, i16* %ptr5
  %val7 = load i16, i16* %ptr7
  %res0 = insertelement <8 x i16> undef, i16 %val0, i32 0
  %res1 = insertelement <8 x i16> %res0, i16 %val1, i32 1
  %res3 = insertelement <8 x i16> %res1, i16 %val3, i32 3
  %res4 = insertelement <8 x i16> %res3, i16 %val4, i32 4
  %res5 = insertelement <8 x i16> %res4, i16 %val5, i32 5
  %res7 = insertelement <8 x i16> %res5, i16 %val7, i32 7
  ret <8 x i16> %res7
}

; Lanes 0 and 1 are ptr[3] and ptr[4]; the remaining six lanes are left undef.
define <8 x i16> @merge_8i16_i16_34uuuuuu(i16* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_8i16_i16_34uuuuuu:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_8i16_i16_34uuuuuu:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    retq
;
; X86-SSE1-LABEL: merge_8i16_i16_34uuuuuu:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE1-NEXT:    movl 6(%ecx), %ecx
; X86-SSE1-NEXT:    movl %ecx, (%eax)
; X86-SSE1-NEXT:    retl $4
;
; X86-SSE41-LABEL: merge_8i16_i16_34uuuuuu:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE41-NEXT:    retl
  %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 3
  %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 4
  %val0 = load i16, i16* %ptr0
  %val1 = load i16, i16* %ptr1
  %res0 = insertelement <8 x i16> undef, i16 %val0, i32 0
  %res1 = insertelement <8 x i16> %res0, i16 %val1, i32 1
  ret <8 x i16> %res1
}

; Lanes 0, 1 and 3 are ptr[4], ptr[5] and ptr[7]; lane 2 is never written and
; lanes 4-7 are explicitly set to zero.
define <8 x i16> @merge_8i16_i16_45u7zzzz(i16* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_8i16_i16_45u7zzzz:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_8i16_i16_45u7zzzz:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    retq
;
; X86-SSE1-LABEL: merge_8i16_i16_45u7zzzz:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE1-NEXT:    movl 8(%ecx), %edx
; X86-SSE1-NEXT:    movzwl 14(%ecx), %ecx
; X86-SSE1-NEXT:    movw %cx, 6(%eax)
; X86-SSE1-NEXT:    movl %edx, (%eax)
; X86-SSE1-NEXT:    movl $0, 12(%eax)
; X86-SSE1-NEXT:    movl $0, 8(%eax)
; X86-SSE1-NEXT:    retl $4
;
; X86-SSE41-LABEL: merge_8i16_i16_45u7zzzz:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X86-SSE41-NEXT:    retl
  %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 4
  %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 5
  %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 7
  %val0 = load i16, i16* %ptr0
  %val1 = load i16, i16* %ptr1
  %val3 = load i16, i16* %ptr3
  %res0 = insertelement <8 x i16> undef, i16 %val0, i32 0
  %res1 = insertelement <8 x i16> %res0, i16 %val1, i32 1
  %res3 = insertelement <8 x i16> %res1, i16 %val3, i32 3
  %res4 = insertelement <8 x i16> %res3, i16 0, i32 4
  %res5 = insertelement <8 x i16> %res4, i16 0, i32 5
  %res6 = insertelement <8 x i16> %res5, i16 0, i32 6
  %res7 = insertelement <8 x i16> %res6, i16 0, i32 7
  ret <8 x i16> %res7
}

; Every lane i is ptr[i] except lanes 2 and 14, which are never written.
define <16 x i8> @merge_16i8_i8_01u3456789ABCDuF(i8* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_16i8_i8_01u3456789ABCDuF:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_16i8_i8_01u3456789ABCDuF:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %xmm0
; AVX-NEXT:    retq
;
; X86-SSE1-LABEL: merge_16i8_i8_01u3456789ABCDuF:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    pushl %ebp
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE1-NEXT:    pushl %ebx
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
; X86-SSE1-NEXT:    pushl %edi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 16
; X86-SSE1-NEXT:    pushl %esi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 20
; X86-SSE1-NEXT:    .cfi_offset %esi, -20
; X86-SSE1-NEXT:    .cfi_offset %edi, -16
; X86-SSE1-NEXT:    .cfi_offset %ebx, -12
; X86-SSE1-NEXT:    .cfi_offset %ebp, -8
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE1-NEXT:    movzwl (%ecx), %ebp
; X86-SSE1-NEXT:    movl 3(%ecx), %esi
; X86-SSE1-NEXT:    movl 7(%ecx), %edi
; X86-SSE1-NEXT:    movzwl 11(%ecx), %ebx
; X86-SSE1-NEXT:    movb 13(%ecx), %dl
; X86-SSE1-NEXT:    movb 15(%ecx), %cl
; X86-SSE1-NEXT:    movb %dl, 13(%eax)
; X86-SSE1-NEXT:    movb %cl, 15(%eax)
; X86-SSE1-NEXT:    movw %bx, 11(%eax)
; X86-SSE1-NEXT:    movl %edi, 7(%eax)
; X86-SSE1-NEXT:    movl %esi, 3(%eax)
; X86-SSE1-NEXT:    movw %bp, (%eax)
; X86-SSE1-NEXT:    popl %esi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 16
; X86-SSE1-NEXT:    popl %edi
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
; X86-SSE1-NEXT:    popl %ebx
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE1-NEXT:    popl %ebp
; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
; X86-SSE1-NEXT:    retl $4
;
; X86-SSE41-LABEL: merge_16i8_i8_01u3456789ABCDuF:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT:    movups (%eax), %xmm0
; X86-SSE41-NEXT:    retl
  %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 0
  %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 1
  %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 3
  %ptr4 = getelementptr inbounds i8, i8* %ptr, i64 4
  %ptr5 = getelementptr inbounds i8, i8* %ptr, i64 5
  %ptr6 = getelementptr inbounds i8, i8* %ptr, i64 6
  %ptr7 = getelementptr inbounds i8, i8* %ptr, i64 7
  %ptr8 = getelementptr inbounds i8, i8* %ptr, i64 8
  %ptr9 = getelementptr inbounds i8, i8* %ptr, i64 9
  %ptrA = getelementptr inbounds i8, i8* %ptr, i64 10
  %ptrB = getelementptr inbounds i8, i8* %ptr, i64 11
  %ptrC = getelementptr inbounds i8, i8* %ptr, i64 12
  %ptrD = getelementptr inbounds i8, i8* %ptr, i64 13
  %ptrF = getelementptr inbounds i8, i8* %ptr, i64 15
  %val0 = load i8, i8* %ptr0
  %val1 = load i8, i8* %ptr1
  %val3 = load i8, i8* %ptr3
  %val4 = load i8, i8* %ptr4
  %val5 = load i8, i8* %ptr5
  %val6 = load i8, i8* %ptr6
  %val7 = load i8, i8* %ptr7
  %val8 = load i8, i8* %ptr8
  %val9 = load i8, i8* %ptr9
  %valA = load i8, i8* %ptrA
  %valB = load i8, i8* %ptrB
  %valC = load i8, i8* %ptrC
  %valD = load i8, i8* %ptrD
  %valF = load i8, i8* %ptrF
  %res0 = insertelement <16 x i8> undef, i8 %val0, i32 0
  %res1 = insertelement <16 x i8> %res0, i8 %val1, i32 1
  %res3 = insertelement <16 x i8> %res1, i8 %val3, i32 3
  %res4 = insertelement <16 x i8> %res3, i8 %val4, i32 4
  %res5 = insertelement <16 x i8> %res4, i8 %val5, i32 5
  %res6 = insertelement <16 x i8> %res5, i8 %val6, i32 6
  %res7 = insertelement <16 x i8> %res6, i8 %val7, i32 7
  %res8 = insertelement <16 x i8> %res7, i8 %val8, i32 8
  %res9 = insertelement <16 x i8> %res8, i8 %val9, i32 9
  %resA = insertelement <16 x i8> %res9, i8 %valA, i32 10
  %resB = insertelement <16 x i8> %resA, i8 %valB, i32 11
  %resC = insertelement <16 x i8> %resB, i8 %valC, i32 12
  %resD = insertelement <16 x i8> %resC, i8 %valD, i32 13
  %resF = insertelement <16 x i8> %resD, i8 %valF, i32 15
  ret <16 x i8> %resF
}

define <16 x i8> @merge_16i8_i8_01u3uuzzuuuuuzzz(i8* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    retq
;
; X86-SSE1-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE1-NEXT:    movzwl (%ecx), %edx
; X86-SSE1-NEXT:    movb 3(%ecx), %cl
; X86-SSE1-NEXT:    movb %cl, 3(%eax)
; X86-SSE1-NEXT:    movw %dx, (%eax)
; X86-SSE1-NEXT:    movb $0, 15(%eax)
; X86-SSE1-NEXT:    movw $0, 13(%eax)
; X86-SSE1-NEXT:    movw $0, 6(%eax)
; X86-SSE1-NEXT:    retl $4
;
; X86-SSE41-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE41-NEXT:    retl
  %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 0
  %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 1
  %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 3
  %val0 = load i8, i8* %ptr0
  %val1 = load i8, i8* %ptr1
  %val3 = load i8, i8* %ptr3
  %res0 = insertelement <16 x i8> undef, i8 %val0, i32 0
  %res1 = insertelement <16 x i8> %res0, i8 %val1, i32 1
  %res3 = insertelement <16 x i8> %res1, i8 %val3, i32 3
  %res6 =
insertelement <16 x i8> %res3, i8 0, i32 6 1005 %res7 = insertelement <16 x i8> %res6, i8 0, i32 7 1006 %resD = insertelement <16 x i8> %res7, i8 0, i32 13 1007 %resE = insertelement <16 x i8> %resD, i8 0, i32 14 1008 %resF = insertelement <16 x i8> %resE, i8 0, i32 15 1009 ret <16 x i8> %resF 1010} 1011 1012define <16 x i8> @merge_16i8_i8_0123uu67uuuuuzzz(i8* %ptr) nounwind uwtable noinline ssp { 1013; SSE-LABEL: merge_16i8_i8_0123uu67uuuuuzzz: 1014; SSE: # %bb.0: 1015; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero 1016; SSE-NEXT: retq 1017; 1018; AVX-LABEL: merge_16i8_i8_0123uu67uuuuuzzz: 1019; AVX: # %bb.0: 1020; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero 1021; AVX-NEXT: retq 1022; 1023; X86-SSE1-LABEL: merge_16i8_i8_0123uu67uuuuuzzz: 1024; X86-SSE1: # %bb.0: 1025; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax 1026; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx 1027; X86-SSE1-NEXT: movl (%ecx), %edx 1028; X86-SSE1-NEXT: movzwl 6(%ecx), %ecx 1029; X86-SSE1-NEXT: movw %cx, 6(%eax) 1030; X86-SSE1-NEXT: movl %edx, (%eax) 1031; X86-SSE1-NEXT: movb $0, 15(%eax) 1032; X86-SSE1-NEXT: movw $0, 13(%eax) 1033; X86-SSE1-NEXT: retl $4 1034; 1035; X86-SSE41-LABEL: merge_16i8_i8_0123uu67uuuuuzzz: 1036; X86-SSE41: # %bb.0: 1037; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax 1038; X86-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero 1039; X86-SSE41-NEXT: retl 1040 %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 0 1041 %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 1 1042 %ptr2 = getelementptr inbounds i8, i8* %ptr, i64 2 1043 %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 3 1044 %ptr6 = getelementptr inbounds i8, i8* %ptr, i64 6 1045 %ptr7 = getelementptr inbounds i8, i8* %ptr, i64 7 1046 %val0 = load i8, i8* %ptr0 1047 %val1 = load i8, i8* %ptr1 1048 %val2 = load i8, i8* %ptr2 1049 %val3 = load i8, i8* %ptr3 1050 %val6 = load i8, i8* %ptr6 1051 %val7 = load i8, i8* %ptr7 1052 %res0 = insertelement <16 x i8> undef, i8 %val0, i32 0 1053 %res1 = insertelement <16 x i8> %res0, i8 %val1, 
i32 1 1054 %res2 = insertelement <16 x i8> %res1, i8 %val2, i32 2 1055 %res3 = insertelement <16 x i8> %res2, i8 %val3, i32 3 1056 %res6 = insertelement <16 x i8> %res3, i8 %val6, i32 6 1057 %res7 = insertelement <16 x i8> %res6, i8 %val7, i32 7 1058 %resD = insertelement <16 x i8> %res7, i8 0, i32 13 1059 %resE = insertelement <16 x i8> %resD, i8 0, i32 14 1060 %resF = insertelement <16 x i8> %resE, i8 0, i32 15 1061 ret <16 x i8> %resF 1062} 1063 1064define void @merge_4i32_i32_combine(<4 x i32>* %dst, i32* %src) { 1065; SSE-LABEL: merge_4i32_i32_combine: 1066; SSE: # %bb.0: 1067; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1068; SSE-NEXT: movaps %xmm0, (%rdi) 1069; SSE-NEXT: retq 1070; 1071; AVX-LABEL: merge_4i32_i32_combine: 1072; AVX: # %bb.0: 1073; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1074; AVX-NEXT: vmovaps %xmm0, (%rdi) 1075; AVX-NEXT: retq 1076; 1077; X86-SSE1-LABEL: merge_4i32_i32_combine: 1078; X86-SSE1: # %bb.0: 1079; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax 1080; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx 1081; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1082; X86-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero 1083; X86-SSE1-NEXT: andps %xmm0, %xmm1 1084; X86-SSE1-NEXT: movaps %xmm1, (%eax) 1085; X86-SSE1-NEXT: retl 1086; 1087; X86-SSE41-LABEL: merge_4i32_i32_combine: 1088; X86-SSE41: # %bb.0: 1089; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax 1090; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx 1091; X86-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1092; X86-SSE41-NEXT: movaps %xmm0, (%eax) 1093; X86-SSE41-NEXT: retl 1094 %1 = getelementptr i32, i32* %src, i32 0 1095 %2 = load i32, i32* %1 1096 %3 = insertelement <4 x i32> undef, i32 %2, i32 0 1097 %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer 1098 %5 = lshr <4 x i32> %4, <i32 0, i32 undef, i32 undef, i32 undef> 1099 %6 = and <4 x i32> %5, <i32 -1, i32 0, i32 0, i32 0> 1100 store <4 x i32> %6, <4 x i32>* %dst 
1101 ret void 1102} 1103 1104; 1105; consecutive loads including any/all volatiles may not be combined 1106; 1107 1108define <2 x i64> @merge_2i64_i64_12_volatile(i64* %ptr) nounwind uwtable noinline ssp { 1109; SSE-LABEL: merge_2i64_i64_12_volatile: 1110; SSE: # %bb.0: 1111; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero 1112; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero 1113; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1114; SSE-NEXT: retq 1115; 1116; AVX-LABEL: merge_2i64_i64_12_volatile: 1117; AVX: # %bb.0: 1118; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero 1119; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero 1120; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1121; AVX-NEXT: retq 1122; 1123; X86-SSE1-LABEL: merge_2i64_i64_12_volatile: 1124; X86-SSE1: # %bb.0: 1125; X86-SSE1-NEXT: pushl %edi 1126; X86-SSE1-NEXT: .cfi_def_cfa_offset 8 1127; X86-SSE1-NEXT: pushl %esi 1128; X86-SSE1-NEXT: .cfi_def_cfa_offset 12 1129; X86-SSE1-NEXT: .cfi_offset %esi, -12 1130; X86-SSE1-NEXT: .cfi_offset %edi, -8 1131; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax 1132; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx 1133; X86-SSE1-NEXT: movl 8(%ecx), %edx 1134; X86-SSE1-NEXT: movl 12(%ecx), %esi 1135; X86-SSE1-NEXT: movl 16(%ecx), %edi 1136; X86-SSE1-NEXT: movl 20(%ecx), %ecx 1137; X86-SSE1-NEXT: movl %ecx, 12(%eax) 1138; X86-SSE1-NEXT: movl %edi, 8(%eax) 1139; X86-SSE1-NEXT: movl %esi, 4(%eax) 1140; X86-SSE1-NEXT: movl %edx, (%eax) 1141; X86-SSE1-NEXT: popl %esi 1142; X86-SSE1-NEXT: .cfi_def_cfa_offset 8 1143; X86-SSE1-NEXT: popl %edi 1144; X86-SSE1-NEXT: .cfi_def_cfa_offset 4 1145; X86-SSE1-NEXT: retl $4 1146; 1147; X86-SSE41-LABEL: merge_2i64_i64_12_volatile: 1148; X86-SSE41: # %bb.0: 1149; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax 1150; X86-SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1151; X86-SSE41-NEXT: pinsrd $1, 12(%eax), %xmm0 1152; X86-SSE41-NEXT: pinsrd $2, 16(%eax), %xmm0 1153; X86-SSE41-NEXT: pinsrd $3, 20(%eax), %xmm0 1154; X86-SSE41-NEXT: retl 1155 %ptr0 = 
getelementptr inbounds i64, i64* %ptr, i64 1 1156 %ptr1 = getelementptr inbounds i64, i64* %ptr, i64 2 1157 %val0 = load volatile i64, i64* %ptr0 1158 %val1 = load volatile i64, i64* %ptr1 1159 %res0 = insertelement <2 x i64> undef, i64 %val0, i32 0 1160 %res1 = insertelement <2 x i64> %res0, i64 %val1, i32 1 1161 ret <2 x i64> %res1 1162} 1163 1164define <4 x float> @merge_4f32_f32_2345_volatile(float* %ptr) nounwind uwtable noinline ssp { 1165; SSE2-LABEL: merge_4f32_f32_2345_volatile: 1166; SSE2: # %bb.0: 1167; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1168; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero 1169; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1170; SSE2-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] 1171; SSE2-NEXT: retq 1172; 1173; SSE41-LABEL: merge_4f32_f32_2345_volatile: 1174; SSE41: # %bb.0: 1175; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1176; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] 1177; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] 1178; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] 1179; SSE41-NEXT: retq 1180; 1181; AVX-LABEL: merge_4f32_f32_2345_volatile: 1182; AVX: # %bb.0: 1183; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1184; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] 1185; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] 1186; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] 1187; AVX-NEXT: retq 1188; 1189; X86-SSE1-LABEL: merge_4f32_f32_2345_volatile: 1190; X86-SSE1: # %bb.0: 1191; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax 1192; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1193; X86-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero 1194; X86-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1195; X86-SSE1-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] 1196; X86-SSE1-NEXT: retl 1197; 1198; X86-SSE41-LABEL: merge_4f32_f32_2345_volatile: 1199; 
X86-SSE41: # %bb.0: 1200; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax 1201; X86-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1202; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] 1203; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] 1204; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] 1205; X86-SSE41-NEXT: retl 1206 %ptr0 = getelementptr inbounds float, float* %ptr, i64 2 1207 %ptr1 = getelementptr inbounds float, float* %ptr, i64 3 1208 %ptr2 = getelementptr inbounds float, float* %ptr, i64 4 1209 %ptr3 = getelementptr inbounds float, float* %ptr, i64 5 1210 %val0 = load volatile float, float* %ptr0 1211 %val1 = load float, float* %ptr1 1212 %val2 = load float, float* %ptr2 1213 %val3 = load float, float* %ptr3 1214 %res0 = insertelement <4 x float> undef, float %val0, i32 0 1215 %res1 = insertelement <4 x float> %res0, float %val1, i32 1 1216 %res2 = insertelement <4 x float> %res1, float %val2, i32 2 1217 %res3 = insertelement <4 x float> %res2, float %val3, i32 3 1218 ret <4 x float> %res3 1219} 1220 1221; 1222; Non-consecutive test. 
;

; Assembles the vector <*ptr0, 0.0, *ptr1, *ptr1> from two scalar float loads
; taken through two separate pointer arguments. The checks for every run line
; expect both scalar loads to remain, combined by a single shuffle.
define <4 x float> @merge_4f32_f32_X0YY(float* %ptr0, float* %ptr1) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4f32_f32_X0YY:
; SSE: # %bb.0:
; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; SSE-NEXT: retq
;
; AVX-LABEL: merge_4f32_f32_X0YY:
; AVX: # %bb.0:
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0]
; AVX-NEXT: retq
;
; X86-SSE-LABEL: merge_4f32_f32_X0YY:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; X86-SSE-NEXT: retl
  ; X: the element read through %ptr0; Y: the element read through %ptr1.
  %x = load float, float* %ptr0, align 4
  %y = load float, float* %ptr1, align 4
  ; Lane layout: 0 = X, 1 = constant zero, 2 = Y, 3 = Y.
  %lane0 = insertelement <4 x float> undef, float %x, i32 0
  %lane1 = insertelement <4 x float> %lane0, float 0.000000e+00, i32 1
  %lane2 = insertelement <4 x float> %lane1, float %y, i32 2
  %lane3 = insertelement <4 x float> %lane2, float %y, i32 3
  ret <4 x float> %lane3
}

;
; Extension tests.
;

; PR31309
; Loads one i32, zero-extends it to i128 and reinterprets the result as
; <4 x i32>. Every run except the i686 +sse one is checked for a single
; zero-extending scalar vector load; the i686 +sse run is checked for the
; loaded dword plus three zero dwords stored through the result pointer.
define <4 x i32> @load_i32_zext_i128_v4i32(i32* %ptr) {
; SSE-LABEL: load_i32_zext_i128_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: retq
;
; AVX-LABEL: load_i32_zext_i128_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: retq
;
; X86-SSE1-LABEL: load_i32_zext_i128_v4i32:
; X86-SSE1: # %bb.0:
; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE1-NEXT: movl (%ecx), %ecx
; X86-SSE1-NEXT: movl %ecx, (%eax)
; X86-SSE1-NEXT: movl $0, 12(%eax)
; X86-SSE1-NEXT: movl $0, 8(%eax)
; X86-SSE1-NEXT: movl $0, 4(%eax)
; X86-SSE1-NEXT: retl $4
;
; X86-SSE41-LABEL: load_i32_zext_i128_v4i32:
; X86-SSE41: # %bb.0:
; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE41-NEXT: retl
  %word = load i32, i32* %ptr
  ; Widening with zext makes the upper 96 bits zero.
  %wide = zext i32 %word to i128
  %lanes = bitcast i128 %wide to <4 x i32>
  ret <4 x i32> %lanes
}