; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86,X86-SSE2
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefixes=X86,X86-SSE4A
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64-SSE,X64-SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefixes=X64-SSE,X64-SSE4A
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=X64-SSE,X64-SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X64-AVX,X64-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64-AVX,X64-AVX2

;
; PR42123
;

define void @merge_2_v4f32_align32(<4 x float>* %a0, <4 x float>* %a1) nounwind {
; X86-LABEL: merge_2_v4f32_align32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movaps (%ecx), %xmm0
; X86-NEXT:    movaps 16(%ecx), %xmm1
; X86-NEXT:    movntps %xmm0, (%eax)
; X86-NEXT:    movntps %xmm1, 16(%eax)
; X86-NEXT:    retl
;
; X64-SSE2-LABEL: merge_2_v4f32_align32:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    movaps (%rdi), %xmm0
; X64-SSE2-NEXT:    movaps 16(%rdi), %xmm1
; X64-SSE2-NEXT:    movntps %xmm0, (%rsi)
; X64-SSE2-NEXT:    movntps %xmm1, 16(%rsi)
; X64-SSE2-NEXT:    retq
;
; X64-SSE4A-LABEL: merge_2_v4f32_align32:
; X64-SSE4A:       # %bb.0:
; X64-SSE4A-NEXT:    movaps (%rdi), %xmm0
; X64-SSE4A-NEXT:    movaps 16(%rdi), %xmm1
; X64-SSE4A-NEXT:    movntps %xmm0, (%rsi)
; X64-SSE4A-NEXT:    movntps %xmm1, 16(%rsi)
; X64-SSE4A-NEXT:    retq
;
; X64-SSE41-LABEL: merge_2_v4f32_align32:
; X64-SSE41:       # %bb.0:
; X64-SSE41-NEXT:    movntdqa (%rdi), %xmm0
; X64-SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; X64-SSE41-NEXT:    movntdq %xmm0, (%rsi)
; X64-SSE41-NEXT:    movntdq %xmm1, 16(%rsi)
; X64-SSE41-NEXT:    retq
;
; X64-AVX1-LABEL: merge_2_v4f32_align32:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm0
; X64-AVX1-NEXT:    vmovntdqa (%rdi), %xmm1
; X64-AVX1-NEXT:    vmovntdq %xmm1, (%rsi)
; X64-AVX1-NEXT:    vmovntdq %xmm0, 16(%rsi)
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: merge_2_v4f32_align32:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vmovntdqa (%rdi), %ymm0
; X64-AVX2-NEXT:    vmovntdq %ymm0, (%rsi)
; X64-AVX2-NEXT:    vzeroupper
; X64-AVX2-NEXT:    retq
  %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
  %2 = bitcast float* %1 to <4 x float>*
  %3 = load <4 x float>, <4 x float>* %a0, align 32, !nontemporal !0
  %4 = load <4 x float>, <4 x float>* %2, align 16, !nontemporal !0
  %5 = getelementptr inbounds <4 x float>, <4 x float>* %a1, i64 1, i64 0
  %6 = bitcast float* %5 to <4 x float>*
  store <4 x float> %3, <4 x float>* %a1, align 32, !nontemporal !0
  store <4 x float> %4, <4 x float>* %6, align 16, !nontemporal !0
  ret void
}

; Don't merge nt and non-nt loads even if aligned.
define void @merge_2_v4f32_align32_mix_ntload(<4 x float>* %a0, <4 x float>* %a1) nounwind {
; X86-LABEL: merge_2_v4f32_align32_mix_ntload:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movaps (%ecx), %xmm0
; X86-NEXT:    movaps 16(%ecx), %xmm1
; X86-NEXT:    movaps %xmm0, (%eax)
; X86-NEXT:    movaps %xmm1, 16(%eax)
; X86-NEXT:    retl
;
; X64-SSE2-LABEL: merge_2_v4f32_align32_mix_ntload:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    movaps (%rdi), %xmm0
; X64-SSE2-NEXT:    movaps 16(%rdi), %xmm1
; X64-SSE2-NEXT:    movaps %xmm0, (%rsi)
; X64-SSE2-NEXT:    movaps %xmm1, 16(%rsi)
; X64-SSE2-NEXT:    retq
;
; X64-SSE4A-LABEL: merge_2_v4f32_align32_mix_ntload:
; X64-SSE4A:       # %bb.0:
; X64-SSE4A-NEXT:    movaps (%rdi), %xmm0
; X64-SSE4A-NEXT:    movaps 16(%rdi), %xmm1
; X64-SSE4A-NEXT:    movaps %xmm0, (%rsi)
; X64-SSE4A-NEXT:    movaps %xmm1, 16(%rsi)
; X64-SSE4A-NEXT:    retq
;
; X64-SSE41-LABEL: merge_2_v4f32_align32_mix_ntload:
; X64-SSE41:       # %bb.0:
; X64-SSE41-NEXT:    movntdqa (%rdi), %xmm0
; X64-SSE41-NEXT:    movaps 16(%rdi), %xmm1
; X64-SSE41-NEXT:    movdqa %xmm0, (%rsi)
; X64-SSE41-NEXT:    movaps %xmm1, 16(%rsi)
; X64-SSE41-NEXT:    retq
;
; X64-AVX-LABEL: merge_2_v4f32_align32_mix_ntload:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovntdqa (%rdi), %xmm0
; X64-AVX-NEXT:    vmovaps 16(%rdi), %xmm1
; X64-AVX-NEXT:    vmovdqa %xmm0, (%rsi)
; X64-AVX-NEXT:    vmovaps %xmm1, 16(%rsi)
; X64-AVX-NEXT:    retq
  %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
  %2 = bitcast float* %1 to <4 x float>*
  %3 = load <4 x float>, <4 x float>* %a0, align 32, !nontemporal !0
  %4 = load <4 x float>, <4 x float>* %2, align 16
  %5 = getelementptr inbounds <4 x float>, <4 x float>* %a1, i64 1, i64 0
  %6 = bitcast float* %5 to <4 x float>*
  store <4 x float> %3, <4 x float>* %a1, align 32
  store <4 x float> %4, <4 x float>* %6, align 16
  ret void
}

; Don't merge nt and non-nt stores even if aligned.
define void @merge_2_v4f32_align32_mix_ntstore(<4 x float>* %a0, <4 x float>* %a1) nounwind {
; X86-LABEL: merge_2_v4f32_align32_mix_ntstore:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movaps (%ecx), %xmm0
; X86-NEXT:    movaps 16(%ecx), %xmm1
; X86-NEXT:    movntps %xmm0, (%eax)
; X86-NEXT:    movaps %xmm1, 16(%eax)
; X86-NEXT:    retl
;
; X64-SSE-LABEL: merge_2_v4f32_align32_mix_ntstore:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps (%rdi), %xmm0
; X64-SSE-NEXT:    movaps 16(%rdi), %xmm1
; X64-SSE-NEXT:    movntps %xmm0, (%rsi)
; X64-SSE-NEXT:    movaps %xmm1, 16(%rsi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: merge_2_v4f32_align32_mix_ntstore:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %xmm0
; X64-AVX-NEXT:    vmovaps 16(%rdi), %xmm1
; X64-AVX-NEXT:    vmovntps %xmm0, (%rsi)
; X64-AVX-NEXT:    vmovaps %xmm1, 16(%rsi)
; X64-AVX-NEXT:    retq
  %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
  %2 = bitcast float* %1 to <4 x float>*
  %3 = load <4 x float>, <4 x float>* %a0, align 32
  %4 = load <4 x float>, <4 x float>* %2, align 16
  %5 = getelementptr inbounds <4 x float>, <4 x float>* %a1, i64 1, i64 0
  %6 = bitcast float* %5 to <4 x float>*
  store <4 x float> %3, <4 x float>* %a1, align 32, !nontemporal !0
  store <4 x float> %4, <4 x float>* %6, align 16
  ret void
}

; AVX2 can't perform NT-load-ymm on 16-byte aligned memory.
; Must be kept separate as VMOVNTDQA xmm.
define void @merge_2_v4f32_align16_ntload(<4 x float>* %a0, <4 x float>* %a1) nounwind {
; X86-LABEL: merge_2_v4f32_align16_ntload:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movaps (%ecx), %xmm0
; X86-NEXT:    movaps 16(%ecx), %xmm1
; X86-NEXT:    movaps %xmm0, (%eax)
; X86-NEXT:    movaps %xmm1, 16(%eax)
; X86-NEXT:    retl
;
; X64-SSE2-LABEL: merge_2_v4f32_align16_ntload:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    movaps (%rdi), %xmm0
; X64-SSE2-NEXT:    movaps 16(%rdi), %xmm1
; X64-SSE2-NEXT:    movaps %xmm0, (%rsi)
; X64-SSE2-NEXT:    movaps %xmm1, 16(%rsi)
; X64-SSE2-NEXT:    retq
;
; X64-SSE4A-LABEL: merge_2_v4f32_align16_ntload:
; X64-SSE4A:       # %bb.0:
; X64-SSE4A-NEXT:    movaps (%rdi), %xmm0
; X64-SSE4A-NEXT:    movaps 16(%rdi), %xmm1
; X64-SSE4A-NEXT:    movaps %xmm0, (%rsi)
; X64-SSE4A-NEXT:    movaps %xmm1, 16(%rsi)
; X64-SSE4A-NEXT:    retq
;
; X64-SSE41-LABEL: merge_2_v4f32_align16_ntload:
; X64-SSE41:       # %bb.0:
; X64-SSE41-NEXT:    movntdqa (%rdi), %xmm0
; X64-SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; X64-SSE41-NEXT:    movdqa %xmm0, (%rsi)
; X64-SSE41-NEXT:    movdqa %xmm1, 16(%rsi)
; X64-SSE41-NEXT:    retq
;
; X64-AVX-LABEL: merge_2_v4f32_align16_ntload:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovntdqa (%rdi), %xmm0
; X64-AVX-NEXT:    vmovntdqa 16(%rdi), %xmm1
; X64-AVX-NEXT:    vmovdqa %xmm0, (%rsi)
; X64-AVX-NEXT:    vmovdqa %xmm1, 16(%rsi)
; X64-AVX-NEXT:    retq
  %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
  %2 = bitcast float* %1 to <4 x float>*
  %3 = load <4 x float>, <4 x float>* %a0, align 16, !nontemporal !0
  %4 = load <4 x float>, <4 x float>* %2, align 16, !nontemporal !0
  %5 = getelementptr inbounds <4 x float>, <4 x float>* %a1, i64 1, i64 0
  %6 = bitcast float* %5 to <4 x float>*
  store <4 x float> %3, <4 x float>* %a1, align 16
  store <4 x float> %4, <4 x float>* %6, align 16
  ret void
}

; AVX can't perform NT-store-ymm on 16-byte aligned memory.
; Must be kept separate as VMOVNTPS xmm.
define void @merge_2_v4f32_align16_ntstore(<4 x float>* %a0, <4 x float>* %a1) nounwind {
; X86-LABEL: merge_2_v4f32_align16_ntstore:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movaps (%ecx), %xmm0
; X86-NEXT:    movaps 16(%ecx), %xmm1
; X86-NEXT:    movntps %xmm0, (%eax)
; X86-NEXT:    movntps %xmm1, 16(%eax)
; X86-NEXT:    retl
;
; X64-SSE-LABEL: merge_2_v4f32_align16_ntstore:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps (%rdi), %xmm0
; X64-SSE-NEXT:    movaps 16(%rdi), %xmm1
; X64-SSE-NEXT:    movntps %xmm0, (%rsi)
; X64-SSE-NEXT:    movntps %xmm1, 16(%rsi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: merge_2_v4f32_align16_ntstore:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %xmm0
; X64-AVX-NEXT:    vmovaps 16(%rdi), %xmm1
; X64-AVX-NEXT:    vmovntps %xmm0, (%rsi)
; X64-AVX-NEXT:    vmovntps %xmm1, 16(%rsi)
; X64-AVX-NEXT:    retq
  %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
  %2 = bitcast float* %1 to <4 x float>*
  %3 = load <4 x float>, <4 x float>* %a0, align 16
  %4 = load <4 x float>, <4 x float>* %2, align 16
  %5 = getelementptr inbounds <4 x float>, <4 x float>* %a1, i64 1, i64 0
  %6 = bitcast float* %5 to <4 x float>*
  store <4 x float> %3, <4 x float>* %a1, align 16, !nontemporal !0
  store <4 x float> %4, <4 x float>* %6, align 16, !nontemporal !0
  ret void
}

; Nothing can perform NT-load-vector on 1-byte aligned memory.
; Just perform regular loads.
define void @merge_2_v4f32_align1_ntload(<4 x float>* %a0, <4 x float>* %a1) nounwind {
; X86-LABEL: merge_2_v4f32_align1_ntload:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movups (%ecx), %xmm0
; X86-NEXT:    movups 16(%ecx), %xmm1
; X86-NEXT:    movups %xmm0, (%eax)
; X86-NEXT:    movups %xmm1, 16(%eax)
; X86-NEXT:    retl
;
; X64-SSE-LABEL: merge_2_v4f32_align1_ntload:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movups (%rdi), %xmm0
; X64-SSE-NEXT:    movups 16(%rdi), %xmm1
; X64-SSE-NEXT:    movups %xmm0, (%rsi)
; X64-SSE-NEXT:    movups %xmm1, 16(%rsi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: merge_2_v4f32_align1_ntload:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovups (%rdi), %ymm0
; X64-AVX-NEXT:    vmovups %ymm0, (%rsi)
; X64-AVX-NEXT:    vzeroupper
; X64-AVX-NEXT:    retq
  %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
  %2 = bitcast float* %1 to <4 x float>*
  %3 = load <4 x float>, <4 x float>* %a0, align 1, !nontemporal !0
  %4 = load <4 x float>, <4 x float>* %2, align 1, !nontemporal !0
  %5 = getelementptr inbounds <4 x float>, <4 x float>* %a1, i64 1, i64 0
  %6 = bitcast float* %5 to <4 x float>*
  store <4 x float> %3, <4 x float>* %a1, align 1
  store <4 x float> %4, <4 x float>* %6, align 1
  ret void
}

; Nothing can perform NT-store-vector on 1-byte aligned memory.
; Must be scalarized to use MOVNTI/MOVNTSD.
define void @merge_2_v4f32_align1_ntstore(<4 x float>* %a0, <4 x float>* %a1) nounwind {
; X86-SSE2-LABEL: merge_2_v4f32_align1_ntstore:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
; X86-SSE2-NEXT:    movd %xmm0, %ecx
; X86-SSE2-NEXT:    movntil %ecx, (%eax)
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; X86-SSE2-NEXT:    movd %xmm2, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 12(%eax)
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; X86-SSE2-NEXT:    movd %xmm2, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 8(%eax)
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X86-SSE2-NEXT:    movd %xmm0, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 4(%eax)
; X86-SSE2-NEXT:    movd %xmm1, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 16(%eax)
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; X86-SSE2-NEXT:    movd %xmm0, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 28(%eax)
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X86-SSE2-NEXT:    movd %xmm0, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 24(%eax)
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; X86-SSE2-NEXT:    movd %xmm0, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 20(%eax)
; X86-SSE2-NEXT:    retl
;
; X86-SSE4A-LABEL: merge_2_v4f32_align1_ntstore:
; X86-SSE4A:       # %bb.0:
; X86-SSE4A-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE4A-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE4A-NEXT:    movups (%ecx), %xmm0
; X86-SSE4A-NEXT:    movups 16(%ecx), %xmm1
; X86-SSE4A-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
; X86-SSE4A-NEXT:    movsd {{.*#+}} xmm3 = mem[0],zero
; X86-SSE4A-NEXT:    movntsd %xmm2, 8(%eax)
; X86-SSE4A-NEXT:    movntsd %xmm0, (%eax)
; X86-SSE4A-NEXT:    movntsd %xmm3, 24(%eax)
; X86-SSE4A-NEXT:    movntsd %xmm1, 16(%eax)
; X86-SSE4A-NEXT:    retl
;
; X64-SSE2-LABEL: merge_2_v4f32_align1_ntstore:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
; X64-SSE2-NEXT:    movq %xmm0, %rax
; X64-SSE2-NEXT:    movntiq %rax, (%rsi)
; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-SSE2-NEXT:    movq %xmm0, %rax
; X64-SSE2-NEXT:    movntiq %rax, 8(%rsi)
; X64-SSE2-NEXT:    movq %xmm1, %rax
; X64-SSE2-NEXT:    movntiq %rax, 16(%rsi)
; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X64-SSE2-NEXT:    movq %xmm0, %rax
; X64-SSE2-NEXT:    movntiq %rax, 24(%rsi)
; X64-SSE2-NEXT:    retq
;
; X64-SSE4A-LABEL: merge_2_v4f32_align1_ntstore:
; X64-SSE4A:       # %bb.0:
; X64-SSE4A-NEXT:    movups (%rdi), %xmm0
; X64-SSE4A-NEXT:    movups 16(%rdi), %xmm1
; X64-SSE4A-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
; X64-SSE4A-NEXT:    movsd {{.*#+}} xmm3 = mem[0],zero
; X64-SSE4A-NEXT:    movntsd %xmm2, 8(%rsi)
; X64-SSE4A-NEXT:    movntsd %xmm0, (%rsi)
; X64-SSE4A-NEXT:    movntsd %xmm3, 24(%rsi)
; X64-SSE4A-NEXT:    movntsd %xmm1, 16(%rsi)
; X64-SSE4A-NEXT:    retq
;
; X64-SSE41-LABEL: merge_2_v4f32_align1_ntstore:
; X64-SSE41:       # %bb.0:
; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
; X64-SSE41-NEXT:    movdqu 16(%rdi), %xmm1
; X64-SSE41-NEXT:    pextrq $1, %xmm0, %rax
; X64-SSE41-NEXT:    movntiq %rax, 8(%rsi)
; X64-SSE41-NEXT:    movq %xmm0, %rax
; X64-SSE41-NEXT:    movntiq %rax, (%rsi)
; X64-SSE41-NEXT:    pextrq $1, %xmm1, %rax
; X64-SSE41-NEXT:    movntiq %rax, 24(%rsi)
; X64-SSE41-NEXT:    movq %xmm1, %rax
; X64-SSE41-NEXT:    movntiq %rax, 16(%rsi)
; X64-SSE41-NEXT:    retq
;
; X64-AVX-LABEL: merge_2_v4f32_align1_ntstore:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
; X64-AVX-NEXT:    vmovdqu 16(%rdi), %xmm1
; X64-AVX-NEXT:    vpextrq $1, %xmm0, %rax
; X64-AVX-NEXT:    movntiq %rax, 8(%rsi)
; X64-AVX-NEXT:    vmovq %xmm0, %rax
; X64-AVX-NEXT:    movntiq %rax, (%rsi)
; X64-AVX-NEXT:    vpextrq $1, %xmm1, %rax
; X64-AVX-NEXT:    movntiq %rax, 24(%rsi)
; X64-AVX-NEXT:    vmovq %xmm1, %rax
; X64-AVX-NEXT:    movntiq %rax, 16(%rsi)
; X64-AVX-NEXT:    retq
  %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
  %2 = bitcast float* %1 to <4 x float>*
  %3 = load <4 x float>, <4 x float>* %a0, align 1
  %4 = load <4 x float>, <4 x float>* %2, align 1
  %5 = getelementptr inbounds <4 x float>, <4 x float>* %a1, i64 1, i64 0
  %6 = bitcast float* %5 to <4 x float>*
  store <4 x float> %3, <4 x float>* %a1, align 1, !nontemporal !0
  store <4 x float> %4, <4 x float>* %6, align 1, !nontemporal !0
  ret void
}

; Nothing can perform NT-load-vector on 1-byte aligned memory.
; Just perform regular loads and scalarize NT-stores.
define void @merge_2_v4f32_align1(<4 x float>* %a0, <4 x float>* %a1) nounwind {
; X86-SSE2-LABEL: merge_2_v4f32_align1:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
; X86-SSE2-NEXT:    movd %xmm0, %ecx
; X86-SSE2-NEXT:    movntil %ecx, (%eax)
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; X86-SSE2-NEXT:    movd %xmm2, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 12(%eax)
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; X86-SSE2-NEXT:    movd %xmm2, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 8(%eax)
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X86-SSE2-NEXT:    movd %xmm0, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 4(%eax)
; X86-SSE2-NEXT:    movd %xmm1, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 16(%eax)
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; X86-SSE2-NEXT:    movd %xmm0, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 28(%eax)
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X86-SSE2-NEXT:    movd %xmm0, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 24(%eax)
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; X86-SSE2-NEXT:    movd %xmm0, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 20(%eax)
; X86-SSE2-NEXT:    retl
;
; X86-SSE4A-LABEL: merge_2_v4f32_align1:
; X86-SSE4A:       # %bb.0:
; X86-SSE4A-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE4A-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE4A-NEXT:    movups (%ecx), %xmm0
; X86-SSE4A-NEXT:    movups 16(%ecx), %xmm1
; X86-SSE4A-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
; X86-SSE4A-NEXT:    movsd {{.*#+}} xmm3 = mem[0],zero
; X86-SSE4A-NEXT:    movntsd %xmm2, 8(%eax)
; X86-SSE4A-NEXT:    movntsd %xmm0, (%eax)
; X86-SSE4A-NEXT:    movntsd %xmm3, 24(%eax)
; X86-SSE4A-NEXT:    movntsd %xmm1, 16(%eax)
; X86-SSE4A-NEXT:    retl
;
; X64-SSE2-LABEL: merge_2_v4f32_align1:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
; X64-SSE2-NEXT:    movq %xmm0, %rax
; X64-SSE2-NEXT:    movntiq %rax, (%rsi)
; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-SSE2-NEXT:    movq %xmm0, %rax
; X64-SSE2-NEXT:    movntiq %rax, 8(%rsi)
; X64-SSE2-NEXT:    movq %xmm1, %rax
; X64-SSE2-NEXT:    movntiq %rax, 16(%rsi)
; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X64-SSE2-NEXT:    movq %xmm0, %rax
; X64-SSE2-NEXT:    movntiq %rax, 24(%rsi)
; X64-SSE2-NEXT:    retq
;
; X64-SSE4A-LABEL: merge_2_v4f32_align1:
; X64-SSE4A:       # %bb.0:
; X64-SSE4A-NEXT:    movups (%rdi), %xmm0
; X64-SSE4A-NEXT:    movups 16(%rdi), %xmm1
; X64-SSE4A-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
; X64-SSE4A-NEXT:    movsd {{.*#+}} xmm3 = mem[0],zero
; X64-SSE4A-NEXT:    movntsd %xmm2, 8(%rsi)
; X64-SSE4A-NEXT:    movntsd %xmm0, (%rsi)
; X64-SSE4A-NEXT:    movntsd %xmm3, 24(%rsi)
; X64-SSE4A-NEXT:    movntsd %xmm1, 16(%rsi)
; X64-SSE4A-NEXT:    retq
;
; X64-SSE41-LABEL: merge_2_v4f32_align1:
; X64-SSE41:       # %bb.0:
; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
; X64-SSE41-NEXT:    movdqu 16(%rdi), %xmm1
; X64-SSE41-NEXT:    pextrq $1, %xmm0, %rax
; X64-SSE41-NEXT:    movntiq %rax, 8(%rsi)
; X64-SSE41-NEXT:    movq %xmm0, %rax
; X64-SSE41-NEXT:    movntiq %rax, (%rsi)
; X64-SSE41-NEXT:    pextrq $1, %xmm1, %rax
; X64-SSE41-NEXT:    movntiq %rax, 24(%rsi)
; X64-SSE41-NEXT:    movq %xmm1, %rax
; X64-SSE41-NEXT:    movntiq %rax, 16(%rsi)
; X64-SSE41-NEXT:    retq
;
; X64-AVX-LABEL: merge_2_v4f32_align1:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
; X64-AVX-NEXT:    vmovdqu 16(%rdi), %xmm1
; X64-AVX-NEXT:    vpextrq $1, %xmm0, %rax
; X64-AVX-NEXT:    movntiq %rax, 8(%rsi)
; X64-AVX-NEXT:    vmovq %xmm0, %rax
; X64-AVX-NEXT:    movntiq %rax, (%rsi)
; X64-AVX-NEXT:    vpextrq $1, %xmm1, %rax
; X64-AVX-NEXT:    movntiq %rax, 24(%rsi)
; X64-AVX-NEXT:    vmovq %xmm1, %rax
; X64-AVX-NEXT:    movntiq %rax, 16(%rsi)
; X64-AVX-NEXT:    retq
  %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
  %2 = bitcast float* %1 to <4 x float>*
  %3 = load <4 x float>, <4 x float>* %a0, align 1, !nontemporal !0
  %4 = load <4 x float>, <4 x float>* %2, align 1, !nontemporal !0
  %5 = getelementptr inbounds <4 x float>, <4 x float>* %a1, i64 1, i64 0
  %6 = bitcast float* %5 to <4 x float>*
  store <4 x float> %3, <4 x float>* %a1, align 1, !nontemporal !0
  store <4 x float> %4, <4 x float>* %6, align 1, !nontemporal !0
  ret void
}

!0 = !{i32 1}