; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512

; Test codegen for under-aligned nontemporal vector loads
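; MOVNTDQA-family loads require their natural alignment (16 bytes for XMM,
; 32 for YMM, 64 for ZMM). Roughly, the expectation below is: loads with less
; than 16-byte alignment fall back to ordinary unaligned loads (dropping the
; nontemporal hint), while wider vectors that are at least 16-byte aligned are
; split into smaller aligned nontemporal loads and reassembled, either with
; VINSERTF128 or through an aligned stack temporary, depending on subtarget.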

; XMM versions.

define <2 x double> @test_v2f64_align1(<2 x double>* %src) nounwind {
; SSE-LABEL: test_v2f64_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f64_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %xmm0
; AVX-NEXT:    retq
  %1 = load <2 x double>, <2 x double>* %src, align 1, !nontemporal !1
  ret <2 x double> %1
}

define <4 x float> @test_v4f32_align1(<4 x float>* %src) nounwind {
; SSE-LABEL: test_v4f32_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f32_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %xmm0
; AVX-NEXT:    retq
  %1 = load <4 x float>, <4 x float>* %src, align 1, !nontemporal !1
  ret <4 x float> %1
}

define <2 x i64> @test_v2i64_align1(<2 x i64>* %src) nounwind {
; SSE-LABEL: test_v2i64_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2i64_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %xmm0
; AVX-NEXT:    retq
  %1 = load <2 x i64>, <2 x i64>* %src, align 1, !nontemporal !1
  ret <2 x i64> %1
}

define <4 x i32> @test_v4i32_align1(<4 x i32>* %src) nounwind {
; SSE-LABEL: test_v4i32_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4i32_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %xmm0
; AVX-NEXT:    retq
  %1 = load <4 x i32>, <4 x i32>* %src, align 1, !nontemporal !1
  ret <4 x i32> %1
}

define <8 x i16> @test_v8i16_align1(<8 x i16>* %src) nounwind {
; SSE-LABEL: test_v8i16_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8i16_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %xmm0
; AVX-NEXT:    retq
  %1 = load <8 x i16>, <8 x i16>* %src, align 1, !nontemporal !1
  ret <8 x i16> %1
}

define <16 x i8> @test_v16i8_align1(<16 x i8>* %src) nounwind {
; SSE-LABEL: test_v16i8_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16i8_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %xmm0
; AVX-NEXT:    retq
  %1 = load <16 x i8>, <16 x i8>* %src, align 1, !nontemporal !1
  ret <16 x i8> %1
}

; YMM versions.
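; For 256-bit loads: below 16-byte alignment we expect plain unaligned loads;
; at exactly 16-byte alignment SSE4.1 can still use two MOVNTDQA XMM loads,
; while the AVX targets here rebuild the YMM value through a 32-byte aligned
; stack slot.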

define <4 x double> @test_v4f64_align1(<4 x double>* %src) nounwind {
; SSE-LABEL: test_v4f64_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f64_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %ymm0
; AVX-NEXT:    retq
  %1 = load <4 x double>, <4 x double>* %src, align 1, !nontemporal !1
  ret <4 x double> %1
}

define <8 x float> @test_v8f32_align1(<8 x float>* %src) nounwind {
; SSE-LABEL: test_v8f32_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8f32_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %ymm0
; AVX-NEXT:    retq
  %1 = load <8 x float>, <8 x float>* %src, align 1, !nontemporal !1
  ret <8 x float> %1
}

define <4 x i64> @test_v4i64_align1(<4 x i64>* %src) nounwind {
; SSE-LABEL: test_v4i64_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4i64_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %ymm0
; AVX-NEXT:    retq
  %1 = load <4 x i64>, <4 x i64>* %src, align 1, !nontemporal !1
  ret <4 x i64> %1
}

define <8 x i32> @test_v8i32_align1(<8 x i32>* %src) nounwind {
; SSE-LABEL: test_v8i32_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8i32_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %ymm0
; AVX-NEXT:    retq
  %1 = load <8 x i32>, <8 x i32>* %src, align 1, !nontemporal !1
  ret <8 x i32> %1
}

define <16 x i16> @test_v16i16_align1(<16 x i16>* %src) nounwind {
; SSE-LABEL: test_v16i16_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16i16_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %ymm0
; AVX-NEXT:    retq
  %1 = load <16 x i16>, <16 x i16>* %src, align 1, !nontemporal !1
  ret <16 x i16> %1
}

define <32 x i8> @test_v32i8_align1(<32 x i8>* %src) nounwind {
; SSE-LABEL: test_v32i8_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v32i8_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %ymm0
; AVX-NEXT:    retq
  %1 = load <32 x i8>, <32 x i8>* %src, align 1, !nontemporal !1
  ret <32 x i8> %1
}

define <4 x double> @test_v4f64_align16(<4 x double>* %src) nounwind {
; SSE2-LABEL: test_v4f64_align16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f64_align16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4f64_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbp
; AVX-NEXT:    movq %rsp, %rbp
; AVX-NEXT:    andq $-32, %rsp
; AVX-NEXT:    subq $64, %rsp
; AVX-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX-NEXT:    vmovaps (%rsp), %ymm0
; AVX-NEXT:    movq %rbp, %rsp
; AVX-NEXT:    popq %rbp
; AVX-NEXT:    retq
  %1 = load <4 x double>, <4 x double>* %src, align 16, !nontemporal !1
  ret <4 x double> %1
}

define <8 x float> @test_v8f32_align16(<8 x float>* %src) nounwind {
; SSE2-LABEL: test_v8f32_align16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f32_align16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8f32_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbp
; AVX-NEXT:    movq %rsp, %rbp
; AVX-NEXT:    andq $-32, %rsp
; AVX-NEXT:    subq $64, %rsp
; AVX-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX-NEXT:    vmovaps (%rsp), %ymm0
; AVX-NEXT:    movq %rbp, %rsp
; AVX-NEXT:    popq %rbp
; AVX-NEXT:    retq
  %1 = load <8 x float>, <8 x float>* %src, align 16, !nontemporal !1
  ret <8 x float> %1
}

define <4 x i64> @test_v4i64_align16(<4 x i64>* %src) nounwind {
; SSE2-LABEL: test_v4i64_align16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4i64_align16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4i64_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbp
; AVX-NEXT:    movq %rsp, %rbp
; AVX-NEXT:    andq $-32, %rsp
; AVX-NEXT:    subq $64, %rsp
; AVX-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX-NEXT:    vmovaps (%rsp), %ymm0
; AVX-NEXT:    movq %rbp, %rsp
; AVX-NEXT:    popq %rbp
; AVX-NEXT:    retq
  %1 = load <4 x i64>, <4 x i64>* %src, align 16, !nontemporal !1
  ret <4 x i64> %1
}

define <8 x i32> @test_v8i32_align16(<8 x i32>* %src) nounwind {
; SSE2-LABEL: test_v8i32_align16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8i32_align16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8i32_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbp
; AVX-NEXT:    movq %rsp, %rbp
; AVX-NEXT:    andq $-32, %rsp
; AVX-NEXT:    subq $64, %rsp
; AVX-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX-NEXT:    vmovaps (%rsp), %ymm0
; AVX-NEXT:    movq %rbp, %rsp
; AVX-NEXT:    popq %rbp
; AVX-NEXT:    retq
  %1 = load <8 x i32>, <8 x i32>* %src, align 16, !nontemporal !1
  ret <8 x i32> %1
}

define <16 x i16> @test_v16i16_align16(<16 x i16>* %src) nounwind {
; SSE2-LABEL: test_v16i16_align16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16i16_align16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16i16_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbp
; AVX-NEXT:    movq %rsp, %rbp
; AVX-NEXT:    andq $-32, %rsp
; AVX-NEXT:    subq $64, %rsp
; AVX-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX-NEXT:    vmovaps (%rsp), %ymm0
; AVX-NEXT:    movq %rbp, %rsp
; AVX-NEXT:    popq %rbp
; AVX-NEXT:    retq
  %1 = load <16 x i16>, <16 x i16>* %src, align 16, !nontemporal !1
  ret <16 x i16> %1
}

define <32 x i8> @test_v32i8_align16(<32 x i8>* %src) nounwind {
; SSE2-LABEL: test_v32i8_align16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v32i8_align16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v32i8_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbp
; AVX-NEXT:    movq %rsp, %rbp
; AVX-NEXT:    andq $-32, %rsp
; AVX-NEXT:    subq $64, %rsp
; AVX-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX-NEXT:    vmovaps (%rsp), %ymm0
; AVX-NEXT:    movq %rbp, %rsp
; AVX-NEXT:    popq %rbp
; AVX-NEXT:    retq
  %1 = load <32 x i8>, <32 x i8>* %src, align 16, !nontemporal !1
  ret <32 x i8> %1
}

; ZMM versions.
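; For 512-bit loads the same idea applies: 16-byte alignment permits four
; MOVNTDQA XMM loads, 32-byte alignment permits VMOVNTDQA YMM on AVX2+, and
; the AVX512 runs rebuild the ZMM value through a 64-byte aligned stack slot
; whenever the source is not 64-byte aligned.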

define <8 x double> @test_v8f64_align1(<8 x double>* %src) nounwind {
; SSE-LABEL: test_v8f64_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    movups 32(%rdi), %xmm2
; SSE-NEXT:    movups 48(%rdi), %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v8f64_align1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovups (%rdi), %ymm0
; AVX1-NEXT:    vmovups 32(%rdi), %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v8f64_align1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovups (%rdi), %ymm0
; AVX2-NEXT:    vmovups 32(%rdi), %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8f64_align1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovups (%rdi), %zmm0
; AVX512-NEXT:    retq
  %1 = load <8 x double>, <8 x double>* %src, align 1, !nontemporal !1
  ret <8 x double> %1
}

define <16 x float> @test_v16f32_align1(<16 x float>* %src) nounwind {
; SSE-LABEL: test_v16f32_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    movups 32(%rdi), %xmm2
; SSE-NEXT:    movups 48(%rdi), %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v16f32_align1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovups (%rdi), %ymm0
; AVX1-NEXT:    vmovups 32(%rdi), %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v16f32_align1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovups (%rdi), %ymm0
; AVX2-NEXT:    vmovups 32(%rdi), %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16f32_align1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovups (%rdi), %zmm0
; AVX512-NEXT:    retq
  %1 = load <16 x float>, <16 x float>* %src, align 1, !nontemporal !1
  ret <16 x float> %1
}

define <8 x i64> @test_v8i64_align1(<8 x i64>* %src) nounwind {
; SSE-LABEL: test_v8i64_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    movups 32(%rdi), %xmm2
; SSE-NEXT:    movups 48(%rdi), %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v8i64_align1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovups (%rdi), %ymm0
; AVX1-NEXT:    vmovups 32(%rdi), %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v8i64_align1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovups (%rdi), %ymm0
; AVX2-NEXT:    vmovups 32(%rdi), %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8i64_align1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovups (%rdi), %zmm0
; AVX512-NEXT:    retq
  %1 = load <8 x i64>, <8 x i64>* %src, align 1, !nontemporal !1
  ret <8 x i64> %1
}

define <16 x i32> @test_v16i32_align1(<16 x i32>* %src) nounwind {
; SSE-LABEL: test_v16i32_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    movups 32(%rdi), %xmm2
; SSE-NEXT:    movups 48(%rdi), %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v16i32_align1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovups (%rdi), %ymm0
; AVX1-NEXT:    vmovups 32(%rdi), %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v16i32_align1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovups (%rdi), %ymm0
; AVX2-NEXT:    vmovups 32(%rdi), %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16i32_align1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovups (%rdi), %zmm0
; AVX512-NEXT:    retq
  %1 = load <16 x i32>, <16 x i32>* %src, align 1, !nontemporal !1
  ret <16 x i32> %1
}

define <32 x i16> @test_v32i16_align1(<32 x i16>* %src) nounwind {
; SSE-LABEL: test_v32i16_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    movups 32(%rdi), %xmm2
; SSE-NEXT:    movups 48(%rdi), %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v32i16_align1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovups (%rdi), %ymm0
; AVX1-NEXT:    vmovups 32(%rdi), %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v32i16_align1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovups (%rdi), %ymm0
; AVX2-NEXT:    vmovups 32(%rdi), %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v32i16_align1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovups (%rdi), %zmm0
; AVX512-NEXT:    retq
  %1 = load <32 x i16>, <32 x i16>* %src, align 1, !nontemporal !1
  ret <32 x i16> %1
}

define <64 x i8> @test_v64i8_align1(<64 x i8>* %src) nounwind {
; SSE-LABEL: test_v64i8_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    movups 32(%rdi), %xmm2
; SSE-NEXT:    movups 48(%rdi), %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v64i8_align1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovups (%rdi), %ymm0
; AVX1-NEXT:    vmovups 32(%rdi), %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v64i8_align1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovups (%rdi), %ymm0
; AVX2-NEXT:    vmovups 32(%rdi), %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v64i8_align1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovups (%rdi), %zmm0
; AVX512-NEXT:    retq
  %1 = load <64 x i8>, <64 x i8>* %src, align 1, !nontemporal !1
  ret <64 x i8> %1
}

define <8 x double> @test_v8f64_align16(<8 x double>* %src) nounwind {
; SSE2-LABEL: test_v8f64_align16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    movaps 32(%rdi), %xmm2
; SSE2-NEXT:    movaps 48(%rdi), %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f64_align16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    movntdqa 32(%rdi), %xmm2
; SSE41-NEXT:    movntdqa 48(%rdi), %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v8f64_align16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    movq %rsp, %rbp
; AVX1-NEXT:    andq $-32, %rsp
; AVX1-NEXT:    subq $96, %rsp
; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovaps (%rsp), %ymm0
; AVX1-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX1-NEXT:    movq %rbp, %rsp
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v8f64_align16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    movq %rsp, %rbp
; AVX2-NEXT:    andq $-32, %rsp
; AVX2-NEXT:    subq $96, %rsp
; AVX2-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX2-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovaps (%rsp), %ymm0
; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX2-NEXT:    movq %rbp, %rsp
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8f64_align16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbp
; AVX512-NEXT:    movq %rsp, %rbp
; AVX512-NEXT:    andq $-64, %rsp
; AVX512-NEXT:    subq $128, %rsp
; AVX512-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX512-NEXT:    vmovaps (%rsp), %zmm0
; AVX512-NEXT:    movq %rbp, %rsp
; AVX512-NEXT:    popq %rbp
; AVX512-NEXT:    retq
  %1 = load <8 x double>, <8 x double>* %src, align 16, !nontemporal !1
  ret <8 x double> %1
}

define <16 x float> @test_v16f32_align16(<16 x float>* %src) nounwind {
; SSE2-LABEL: test_v16f32_align16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    movaps 32(%rdi), %xmm2
; SSE2-NEXT:    movaps 48(%rdi), %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32_align16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    movntdqa 32(%rdi), %xmm2
; SSE41-NEXT:    movntdqa 48(%rdi), %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v16f32_align16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    movq %rsp, %rbp
; AVX1-NEXT:    andq $-32, %rsp
; AVX1-NEXT:    subq $96, %rsp
; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovaps (%rsp), %ymm0
; AVX1-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX1-NEXT:    movq %rbp, %rsp
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v16f32_align16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    movq %rsp, %rbp
; AVX2-NEXT:    andq $-32, %rsp
; AVX2-NEXT:    subq $96, %rsp
; AVX2-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX2-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovaps (%rsp), %ymm0
; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX2-NEXT:    movq %rbp, %rsp
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16f32_align16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbp
; AVX512-NEXT:    movq %rsp, %rbp
; AVX512-NEXT:    andq $-64, %rsp
; AVX512-NEXT:    subq $128, %rsp
; AVX512-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX512-NEXT:    vmovaps (%rsp), %zmm0
; AVX512-NEXT:    movq %rbp, %rsp
; AVX512-NEXT:    popq %rbp
; AVX512-NEXT:    retq
  %1 = load <16 x float>, <16 x float>* %src, align 16, !nontemporal !1
  ret <16 x float> %1
}

define <8 x i64> @test_v8i64_align16(<8 x i64>* %src) nounwind {
; SSE2-LABEL: test_v8i64_align16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    movaps 32(%rdi), %xmm2
; SSE2-NEXT:    movaps 48(%rdi), %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8i64_align16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    movntdqa 32(%rdi), %xmm2
; SSE41-NEXT:    movntdqa 48(%rdi), %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v8i64_align16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    movq %rsp, %rbp
; AVX1-NEXT:    andq $-32, %rsp
; AVX1-NEXT:    subq $96, %rsp
; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovaps (%rsp), %ymm0
; AVX1-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX1-NEXT:    movq %rbp, %rsp
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v8i64_align16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    movq %rsp, %rbp
; AVX2-NEXT:    andq $-32, %rsp
; AVX2-NEXT:    subq $96, %rsp
; AVX2-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX2-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovaps (%rsp), %ymm0
; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX2-NEXT:    movq %rbp, %rsp
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8i64_align16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbp
; AVX512-NEXT:    movq %rsp, %rbp
; AVX512-NEXT:    andq $-64, %rsp
; AVX512-NEXT:    subq $128, %rsp
; AVX512-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX512-NEXT:    vmovaps (%rsp), %zmm0
; AVX512-NEXT:    movq %rbp, %rsp
; AVX512-NEXT:    popq %rbp
; AVX512-NEXT:    retq
  %1 = load <8 x i64>, <8 x i64>* %src, align 16, !nontemporal !1
  ret <8 x i64> %1
}

define <16 x i32> @test_v16i32_align16(<16 x i32>* %src) nounwind {
; SSE2-LABEL: test_v16i32_align16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    movaps 32(%rdi), %xmm2
; SSE2-NEXT:    movaps 48(%rdi), %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16i32_align16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    movntdqa 32(%rdi), %xmm2
; SSE41-NEXT:    movntdqa 48(%rdi), %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v16i32_align16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    movq %rsp, %rbp
; AVX1-NEXT:    andq $-32, %rsp
; AVX1-NEXT:    subq $96, %rsp
; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovaps (%rsp), %ymm0
; AVX1-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX1-NEXT:    movq %rbp, %rsp
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v16i32_align16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    movq %rsp, %rbp
; AVX2-NEXT:    andq $-32, %rsp
; AVX2-NEXT:    subq $96, %rsp
; AVX2-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX2-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovaps (%rsp), %ymm0
; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX2-NEXT:    movq %rbp, %rsp
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16i32_align16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbp
; AVX512-NEXT:    movq %rsp, %rbp
; AVX512-NEXT:    andq $-64, %rsp
; AVX512-NEXT:    subq $128, %rsp
; AVX512-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX512-NEXT:    vmovaps (%rsp), %zmm0
; AVX512-NEXT:    movq %rbp, %rsp
; AVX512-NEXT:    popq %rbp
; AVX512-NEXT:    retq
  %1 = load <16 x i32>, <16 x i32>* %src, align 16, !nontemporal !1
  ret <16 x i32> %1
}

define <32 x i16> @test_v32i16_align16(<32 x i16>* %src) nounwind {
; SSE2-LABEL: test_v32i16_align16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    movaps 32(%rdi), %xmm2
; SSE2-NEXT:    movaps 48(%rdi), %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v32i16_align16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    movntdqa 32(%rdi), %xmm2
; SSE41-NEXT:    movntdqa 48(%rdi), %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v32i16_align16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    movq %rsp, %rbp
; AVX1-NEXT:    andq $-32, %rsp
; AVX1-NEXT:    subq $96, %rsp
; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovaps (%rsp), %ymm0
; AVX1-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX1-NEXT:    movq %rbp, %rsp
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v32i16_align16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    movq %rsp, %rbp
; AVX2-NEXT:    andq $-32, %rsp
; AVX2-NEXT:    subq $96, %rsp
; AVX2-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX2-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovaps (%rsp), %ymm0
; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX2-NEXT:    movq %rbp, %rsp
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v32i16_align16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbp
; AVX512-NEXT:    movq %rsp, %rbp
; AVX512-NEXT:    andq $-64, %rsp
; AVX512-NEXT:    subq $128, %rsp
; AVX512-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX512-NEXT:    vmovaps (%rsp), %zmm0
; AVX512-NEXT:    movq %rbp, %rsp
; AVX512-NEXT:    popq %rbp
; AVX512-NEXT:    retq
  %1 = load <32 x i16>, <32 x i16>* %src, align 16, !nontemporal !1
  ret <32 x i16> %1
}

define <64 x i8> @test_v64i8_align16(<64 x i8>* %src) nounwind {
; SSE2-LABEL: test_v64i8_align16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    movaps 32(%rdi), %xmm2
; SSE2-NEXT:    movaps 48(%rdi), %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v64i8_align16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    movntdqa 32(%rdi), %xmm2
; SSE41-NEXT:    movntdqa 48(%rdi), %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v64i8_align16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    movq %rsp, %rbp
; AVX1-NEXT:    andq $-32, %rsp
; AVX1-NEXT:    subq $96, %rsp
; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovaps (%rsp), %ymm0
; AVX1-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX1-NEXT:    movq %rbp, %rsp
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v64i8_align16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    movq %rsp, %rbp
; AVX2-NEXT:    andq $-32, %rsp
; AVX2-NEXT:    subq $96, %rsp
; AVX2-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX2-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovaps (%rsp), %ymm0
; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX2-NEXT:    movq %rbp, %rsp
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v64i8_align16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbp
; AVX512-NEXT:    movq %rsp, %rbp
; AVX512-NEXT:    andq $-64, %rsp
; AVX512-NEXT:    subq $128, %rsp
; AVX512-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX512-NEXT:    vmovaps (%rsp), %zmm0
; AVX512-NEXT:    movq %rbp, %rsp
; AVX512-NEXT:    popq %rbp
; AVX512-NEXT:    retq
  %1 = load <64 x i8>, <64 x i8>* %src, align 16, !nontemporal !1
  ret <64 x i8> %1
}

define <8 x double> @test_v8f64_align32(<8 x double>* %src) nounwind {
; SSE2-LABEL: test_v8f64_align32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    movaps 32(%rdi), %xmm2
; SSE2-NEXT:    movaps 48(%rdi), %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f64_align32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    movntdqa 32(%rdi), %xmm2
; SSE41-NEXT:    movntdqa 48(%rdi), %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v8f64_align32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm1
; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v8f64_align32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovntdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8f64_align32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbp
; AVX512-NEXT:    movq %rsp, %rbp
; AVX512-NEXT:    andq $-64, %rsp
; AVX512-NEXT:    subq $128, %rsp
; AVX512-NEXT:    vmovntdqa 32(%rdi), %ymm0
; AVX512-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa (%rdi), %ymm0
; AVX512-NEXT:    vmovdqa %ymm0, (%rsp)
; AVX512-NEXT:    vmovaps (%rsp), %zmm0
; AVX512-NEXT:    movq %rbp, %rsp
; AVX512-NEXT:    popq %rbp
; AVX512-NEXT:    retq
  %1 = load <8 x double>, <8 x double>* %src, align 32, !nontemporal !1
  ret <8 x double> %1
}

define <16 x float> @test_v16f32_align32(<16 x float>* %src) nounwind {
; SSE2-LABEL: test_v16f32_align32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    movaps 32(%rdi), %xmm2
; SSE2-NEXT:    movaps 48(%rdi), %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32_align32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    movntdqa 32(%rdi), %xmm2
; SSE41-NEXT:    movntdqa 48(%rdi), %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v16f32_align32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm1
; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v16f32_align32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovntdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16f32_align32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbp
; AVX512-NEXT:    movq %rsp, %rbp
; AVX512-NEXT:    andq $-64, %rsp
; AVX512-NEXT:    subq $128, %rsp
; AVX512-NEXT:    vmovntdqa 32(%rdi), %ymm0
; AVX512-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa (%rdi), %ymm0
; AVX512-NEXT:    vmovdqa %ymm0, (%rsp)
; AVX512-NEXT:    vmovaps (%rsp), %zmm0
; AVX512-NEXT:    movq %rbp, %rsp
; AVX512-NEXT:    popq %rbp
; AVX512-NEXT:    retq
  %1 = load <16 x float>, <16 x float>* %src, align 32, !nontemporal !1
  ret <16 x float> %1
}

define <8 x i64> @test_v8i64_align32(<8 x i64>* %src) nounwind {
; SSE2-LABEL: test_v8i64_align32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    movaps 32(%rdi), %xmm2
; SSE2-NEXT:    movaps 48(%rdi), %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8i64_align32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    movntdqa 32(%rdi), %xmm2
; SSE41-NEXT:    movntdqa 48(%rdi), %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v8i64_align32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm1
; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v8i64_align32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovntdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8i64_align32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbp
; AVX512-NEXT:    movq %rsp, %rbp
; AVX512-NEXT:    andq $-64, %rsp
; AVX512-NEXT:    subq $128, %rsp
; AVX512-NEXT:    vmovntdqa 32(%rdi), %ymm0
; AVX512-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa (%rdi), %ymm0
; AVX512-NEXT:    vmovdqa %ymm0, (%rsp)
; AVX512-NEXT:    vmovaps (%rsp), %zmm0
; AVX512-NEXT:    movq %rbp, %rsp
; AVX512-NEXT:    popq %rbp
; AVX512-NEXT:    retq
  %1 = load <8 x i64>, <8 x i64>* %src, align 32, !nontemporal !1
  ret <8 x i64> %1
}

define <16 x i32> @test_v16i32_align32(<16 x i32>* %src) nounwind {
; SSE2-LABEL: test_v16i32_align32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    movaps 32(%rdi), %xmm2
; SSE2-NEXT:    movaps 48(%rdi), %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16i32_align32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    movntdqa 32(%rdi), %xmm2
; SSE41-NEXT:    movntdqa 48(%rdi), %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v16i32_align32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm1
; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v16i32_align32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovntdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16i32_align32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbp
; AVX512-NEXT:    movq %rsp, %rbp
; AVX512-NEXT:    andq $-64, %rsp
; AVX512-NEXT:    subq $128, %rsp
; AVX512-NEXT:    vmovntdqa 32(%rdi), %ymm0
; AVX512-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa (%rdi), %ymm0
; AVX512-NEXT:    vmovdqa %ymm0, (%rsp)
; AVX512-NEXT:    vmovaps (%rsp), %zmm0
; AVX512-NEXT:    movq %rbp, %rsp
; AVX512-NEXT:    popq %rbp
; AVX512-NEXT:    retq
  %1 = load <16 x i32>, <16 x i32>* %src, align 32, !nontemporal !1
  ret <16 x i32> %1
}

define <32 x i16> @test_v32i16_align32(<32 x i16>* %src) nounwind {
; SSE2-LABEL: test_v32i16_align32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    movaps 32(%rdi), %xmm2
; SSE2-NEXT:    movaps 48(%rdi), %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v32i16_align32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    movntdqa 32(%rdi), %xmm2
; SSE41-NEXT:    movntdqa 48(%rdi), %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v32i16_align32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm1
; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v32i16_align32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovntdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v32i16_align32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbp
; AVX512-NEXT:    movq %rsp, %rbp
; AVX512-NEXT:    andq $-64, %rsp
; AVX512-NEXT:    subq $128, %rsp
; AVX512-NEXT:    vmovntdqa 32(%rdi), %ymm0
; AVX512-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa (%rdi), %ymm0
; AVX512-NEXT:    vmovdqa %ymm0, (%rsp)
; AVX512-NEXT:    vmovaps (%rsp), %zmm0
; AVX512-NEXT:    movq %rbp, %rsp
; AVX512-NEXT:    popq %rbp
; AVX512-NEXT:    retq
  %1 = load <32 x i16>, <32 x i16>* %src, align 32, !nontemporal !1
  ret <32 x i16> %1
}

define <64 x i8> @test_v64i8_align32(<64 x i8>* %src) nounwind {
; SSE2-LABEL: test_v64i8_align32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    movaps 32(%rdi), %xmm2
; SSE2-NEXT:    movaps 48(%rdi), %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v64i8_align32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    movntdqa 32(%rdi), %xmm2
; SSE41-NEXT:    movntdqa 48(%rdi), %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v64i8_align32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm1
; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v64i8_align32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovntdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v64i8_align32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbp
; AVX512-NEXT:    movq %rsp, %rbp
; AVX512-NEXT:    andq $-64, %rsp
; AVX512-NEXT:    subq $128, %rsp
; AVX512-NEXT:    vmovntdqa 32(%rdi), %ymm0
; AVX512-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa (%rdi), %ymm0
; AVX512-NEXT:    vmovdqa %ymm0, (%rsp)
; AVX512-NEXT:    vmovaps (%rsp), %zmm0
; AVX512-NEXT:    movq %rbp, %rsp
; AVX512-NEXT:    popq %rbp
; AVX512-NEXT:    retq
  %1 = load <64 x i8>, <64 x i8>* %src, align 32, !nontemporal !1
  ret <64 x i8> %1
}

!1 = !{i32 1}
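; Note: the !nontemporal marker is a metadata node with a single i32 1
; operand, as the LangRef requires.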