; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefixes=SSE,SSE4A
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512

; Test codegen for under-aligned nontemporal vector stores.
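; Non-temporal vector stores have strict alignment requirements (movntps
; needs a 16-byte-aligned operand, vmovntps with a YMM source a
; 32-byte-aligned one), so an under-aligned store must instead be
; scalarized into non-temporal instructions with no alignment requirement:
; movnti for 8-byte integer stores or, on SSE4A targets, movntsd for 8-byte
; scalar FP stores. Every store below carries !nontemporal !1; the node
; !1 = !{i32 1} defined at the end of the file is what marks a store as
; non-temporal.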

; XMM versions.

define void @test_zero_v2f64_align1(<2 x double>* %dst) nounwind {
; SSE-LABEL: test_zero_v2f64_align1:
; SSE:         # %bb.0:
; SSE-NEXT:    xorl %eax, %eax
; SSE-NEXT:    movntiq %rax, 8(%rdi)
; SSE-NEXT:    movntiq %rax, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v2f64_align1:
; AVX:         # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    movntiq %rax, 8(%rdi)
; AVX-NEXT:    movntiq %rax, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v2f64_align1:
; AVX512:         # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax
; AVX512-NEXT:    movntiq %rax, 8(%rdi)
; AVX512-NEXT:    movntiq %rax, (%rdi)
; AVX512-NEXT:    retq
  store <2 x double> zeroinitializer, <2 x double>* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v4f32_align1(<4 x float>* %dst) nounwind {
; SSE-LABEL: test_zero_v4f32_align1:
; SSE:         # %bb.0:
; SSE-NEXT:    xorl %eax, %eax
; SSE-NEXT:    movntiq %rax, 8(%rdi)
; SSE-NEXT:    movntiq %rax, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v4f32_align1:
; AVX:         # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    movntiq %rax, 8(%rdi)
; AVX-NEXT:    movntiq %rax, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v4f32_align1:
; AVX512:         # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax
; AVX512-NEXT:    movntiq %rax, 8(%rdi)
; AVX512-NEXT:    movntiq %rax, (%rdi)
; AVX512-NEXT:    retq
  store <4 x float> zeroinitializer, <4 x float>* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v2i64_align1(<2 x i64>* %dst) nounwind {
; SSE-LABEL: test_zero_v2i64_align1:
; SSE:         # %bb.0:
; SSE-NEXT:    xorl %eax, %eax
; SSE-NEXT:    movntiq %rax, 8(%rdi)
; SSE-NEXT:    movntiq %rax, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v2i64_align1:
; AVX:         # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    movntiq %rax, 8(%rdi)
; AVX-NEXT:    movntiq %rax, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v2i64_align1:
; AVX512:         # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax
; AVX512-NEXT:    movntiq %rax, 8(%rdi)
; AVX512-NEXT:    movntiq %rax, (%rdi)
; AVX512-NEXT:    retq
  store <2 x i64> zeroinitializer, <2 x i64>* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v4i32_align1(<4 x i32>* %dst) nounwind {
; SSE-LABEL: test_zero_v4i32_align1:
; SSE:         # %bb.0:
; SSE-NEXT:    xorl %eax, %eax
; SSE-NEXT:    movntiq %rax, 8(%rdi)
; SSE-NEXT:    movntiq %rax, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v4i32_align1:
; AVX:         # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    movntiq %rax, 8(%rdi)
; AVX-NEXT:    movntiq %rax, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v4i32_align1:
; AVX512:         # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax
; AVX512-NEXT:    movntiq %rax, 8(%rdi)
; AVX512-NEXT:    movntiq %rax, (%rdi)
; AVX512-NEXT:    retq
  store <4 x i32> zeroinitializer, <4 x i32>* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v8i16_align1(<8 x i16>* %dst) nounwind {
; SSE-LABEL: test_zero_v8i16_align1:
; SSE:         # %bb.0:
; SSE-NEXT:    xorl %eax, %eax
; SSE-NEXT:    movntiq %rax, 8(%rdi)
; SSE-NEXT:    movntiq %rax, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v8i16_align1:
; AVX:         # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    movntiq %rax, 8(%rdi)
; AVX-NEXT:    movntiq %rax, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v8i16_align1:
; AVX512:         # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax
; AVX512-NEXT:    movntiq %rax, 8(%rdi)
; AVX512-NEXT:    movntiq %rax, (%rdi)
; AVX512-NEXT:    retq
  store <8 x i16> zeroinitializer, <8 x i16>* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v16i8_align1(<16 x i8>* %dst) nounwind {
; SSE-LABEL: test_zero_v16i8_align1:
; SSE:         # %bb.0:
; SSE-NEXT:    xorl %eax, %eax
; SSE-NEXT:    movntiq %rax, 8(%rdi)
; SSE-NEXT:    movntiq %rax, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v16i8_align1:
; AVX:         # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    movntiq %rax, 8(%rdi)
; AVX-NEXT:    movntiq %rax, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v16i8_align1:
; AVX512:         # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax
; AVX512-NEXT:    movntiq %rax, 8(%rdi)
; AVX512-NEXT:    movntiq %rax, (%rdi)
; AVX512-NEXT:    retq
  store <16 x i8> zeroinitializer, <16 x i8>* %dst, align 1, !nontemporal !1
  ret void
}

; YMM versions.

define void @test_zero_v4f64_align1(<4 x double>* %dst) nounwind {
; SSE-LABEL: test_zero_v4f64_align1:
; SSE:         # %bb.0:
; SSE-NEXT:    xorl %eax, %eax
; SSE-NEXT:    movntiq %rax, 8(%rdi)
; SSE-NEXT:    movntiq %rax, (%rdi)
; SSE-NEXT:    movntiq %rax, 24(%rdi)
; SSE-NEXT:    movntiq %rax, 16(%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v4f64_align1:
; AVX:         # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    movntiq %rax, 8(%rdi)
; AVX-NEXT:    movntiq %rax, (%rdi)
; AVX-NEXT:    movntiq %rax, 24(%rdi)
; AVX-NEXT:    movntiq %rax, 16(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v4f64_align1:
; AVX512:         # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax
; AVX512-NEXT:    movntiq %rax, 8(%rdi)
; AVX512-NEXT:    movntiq %rax, (%rdi)
; AVX512-NEXT:    movntiq %rax, 24(%rdi)
; AVX512-NEXT:    movntiq %rax, 16(%rdi)
; AVX512-NEXT:    retq
  store <4 x double> zeroinitializer, <4 x double>* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v8f32_align1(<8 x float>* %dst) nounwind {
; SSE2-LABEL: test_zero_v8f32_align1:
; SSE2:         # %bb.0:
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    movntiq %rax, 8(%rdi)
; SSE2-NEXT:    movntiq %rax, (%rdi)
; SSE2-NEXT:    movntiq %rax, 24(%rdi)
; SSE2-NEXT:    movntiq %rax, 16(%rdi)
; SSE2-NEXT:    retq
;
; SSE4A-LABEL: test_zero_v8f32_align1:
; SSE4A:         # %bb.0:
; SSE4A-NEXT:    xorl %eax, %eax
; SSE4A-NEXT:    movntiq %rax, 8(%rdi)
; SSE4A-NEXT:    movntiq %rax, 24(%rdi)
; SSE4A-NEXT:    xorps %xmm0, %xmm0
; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
; SSE4A-NEXT:    retq
;
; SSE41-LABEL: test_zero_v8f32_align1:
; SSE41:         # %bb.0:
; SSE41-NEXT:    xorl %eax, %eax
; SSE41-NEXT:    movntiq %rax, 8(%rdi)
; SSE41-NEXT:    movntiq %rax, (%rdi)
; SSE41-NEXT:    movntiq %rax, 24(%rdi)
; SSE41-NEXT:    movntiq %rax, 16(%rdi)
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_zero_v8f32_align1:
; AVX:         # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    movntiq %rax, 8(%rdi)
; AVX-NEXT:    movntiq %rax, (%rdi)
; AVX-NEXT:    movntiq %rax, 24(%rdi)
; AVX-NEXT:    movntiq %rax, 16(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v8f32_align1:
; AVX512:         # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax
; AVX512-NEXT:    movntiq %rax, 8(%rdi)
; AVX512-NEXT:    movntiq %rax, (%rdi)
; AVX512-NEXT:    movntiq %rax, 24(%rdi)
; AVX512-NEXT:    movntiq %rax, 16(%rdi)
; AVX512-NEXT:    retq
  store <8 x float> zeroinitializer, <8 x float>* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v4i64_align1(<4 x i64>* %dst) nounwind {
; SSE2-LABEL: test_zero_v4i64_align1:
; SSE2:         # %bb.0:
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    movntiq %rax, 8(%rdi)
; SSE2-NEXT:    movntiq %rax, (%rdi)
; SSE2-NEXT:    movntiq %rax, 24(%rdi)
; SSE2-NEXT:    movntiq %rax, 16(%rdi)
; SSE2-NEXT:    retq
;
; SSE4A-LABEL: test_zero_v4i64_align1:
; SSE4A:         # %bb.0:
; SSE4A-NEXT:    xorps %xmm0, %xmm0
; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
; SSE4A-NEXT:    retq
;
; SSE41-LABEL: test_zero_v4i64_align1:
; SSE41:         # %bb.0:
; SSE41-NEXT:    xorl %eax, %eax
; SSE41-NEXT:    movntiq %rax, 8(%rdi)
; SSE41-NEXT:    movntiq %rax, (%rdi)
; SSE41-NEXT:    movntiq %rax, 24(%rdi)
; SSE41-NEXT:    movntiq %rax, 16(%rdi)
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_zero_v4i64_align1:
; AVX:         # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    movntiq %rax, 8(%rdi)
; AVX-NEXT:    movntiq %rax, (%rdi)
; AVX-NEXT:    movntiq %rax, 24(%rdi)
; AVX-NEXT:    movntiq %rax, 16(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v4i64_align1:
; AVX512:         # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax
; AVX512-NEXT:    movntiq %rax, 8(%rdi)
; AVX512-NEXT:    movntiq %rax, (%rdi)
; AVX512-NEXT:    movntiq %rax, 24(%rdi)
; AVX512-NEXT:    movntiq %rax, 16(%rdi)
; AVX512-NEXT:    retq
  store <4 x i64> zeroinitializer, <4 x i64>* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v8i32_align1(<8 x i32>* %dst) nounwind {
; SSE2-LABEL: test_zero_v8i32_align1:
; SSE2:         # %bb.0:
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    movntiq %rax, 8(%rdi)
; SSE2-NEXT:    movntiq %rax, (%rdi)
; SSE2-NEXT:    movntiq %rax, 24(%rdi)
; SSE2-NEXT:    movntiq %rax, 16(%rdi)
; SSE2-NEXT:    retq
;
; SSE4A-LABEL: test_zero_v8i32_align1:
; SSE4A:         # %bb.0:
; SSE4A-NEXT:    xorps %xmm0, %xmm0
; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
; SSE4A-NEXT:    retq
;
; SSE41-LABEL: test_zero_v8i32_align1:
; SSE41:         # %bb.0:
; SSE41-NEXT:    xorl %eax, %eax
; SSE41-NEXT:    movntiq %rax, 8(%rdi)
; SSE41-NEXT:    movntiq %rax, (%rdi)
; SSE41-NEXT:    movntiq %rax, 24(%rdi)
; SSE41-NEXT:    movntiq %rax, 16(%rdi)
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_zero_v8i32_align1:
; AVX:         # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    movntiq %rax, 8(%rdi)
; AVX-NEXT:    movntiq %rax, (%rdi)
; AVX-NEXT:    movntiq %rax, 24(%rdi)
; AVX-NEXT:    movntiq %rax, 16(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v8i32_align1:
; AVX512:         # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax
; AVX512-NEXT:    movntiq %rax, 8(%rdi)
; AVX512-NEXT:    movntiq %rax, (%rdi)
; AVX512-NEXT:    movntiq %rax, 24(%rdi)
; AVX512-NEXT:    movntiq %rax, 16(%rdi)
; AVX512-NEXT:    retq
  store <8 x i32> zeroinitializer, <8 x i32>* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v16i16_align1(<16 x i16>* %dst) nounwind {
; SSE2-LABEL: test_zero_v16i16_align1:
; SSE2:         # %bb.0:
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    movntiq %rax, 8(%rdi)
; SSE2-NEXT:    movntiq %rax, (%rdi)
; SSE2-NEXT:    movntiq %rax, 24(%rdi)
; SSE2-NEXT:    movntiq %rax, 16(%rdi)
; SSE2-NEXT:    retq
;
; SSE4A-LABEL: test_zero_v16i16_align1:
; SSE4A:         # %bb.0:
; SSE4A-NEXT:    xorps %xmm0, %xmm0
; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
; SSE4A-NEXT:    retq
;
; SSE41-LABEL: test_zero_v16i16_align1:
; SSE41:         # %bb.0:
; SSE41-NEXT:    xorl %eax, %eax
; SSE41-NEXT:    movntiq %rax, 8(%rdi)
; SSE41-NEXT:    movntiq %rax, (%rdi)
; SSE41-NEXT:    movntiq %rax, 24(%rdi)
; SSE41-NEXT:    movntiq %rax, 16(%rdi)
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_zero_v16i16_align1:
; AVX:         # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    movntiq %rax, 8(%rdi)
; AVX-NEXT:    movntiq %rax, (%rdi)
; AVX-NEXT:    movntiq %rax, 24(%rdi)
; AVX-NEXT:    movntiq %rax, 16(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v16i16_align1:
; AVX512:         # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax
; AVX512-NEXT:    movntiq %rax, 8(%rdi)
; AVX512-NEXT:    movntiq %rax, (%rdi)
; AVX512-NEXT:    movntiq %rax, 24(%rdi)
; AVX512-NEXT:    movntiq %rax, 16(%rdi)
; AVX512-NEXT:    retq
  store <16 x i16> zeroinitializer, <16 x i16>* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v32i8_align1(<32 x i8>* %dst) nounwind {
; SSE2-LABEL: test_zero_v32i8_align1:
; SSE2:         # %bb.0:
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    movntiq %rax, 8(%rdi)
; SSE2-NEXT:    movntiq %rax, (%rdi)
; SSE2-NEXT:    movntiq %rax, 24(%rdi)
; SSE2-NEXT:    movntiq %rax, 16(%rdi)
; SSE2-NEXT:    retq
;
; SSE4A-LABEL: test_zero_v32i8_align1:
; SSE4A:         # %bb.0:
; SSE4A-NEXT:    xorps %xmm0, %xmm0
; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
; SSE4A-NEXT:    retq
;
; SSE41-LABEL: test_zero_v32i8_align1:
; SSE41:         # %bb.0:
; SSE41-NEXT:    xorl %eax, %eax
; SSE41-NEXT:    movntiq %rax, 8(%rdi)
; SSE41-NEXT:    movntiq %rax, (%rdi)
; SSE41-NEXT:    movntiq %rax, 24(%rdi)
; SSE41-NEXT:    movntiq %rax, 16(%rdi)
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_zero_v32i8_align1:
; AVX:         # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    movntiq %rax, 8(%rdi)
; AVX-NEXT:    movntiq %rax, (%rdi)
; AVX-NEXT:    movntiq %rax, 24(%rdi)
; AVX-NEXT:    movntiq %rax, 16(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v32i8_align1:
; AVX512:         # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax
; AVX512-NEXT:    movntiq %rax, 8(%rdi)
; AVX512-NEXT:    movntiq %rax, (%rdi)
; AVX512-NEXT:    movntiq %rax, 24(%rdi)
; AVX512-NEXT:    movntiq %rax, 16(%rdi)
; AVX512-NEXT:    retq
  store <32 x i8> zeroinitializer, <32 x i8>* %dst, align 1, !nontemporal !1
  ret void
}
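; With 16-byte alignment, 256-bit stores no longer need to be scalarized:
; they can be split into two 128-bit movntps stores.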
define void @test_zero_v4f64_align16(<4 x double>* %dst) nounwind {
; SSE-LABEL: test_zero_v4f64_align16:
; SSE:         # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v4f64_align16:
; AVX:         # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v4f64_align16:
; AVX512:         # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
; AVX512-NEXT:    retq
  store <4 x double> zeroinitializer, <4 x double>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v8f32_align16(<8 x float>* %dst) nounwind {
; SSE-LABEL: test_zero_v8f32_align16:
; SSE:         # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v8f32_align16:
; AVX:         # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v8f32_align16:
; AVX512:         # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
; AVX512-NEXT:    retq
  store <8 x float> zeroinitializer, <8 x float>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v4i64_align16(<4 x i64>* %dst) nounwind {
; SSE-LABEL: test_zero_v4i64_align16:
; SSE:         # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v4i64_align16:
; AVX:         # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v4i64_align16:
; AVX512:         # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
; AVX512-NEXT:    retq
  store <4 x i64> zeroinitializer, <4 x i64>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v8i32_align16(<8 x i32>* %dst) nounwind {
; SSE-LABEL: test_zero_v8i32_align16:
; SSE:         # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v8i32_align16:
; AVX:         # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v8i32_align16:
; AVX512:         # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
; AVX512-NEXT:    retq
  store <8 x i32> zeroinitializer, <8 x i32>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v16i16_align16(<16 x i16>* %dst) nounwind {
; SSE-LABEL: test_zero_v16i16_align16:
; SSE:         # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v16i16_align16:
; AVX:         # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v16i16_align16:
; AVX512:         # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
; AVX512-NEXT:    retq
  store <16 x i16> zeroinitializer, <16 x i16>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v32i8_align16(<32 x i8>* %dst) nounwind {
; SSE-LABEL: test_zero_v32i8_align16:
; SSE:         # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v32i8_align16:
; AVX:         # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v32i8_align16:
; AVX512:         # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
; AVX512-NEXT:    retq
  store <32 x i8> zeroinitializer, <32 x i8>* %dst, align 16, !nontemporal !1
  ret void
}

; ZMM versions.

define void @test_zero_v8f64_align1(<8 x double>* %dst) nounwind {
; SSE-LABEL: test_zero_v8f64_align1:
; SSE:         # %bb.0:
; SSE-NEXT:    xorl %eax, %eax
; SSE-NEXT:    movntiq %rax, 24(%rdi)
; SSE-NEXT:    movntiq %rax, 16(%rdi)
; SSE-NEXT:    movntiq %rax, 8(%rdi)
; SSE-NEXT:    movntiq %rax, (%rdi)
; SSE-NEXT:    movntiq %rax, 56(%rdi)
; SSE-NEXT:    movntiq %rax, 48(%rdi)
; SSE-NEXT:    movntiq %rax, 40(%rdi)
; SSE-NEXT:    movntiq %rax, 32(%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v8f64_align1:
; AVX:         # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    movntiq %rax, 24(%rdi)
; AVX-NEXT:    movntiq %rax, 16(%rdi)
; AVX-NEXT:    movntiq %rax, 8(%rdi)
; AVX-NEXT:    movntiq %rax, (%rdi)
; AVX-NEXT:    movntiq %rax, 56(%rdi)
; AVX-NEXT:    movntiq %rax, 48(%rdi)
; AVX-NEXT:    movntiq %rax, 40(%rdi)
; AVX-NEXT:    movntiq %rax, 32(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v8f64_align1:
; AVX512:         # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax
; AVX512-NEXT:    movntiq %rax, 24(%rdi)
; AVX512-NEXT:    movntiq %rax, 16(%rdi)
; AVX512-NEXT:    movntiq %rax, 8(%rdi)
; AVX512-NEXT:    movntiq %rax, (%rdi)
; AVX512-NEXT:    movntiq %rax, 56(%rdi)
; AVX512-NEXT:    movntiq %rax, 48(%rdi)
; AVX512-NEXT:    movntiq %rax, 40(%rdi)
; AVX512-NEXT:    movntiq %rax, 32(%rdi)
; AVX512-NEXT:    retq
  store <8 x double> zeroinitializer, <8 x double>* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v16f32_align1(<16 x float>* %dst) nounwind {
; SSE2-LABEL: test_zero_v16f32_align1:
; SSE2:         # %bb.0:
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    movntiq %rax, 24(%rdi)
; SSE2-NEXT:    movntiq %rax, 16(%rdi)
; SSE2-NEXT:    movntiq %rax, 8(%rdi)
; SSE2-NEXT:    movntiq %rax, (%rdi)
; SSE2-NEXT:    movntiq %rax, 56(%rdi)
; SSE2-NEXT:    movntiq %rax, 48(%rdi)
; SSE2-NEXT:    movntiq %rax, 40(%rdi)
; SSE2-NEXT:    movntiq %rax, 32(%rdi)
; SSE2-NEXT:    retq
;
; SSE4A-LABEL: test_zero_v16f32_align1:
; SSE4A:         # %bb.0:
; SSE4A-NEXT:    xorl %eax, %eax
; SSE4A-NEXT:    movntiq %rax, 24(%rdi)
; SSE4A-NEXT:    movntiq %rax, 8(%rdi)
; SSE4A-NEXT:    movntiq %rax, 56(%rdi)
; SSE4A-NEXT:    movntiq %rax, 40(%rdi)
; SSE4A-NEXT:    xorps %xmm0, %xmm0
; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 48(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 32(%rdi)
; SSE4A-NEXT:    retq
;
; SSE41-LABEL: test_zero_v16f32_align1:
; SSE41:         # %bb.0:
; SSE41-NEXT:    xorl %eax, %eax
; SSE41-NEXT:    movntiq %rax, 24(%rdi)
; SSE41-NEXT:    movntiq %rax, 16(%rdi)
; SSE41-NEXT:    movntiq %rax, 8(%rdi)
; SSE41-NEXT:    movntiq %rax, (%rdi)
; SSE41-NEXT:    movntiq %rax, 56(%rdi)
; SSE41-NEXT:    movntiq %rax, 48(%rdi)
; SSE41-NEXT:    movntiq %rax, 40(%rdi)
; SSE41-NEXT:    movntiq %rax, 32(%rdi)
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_zero_v16f32_align1:
; AVX:         # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    movntiq %rax, 24(%rdi)
; AVX-NEXT:    movntiq %rax, 16(%rdi)
; AVX-NEXT:    movntiq %rax, 8(%rdi)
; AVX-NEXT:    movntiq %rax, (%rdi)
; AVX-NEXT:    movntiq %rax, 56(%rdi)
; AVX-NEXT:    movntiq %rax, 48(%rdi)
; AVX-NEXT:    movntiq %rax, 40(%rdi)
; AVX-NEXT:    movntiq %rax, 32(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v16f32_align1:
; AVX512:         # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax
; AVX512-NEXT:    movntiq %rax, 24(%rdi)
; AVX512-NEXT:    movntiq %rax, 16(%rdi)
; AVX512-NEXT:    movntiq %rax, 8(%rdi)
; AVX512-NEXT:    movntiq %rax, (%rdi)
; AVX512-NEXT:    movntiq %rax, 56(%rdi)
; AVX512-NEXT:    movntiq %rax, 48(%rdi)
; AVX512-NEXT:    movntiq %rax, 40(%rdi)
; AVX512-NEXT:    movntiq %rax, 32(%rdi)
; AVX512-NEXT:    retq
  store <16 x float> zeroinitializer, <16 x float>* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v8i64_align1(<8 x i64>* %dst) nounwind {
; SSE2-LABEL: test_zero_v8i64_align1:
; SSE2:         # %bb.0:
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    movntiq %rax, 24(%rdi)
; SSE2-NEXT:    movntiq %rax, 16(%rdi)
; SSE2-NEXT:    movntiq %rax, 8(%rdi)
; SSE2-NEXT:    movntiq %rax, (%rdi)
; SSE2-NEXT:    movntiq %rax, 56(%rdi)
; SSE2-NEXT:    movntiq %rax, 48(%rdi)
; SSE2-NEXT:    movntiq %rax, 40(%rdi)
; SSE2-NEXT:    movntiq %rax, 32(%rdi)
; SSE2-NEXT:    retq
;
; SSE4A-LABEL: test_zero_v8i64_align1:
; SSE4A:         # %bb.0:
; SSE4A-NEXT:    xorps %xmm0, %xmm0
; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 56(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 48(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 40(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 32(%rdi)
; SSE4A-NEXT:    retq
;
; SSE41-LABEL: test_zero_v8i64_align1:
; SSE41:         # %bb.0:
; SSE41-NEXT:    xorl %eax, %eax
; SSE41-NEXT:    movntiq %rax, 24(%rdi)
; SSE41-NEXT:    movntiq %rax, 16(%rdi)
; SSE41-NEXT:    movntiq %rax, 8(%rdi)
; SSE41-NEXT:    movntiq %rax, (%rdi)
; SSE41-NEXT:    movntiq %rax, 56(%rdi)
; SSE41-NEXT:    movntiq %rax, 48(%rdi)
; SSE41-NEXT:    movntiq %rax, 40(%rdi)
; SSE41-NEXT:    movntiq %rax, 32(%rdi)
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_zero_v8i64_align1:
; AVX:         # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    movntiq %rax, 24(%rdi)
; AVX-NEXT:    movntiq %rax, 16(%rdi)
; AVX-NEXT:    movntiq %rax, 8(%rdi)
; AVX-NEXT:    movntiq %rax, (%rdi)
; AVX-NEXT:    movntiq %rax, 56(%rdi)
; AVX-NEXT:    movntiq %rax, 48(%rdi)
; AVX-NEXT:    movntiq %rax, 40(%rdi)
; AVX-NEXT:    movntiq %rax, 32(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v8i64_align1:
; AVX512:         # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax
; AVX512-NEXT:    movntiq %rax, 24(%rdi)
; AVX512-NEXT:    movntiq %rax, 16(%rdi)
; AVX512-NEXT:    movntiq %rax, 8(%rdi)
; AVX512-NEXT:    movntiq %rax, (%rdi)
; AVX512-NEXT:    movntiq %rax, 56(%rdi)
; AVX512-NEXT:    movntiq %rax, 48(%rdi)
; AVX512-NEXT:    movntiq %rax, 40(%rdi)
; AVX512-NEXT:    movntiq %rax, 32(%rdi)
; AVX512-NEXT:    retq
  store <8 x i64> zeroinitializer, <8 x i64>* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v16i32_align1(<16 x i32>* %dst) nounwind {
; SSE2-LABEL: test_zero_v16i32_align1:
; SSE2:         # %bb.0:
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    movntiq %rax, 24(%rdi)
; SSE2-NEXT:    movntiq %rax, 16(%rdi)
; SSE2-NEXT:    movntiq %rax, 8(%rdi)
; SSE2-NEXT:    movntiq %rax, (%rdi)
; SSE2-NEXT:    movntiq %rax, 56(%rdi)
; SSE2-NEXT:    movntiq %rax, 48(%rdi)
; SSE2-NEXT:    movntiq %rax, 40(%rdi)
; SSE2-NEXT:    movntiq %rax, 32(%rdi)
; SSE2-NEXT:    retq
;
; SSE4A-LABEL: test_zero_v16i32_align1:
; SSE4A:         # %bb.0:
; SSE4A-NEXT:    xorps %xmm0, %xmm0
; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 56(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 48(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 40(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 32(%rdi)
; SSE4A-NEXT:    retq
;
; SSE41-LABEL: test_zero_v16i32_align1:
; SSE41:         # %bb.0:
; SSE41-NEXT:    xorl %eax, %eax
; SSE41-NEXT:    movntiq %rax, 24(%rdi)
; SSE41-NEXT:    movntiq %rax, 16(%rdi)
; SSE41-NEXT:    movntiq %rax, 8(%rdi)
; SSE41-NEXT:    movntiq %rax, (%rdi)
; SSE41-NEXT:    movntiq %rax, 56(%rdi)
; SSE41-NEXT:    movntiq %rax, 48(%rdi)
; SSE41-NEXT:    movntiq %rax, 40(%rdi)
; SSE41-NEXT:    movntiq %rax, 32(%rdi)
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_zero_v16i32_align1:
; AVX:         # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    movntiq %rax, 24(%rdi)
; AVX-NEXT:    movntiq %rax, 16(%rdi)
; AVX-NEXT:    movntiq %rax, 8(%rdi)
; AVX-NEXT:    movntiq %rax, (%rdi)
; AVX-NEXT:    movntiq %rax, 56(%rdi)
; AVX-NEXT:    movntiq %rax, 48(%rdi)
; AVX-NEXT:    movntiq %rax, 40(%rdi)
; AVX-NEXT:    movntiq %rax, 32(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v16i32_align1:
; AVX512:         # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax
; AVX512-NEXT:    movntiq %rax, 24(%rdi)
; AVX512-NEXT:    movntiq %rax, 16(%rdi)
; AVX512-NEXT:    movntiq %rax, 8(%rdi)
; AVX512-NEXT:    movntiq %rax, (%rdi)
; AVX512-NEXT:    movntiq %rax, 56(%rdi)
; AVX512-NEXT:    movntiq %rax, 48(%rdi)
; AVX512-NEXT:    movntiq %rax, 40(%rdi)
; AVX512-NEXT:    movntiq %rax, 32(%rdi)
; AVX512-NEXT:    retq
  store <16 x i32> zeroinitializer, <16 x i32>* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v32i16_align1(<32 x i16>* %dst) nounwind {
; SSE2-LABEL: test_zero_v32i16_align1:
; SSE2:         # %bb.0:
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    movntiq %rax, 24(%rdi)
; SSE2-NEXT:    movntiq %rax, 16(%rdi)
; SSE2-NEXT:    movntiq %rax, 8(%rdi)
; SSE2-NEXT:    movntiq %rax, (%rdi)
; SSE2-NEXT:    movntiq %rax, 56(%rdi)
; SSE2-NEXT:    movntiq %rax, 48(%rdi)
; SSE2-NEXT:    movntiq %rax, 40(%rdi)
; SSE2-NEXT:    movntiq %rax, 32(%rdi)
; SSE2-NEXT:    retq
;
; SSE4A-LABEL: test_zero_v32i16_align1:
; SSE4A:         # %bb.0:
; SSE4A-NEXT:    xorps %xmm0, %xmm0
; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 56(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 48(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 40(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 32(%rdi)
; SSE4A-NEXT:    retq
;
; SSE41-LABEL: test_zero_v32i16_align1:
; SSE41:         # %bb.0:
; SSE41-NEXT:    xorl %eax, %eax
; SSE41-NEXT:    movntiq %rax, 24(%rdi)
; SSE41-NEXT:    movntiq %rax, 16(%rdi)
; SSE41-NEXT:    movntiq %rax, 8(%rdi)
; SSE41-NEXT:    movntiq %rax, (%rdi)
; SSE41-NEXT:    movntiq %rax, 56(%rdi)
; SSE41-NEXT:    movntiq %rax, 48(%rdi)
; SSE41-NEXT:    movntiq %rax, 40(%rdi)
; SSE41-NEXT:    movntiq %rax, 32(%rdi)
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_zero_v32i16_align1:
; AVX:         # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    movntiq %rax, 24(%rdi)
; AVX-NEXT:    movntiq %rax, 16(%rdi)
; AVX-NEXT:    movntiq %rax, 8(%rdi)
; AVX-NEXT:    movntiq %rax, (%rdi)
; AVX-NEXT:    movntiq %rax, 56(%rdi)
; AVX-NEXT:    movntiq %rax, 48(%rdi)
; AVX-NEXT:    movntiq %rax, 40(%rdi)
; AVX-NEXT:    movntiq %rax, 32(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v32i16_align1:
; AVX512:         # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax
; AVX512-NEXT:    movntiq %rax, 24(%rdi)
; AVX512-NEXT:    movntiq %rax, 16(%rdi)
; AVX512-NEXT:    movntiq %rax, 8(%rdi)
; AVX512-NEXT:    movntiq %rax, (%rdi)
; AVX512-NEXT:    movntiq %rax, 56(%rdi)
; AVX512-NEXT:    movntiq %rax, 48(%rdi)
; AVX512-NEXT:    movntiq %rax, 40(%rdi)
; AVX512-NEXT:    movntiq %rax, 32(%rdi)
; AVX512-NEXT:    retq
  store <32 x i16> zeroinitializer, <32 x i16>* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v64i8_align1(<64 x i8>* %dst) nounwind {
; SSE2-LABEL: test_zero_v64i8_align1:
; SSE2:         # %bb.0:
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    movntiq %rax, 24(%rdi)
; SSE2-NEXT:    movntiq %rax, 16(%rdi)
; SSE2-NEXT:    movntiq %rax, 8(%rdi)
; SSE2-NEXT:    movntiq %rax, (%rdi)
; SSE2-NEXT:    movntiq %rax, 56(%rdi)
; SSE2-NEXT:    movntiq %rax, 48(%rdi)
; SSE2-NEXT:    movntiq %rax, 40(%rdi)
; SSE2-NEXT:    movntiq %rax, 32(%rdi)
; SSE2-NEXT:    retq
;
; SSE4A-LABEL: test_zero_v64i8_align1:
; SSE4A:         # %bb.0:
; SSE4A-NEXT:    xorps %xmm0, %xmm0
; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 56(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 48(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 40(%rdi)
; SSE4A-NEXT:    movntsd %xmm0, 32(%rdi)
; SSE4A-NEXT:    retq
;
; SSE41-LABEL: test_zero_v64i8_align1:
; SSE41:         # %bb.0:
; SSE41-NEXT:    xorl %eax, %eax
; SSE41-NEXT:    movntiq %rax, 24(%rdi)
; SSE41-NEXT:    movntiq %rax, 16(%rdi)
; SSE41-NEXT:    movntiq %rax, 8(%rdi)
; SSE41-NEXT:    movntiq %rax, (%rdi)
; SSE41-NEXT:    movntiq %rax, 56(%rdi)
; SSE41-NEXT:    movntiq %rax, 48(%rdi)
; SSE41-NEXT:    movntiq %rax, 40(%rdi)
; SSE41-NEXT:    movntiq %rax, 32(%rdi)
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_zero_v64i8_align1:
; AVX:         # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    movntiq %rax, 24(%rdi)
; AVX-NEXT:    movntiq %rax, 16(%rdi)
; AVX-NEXT:    movntiq %rax, 8(%rdi)
; AVX-NEXT:    movntiq %rax, (%rdi)
; AVX-NEXT:    movntiq %rax, 56(%rdi)
; AVX-NEXT:    movntiq %rax, 48(%rdi)
; AVX-NEXT:    movntiq %rax, 40(%rdi)
; AVX-NEXT:    movntiq %rax, 32(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v64i8_align1:
; AVX512:         # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax
; AVX512-NEXT:    movntiq %rax, 24(%rdi)
; AVX512-NEXT:    movntiq %rax, 16(%rdi)
; AVX512-NEXT:    movntiq %rax, 8(%rdi)
; AVX512-NEXT:    movntiq %rax, (%rdi)
; AVX512-NEXT:    movntiq %rax, 56(%rdi)
; AVX512-NEXT:    movntiq %rax, 48(%rdi)
; AVX512-NEXT:    movntiq %rax, 40(%rdi)
; AVX512-NEXT:    movntiq %rax, 32(%rdi)
; AVX512-NEXT:    retq
  store <64 x i8> zeroinitializer, <64 x i8>* %dst, align 1, !nontemporal !1
  ret void
}
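; With 16-byte alignment, 512-bit stores split into four 128-bit movntps
; stores.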
define void @test_zero_v8f64_align16(<8 x double>* %dst) nounwind {
; SSE-LABEL: test_zero_v8f64_align16:
; SSE:         # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    movntps %xmm0, 48(%rdi)
; SSE-NEXT:    movntps %xmm0, 32(%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v8f64_align16:
; AVX:         # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    vmovntps %xmm0, 48(%rdi)
; AVX-NEXT:    vmovntps %xmm0, 32(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v8f64_align16:
; AVX512:         # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
; AVX512-NEXT:    vmovntps %xmm0, 48(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, 32(%rdi)
; AVX512-NEXT:    retq
  store <8 x double> zeroinitializer, <8 x double>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v16f32_align16(<16 x float>* %dst) nounwind {
; SSE-LABEL: test_zero_v16f32_align16:
; SSE:         # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    movntps %xmm0, 48(%rdi)
; SSE-NEXT:    movntps %xmm0, 32(%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v16f32_align16:
; AVX:         # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    vmovntps %xmm0, 48(%rdi)
; AVX-NEXT:    vmovntps %xmm0, 32(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v16f32_align16:
; AVX512:         # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
; AVX512-NEXT:    vmovntps %xmm0, 48(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, 32(%rdi)
; AVX512-NEXT:    retq
  store <16 x float> zeroinitializer, <16 x float>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v8i64_align16(<8 x i64>* %dst) nounwind {
; SSE-LABEL: test_zero_v8i64_align16:
; SSE:         # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    movntps %xmm0, 48(%rdi)
; SSE-NEXT:    movntps %xmm0, 32(%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v8i64_align16:
; AVX:         # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    vmovntps %xmm0, 48(%rdi)
; AVX-NEXT:    vmovntps %xmm0, 32(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v8i64_align16:
; AVX512:         # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
; AVX512-NEXT:    vmovntps %xmm0, 48(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, 32(%rdi)
; AVX512-NEXT:    retq
  store <8 x i64> zeroinitializer, <8 x i64>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v16i32_align16(<16 x i32>* %dst) nounwind {
; SSE-LABEL: test_zero_v16i32_align16:
; SSE:         # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    movntps %xmm0, 48(%rdi)
; SSE-NEXT:    movntps %xmm0, 32(%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v16i32_align16:
; AVX:         # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    vmovntps %xmm0, 48(%rdi)
; AVX-NEXT:    vmovntps %xmm0, 32(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v16i32_align16:
; AVX512:         # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
; AVX512-NEXT:    vmovntps %xmm0, 48(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, 32(%rdi)
; AVX512-NEXT:    retq
  store <16 x i32> zeroinitializer, <16 x i32>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v32i16_align16(<32 x i16>* %dst) nounwind {
; SSE-LABEL: test_zero_v32i16_align16:
; SSE:         # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    movntps %xmm0, 48(%rdi)
; SSE-NEXT:    movntps %xmm0, 32(%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v32i16_align16:
; AVX:         # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    vmovntps %xmm0, 48(%rdi)
; AVX-NEXT:    vmovntps %xmm0, 32(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v32i16_align16:
; AVX512:         # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
; AVX512-NEXT:    vmovntps %xmm0, 48(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, 32(%rdi)
; AVX512-NEXT:    retq
  store <32 x i16> zeroinitializer, <32 x i16>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v64i8_align16(<64 x i8>* %dst) nounwind {
; SSE-LABEL: test_zero_v64i8_align16:
; SSE:         # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    movntps %xmm0, 48(%rdi)
; SSE-NEXT:    movntps %xmm0, 32(%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v64i8_align16:
; AVX:         # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    vmovntps %xmm0, 48(%rdi)
; AVX-NEXT:    vmovntps %xmm0, 32(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v64i8_align16:
; AVX512:         # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
; AVX512-NEXT:    vmovntps %xmm0, 48(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, 32(%rdi)
; AVX512-NEXT:    retq
  store <64 x i8> zeroinitializer, <64 x i8>* %dst, align 16, !nontemporal !1
  ret void
}
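; With 32-byte alignment, AVX targets can split 512-bit stores into two
; 256-bit vmovntps stores (note the trailing vzeroupper); SSE targets still
; split them into four 128-bit movntps stores.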
define void @test_zero_v8f64_align32(<8 x double>* %dst) nounwind {
; SSE-LABEL: test_zero_v8f64_align32:
; SSE:         # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 48(%rdi)
; SSE-NEXT:    movntps %xmm0, 32(%rdi)
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v8f64_align32:
; AVX:         # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %ymm0, 32(%rdi)
; AVX-NEXT:    vmovntps %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v8f64_align32:
; AVX512:         # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %ymm0, 32(%rdi)
; AVX512-NEXT:    vmovntps %ymm0, (%rdi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  store <8 x double> zeroinitializer, <8 x double>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_zero_v16f32_align32(<16 x float>* %dst) nounwind {
; SSE-LABEL: test_zero_v16f32_align32:
; SSE:         # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 48(%rdi)
; SSE-NEXT:    movntps %xmm0, 32(%rdi)
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v16f32_align32:
; AVX:         # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %ymm0, 32(%rdi)
; AVX-NEXT:    vmovntps %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v16f32_align32:
; AVX512:         # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %ymm0, 32(%rdi)
; AVX512-NEXT:    vmovntps %ymm0, (%rdi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  store <16 x float> zeroinitializer, <16 x float>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_zero_v8i64_align32(<8 x i64>* %dst) nounwind {
; SSE-LABEL: test_zero_v8i64_align32:
; SSE:         # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 48(%rdi)
; SSE-NEXT:    movntps %xmm0, 32(%rdi)
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v8i64_align32:
; AVX:         # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %ymm0, 32(%rdi)
; AVX-NEXT:    vmovntps %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v8i64_align32:
; AVX512:         # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %ymm0, 32(%rdi)
; AVX512-NEXT:    vmovntps %ymm0, (%rdi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  store <8 x i64> zeroinitializer, <8 x i64>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_zero_v16i32_align32(<16 x i32>* %dst) nounwind {
; SSE-LABEL: test_zero_v16i32_align32:
; SSE:         # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 48(%rdi)
; SSE-NEXT:    movntps %xmm0, 32(%rdi)
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v16i32_align32:
; AVX:         # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %ymm0, 32(%rdi)
; AVX-NEXT:    vmovntps %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v16i32_align32:
; AVX512:         # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %ymm0, 32(%rdi)
; AVX512-NEXT:    vmovntps %ymm0, (%rdi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  store <16 x i32> zeroinitializer, <16 x i32>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_zero_v32i16_align32(<32 x i16>* %dst) nounwind {
; SSE-LABEL: test_zero_v32i16_align32:
; SSE:         # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 48(%rdi)
; SSE-NEXT:    movntps %xmm0, 32(%rdi)
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v32i16_align32:
; AVX:         # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %ymm0, 32(%rdi)
; AVX-NEXT:    vmovntps %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v32i16_align32:
; AVX512:         # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %ymm0, 32(%rdi)
; AVX512-NEXT:    vmovntps %ymm0, (%rdi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  store <32 x i16> zeroinitializer, <32 x i16>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_zero_v64i8_align32(<64 x i8>* %dst) nounwind {
; SSE-LABEL: test_zero_v64i8_align32:
; SSE:         # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 48(%rdi)
; SSE-NEXT:    movntps %xmm0, 32(%rdi)
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v64i8_align32:
; AVX:         # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %ymm0, 32(%rdi)
; AVX-NEXT:    vmovntps %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v64i8_align32:
; AVX512:         # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %ymm0, 32(%rdi)
; AVX512-NEXT:    vmovntps %ymm0, (%rdi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  store <64 x i8> zeroinitializer, <64 x i8>* %dst, align 32, !nontemporal !1
  ret void
}

!1 = !{i32 1}