; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse | FileCheck %s --check-prefix=SSE
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse2 | FileCheck %s --check-prefix=SSE
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse2,-slow-unaligned-mem-16 | FileCheck %s --check-prefix=SSE2FAST
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512f -mattr=+prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512bw -mattr=+prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512dq -mattr=+prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512f -mattr=-prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512bw -mattr=-prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW

; https://llvm.org/bugs/show_bug.cgi?id=27100

define void @memset_16_nonzero_bytes(i8* %x) {
; SSE-LABEL: memset_16_nonzero_bytes:
; SSE: # %bb.0:
; SSE-NEXT: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
; SSE-NEXT: movq %rax, 8(%rdi)
; SSE-NEXT: movq %rax, (%rdi)
; SSE-NEXT: retq
;
; SSE2FAST-LABEL: memset_16_nonzero_bytes:
; SSE2FAST: # %bb.0:
; SSE2FAST-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; SSE2FAST-NEXT: movups %xmm0, (%rdi)
; SSE2FAST-NEXT: retq
;
; AVX-LABEL: memset_16_nonzero_bytes:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX-NEXT: vmovups %xmm0, (%rdi)
; AVX-NEXT: retq
  %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 16, i64 -1)
  ret void
}

define void @memset_32_nonzero_bytes(i8* %x) {
; SSE-LABEL: memset_32_nonzero_bytes:
; SSE: # %bb.0:
; SSE-NEXT: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
; SSE-NEXT: movq %rax, 24(%rdi)
; SSE-NEXT: movq %rax, 16(%rdi)
; SSE-NEXT: movq %rax, 8(%rdi)
; SSE-NEXT: movq %rax, (%rdi)
; SSE-NEXT: retq
;
; SSE2FAST-LABEL: memset_32_nonzero_bytes:
; SSE2FAST: # %bb.0:
; SSE2FAST-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; SSE2FAST-NEXT: movups %xmm0, 16(%rdi)
; SSE2FAST-NEXT: movups %xmm0, (%rdi)
; SSE2FAST-NEXT: retq
;
; AVX-LABEL: memset_32_nonzero_bytes:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX-NEXT: vmovups %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 32, i64 -1)
  ret void
}

define void @memset_64_nonzero_bytes(i8* %x) {
; SSE-LABEL: memset_64_nonzero_bytes:
; SSE: # %bb.0:
; SSE-NEXT: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
; SSE-NEXT: movq %rax, 56(%rdi)
; SSE-NEXT: movq %rax, 48(%rdi)
; SSE-NEXT: movq %rax, 40(%rdi)
; SSE-NEXT: movq %rax, 32(%rdi)
; SSE-NEXT: movq %rax, 24(%rdi)
; SSE-NEXT: movq %rax, 16(%rdi)
; SSE-NEXT: movq %rax, 8(%rdi)
; SSE-NEXT: movq %rax, (%rdi)
; SSE-NEXT: retq
;
; SSE2FAST-LABEL: memset_64_nonzero_bytes:
; SSE2FAST: # %bb.0:
; SSE2FAST-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; SSE2FAST-NEXT: movups %xmm0, 48(%rdi)
; SSE2FAST-NEXT: movups %xmm0, 32(%rdi)
; SSE2FAST-NEXT: movups %xmm0, 16(%rdi)
; SSE2FAST-NEXT: movups %xmm0, (%rdi)
; SSE2FAST-NEXT: retq
;
; AVX1-LABEL: memset_64_nonzero_bytes:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
; AVX1-NEXT: vmovups %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: memset_64_nonzero_bytes:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX2-NEXT: vmovups %ymm0, 32(%rdi)
; AVX2-NEXT: vmovups %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: memset_64_nonzero_bytes:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vbroadcastss {{.*#+}} zmm0 = [707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378]
; AVX512F-NEXT: vmovups %zmm0, (%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: memset_64_nonzero_bytes:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovaps {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX512BW-NEXT: vmovups %zmm0, (%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 64, i64 -1)
  ret void
}

define void @memset_128_nonzero_bytes(i8* %x) {
; SSE-LABEL: memset_128_nonzero_bytes:
; SSE: # %bb.0:
; SSE-NEXT: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
; SSE-NEXT: movq %rax, 120(%rdi)
; SSE-NEXT: movq %rax, 112(%rdi)
; SSE-NEXT: movq %rax, 104(%rdi)
; SSE-NEXT: movq %rax, 96(%rdi)
; SSE-NEXT: movq %rax, 88(%rdi)
; SSE-NEXT: movq %rax, 80(%rdi)
; SSE-NEXT: movq %rax, 72(%rdi)
; SSE-NEXT: movq %rax, 64(%rdi)
; SSE-NEXT: movq %rax, 56(%rdi)
; SSE-NEXT: movq %rax, 48(%rdi)
; SSE-NEXT: movq %rax, 40(%rdi)
; SSE-NEXT: movq %rax, 32(%rdi)
; SSE-NEXT: movq %rax, 24(%rdi)
; SSE-NEXT: movq %rax, 16(%rdi)
; SSE-NEXT: movq %rax, 8(%rdi)
; SSE-NEXT: movq %rax, (%rdi)
; SSE-NEXT: retq
;
; SSE2FAST-LABEL: memset_128_nonzero_bytes:
; SSE2FAST: # %bb.0:
; SSE2FAST-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; SSE2FAST-NEXT: movups %xmm0, 112(%rdi)
; SSE2FAST-NEXT: movups %xmm0, 96(%rdi)
; SSE2FAST-NEXT: movups %xmm0, 80(%rdi)
; SSE2FAST-NEXT: movups %xmm0, 64(%rdi)
; SSE2FAST-NEXT: movups %xmm0, 48(%rdi)
; SSE2FAST-NEXT: movups %xmm0, 32(%rdi)
; SSE2FAST-NEXT: movups %xmm0, 16(%rdi)
; SSE2FAST-NEXT: movups %xmm0, (%rdi)
; SSE2FAST-NEXT: retq
;
; AVX1-LABEL: memset_128_nonzero_bytes:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX1-NEXT: vmovups %ymm0, 96(%rdi)
; AVX1-NEXT: vmovups %ymm0, 64(%rdi)
; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
; AVX1-NEXT: vmovups %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: memset_128_nonzero_bytes:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX2-NEXT: vmovups %ymm0, 96(%rdi)
; AVX2-NEXT: vmovups %ymm0, 64(%rdi)
; AVX2-NEXT: vmovups %ymm0, 32(%rdi)
; AVX2-NEXT: vmovups %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: memset_128_nonzero_bytes:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vbroadcastss {{.*#+}} zmm0 = [707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378]
; AVX512F-NEXT: vmovups %zmm0, 64(%rdi)
; AVX512F-NEXT: vmovups %zmm0, (%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: memset_128_nonzero_bytes:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovaps {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX512BW-NEXT: vmovups %zmm0, 64(%rdi)
; AVX512BW-NEXT: vmovups %zmm0, (%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 128, i64 -1)
  ret void
}

define void @memset_256_nonzero_bytes(i8* %x) {
; SSE-LABEL: memset_256_nonzero_bytes:
; SSE: # %bb.0:
; SSE-NEXT: pushq %rax
; SSE-NEXT: .cfi_def_cfa_offset 16
; SSE-NEXT: movl $256, %edx # imm = 0x100
; SSE-NEXT: movl $42, %esi
; SSE-NEXT: callq memset
; SSE-NEXT: popq %rax
; SSE-NEXT: .cfi_def_cfa_offset 8
; SSE-NEXT: retq
;
; SSE2FAST-LABEL: memset_256_nonzero_bytes:
; SSE2FAST: # %bb.0:
; SSE2FAST-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; SSE2FAST-NEXT: movups %xmm0, 240(%rdi)
; SSE2FAST-NEXT: movups %xmm0, 224(%rdi)
; SSE2FAST-NEXT: movups %xmm0, 208(%rdi)
; SSE2FAST-NEXT: movups %xmm0, 192(%rdi)
; SSE2FAST-NEXT: movups %xmm0, 176(%rdi)
; SSE2FAST-NEXT: movups %xmm0, 160(%rdi)
; SSE2FAST-NEXT: movups %xmm0, 144(%rdi)
; SSE2FAST-NEXT: movups %xmm0, 128(%rdi)
; SSE2FAST-NEXT: movups %xmm0, 112(%rdi)
; SSE2FAST-NEXT: movups %xmm0, 96(%rdi)
; SSE2FAST-NEXT: movups %xmm0, 80(%rdi)
; SSE2FAST-NEXT: movups %xmm0, 64(%rdi)
; SSE2FAST-NEXT: movups %xmm0, 48(%rdi)
; SSE2FAST-NEXT: movups %xmm0, 32(%rdi)
; SSE2FAST-NEXT: movups %xmm0, 16(%rdi)
; SSE2FAST-NEXT: movups %xmm0, (%rdi)
; SSE2FAST-NEXT: retq
;
; AVX1-LABEL: memset_256_nonzero_bytes:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX1-NEXT: vmovups %ymm0, 224(%rdi)
; AVX1-NEXT: vmovups %ymm0, 192(%rdi)
; AVX1-NEXT: vmovups %ymm0, 160(%rdi)
; AVX1-NEXT: vmovups %ymm0, 128(%rdi)
; AVX1-NEXT: vmovups %ymm0, 96(%rdi)
; AVX1-NEXT: vmovups %ymm0, 64(%rdi)
; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
; AVX1-NEXT: vmovups %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: memset_256_nonzero_bytes:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX2-NEXT: vmovups %ymm0, 224(%rdi)
; AVX2-NEXT: vmovups %ymm0, 192(%rdi)
; AVX2-NEXT: vmovups %ymm0, 160(%rdi)
; AVX2-NEXT: vmovups %ymm0, 128(%rdi)
; AVX2-NEXT: vmovups %ymm0, 96(%rdi)
; AVX2-NEXT: vmovups %ymm0, 64(%rdi)
; AVX2-NEXT: vmovups %ymm0, 32(%rdi)
; AVX2-NEXT: vmovups %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: memset_256_nonzero_bytes:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vbroadcastss {{.*#+}} zmm0 = [707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378]
; AVX512F-NEXT: vmovups %zmm0, 192(%rdi)
; AVX512F-NEXT: vmovups %zmm0, 128(%rdi)
; AVX512F-NEXT: vmovups %zmm0, 64(%rdi)
; AVX512F-NEXT: vmovups %zmm0, (%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: memset_256_nonzero_bytes:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovaps {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX512BW-NEXT: vmovups %zmm0, 192(%rdi)
; AVX512BW-NEXT: vmovups %zmm0, 128(%rdi)
; AVX512BW-NEXT: vmovups %zmm0, 64(%rdi)
; AVX512BW-NEXT: vmovups %zmm0, (%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 256, i64 -1)
  ret void
}

declare i8* @__memset_chk(i8*, i32, i64, i64)

; Repeat with a non-constant value for the stores.

define void @memset_16_nonconst_bytes(i8* %x, i8 %c) {
; SSE-LABEL: memset_16_nonconst_bytes:
; SSE: # %bb.0:
; SSE-NEXT: # kill: def $esi killed $esi def $rsi
; SSE-NEXT: movzbl %sil, %eax
; SSE-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101
; SSE-NEXT: imulq %rax, %rcx
; SSE-NEXT: movq %rcx, 8(%rdi)
; SSE-NEXT: movq %rcx, (%rdi)
; SSE-NEXT: retq
;
; SSE2FAST-LABEL: memset_16_nonconst_bytes:
; SSE2FAST: # %bb.0:
; SSE2FAST-NEXT: movd %esi, %xmm0
; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2FAST-NEXT: movdqu %xmm0, (%rdi)
; SSE2FAST-NEXT: retq
;
; AVX1-LABEL: memset_16_nonconst_bytes:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %esi, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: memset_16_nonconst_bytes:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %esi, %xmm0
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: memset_16_nonconst_bytes:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovd %esi, %xmm0
; AVX512-NEXT: vpbroadcastb %xmm0, %xmm0
; AVX512-NEXT: vmovdqu %xmm0, (%rdi)
; AVX512-NEXT: retq
  tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 16, i1 false)
  ret void
}

define void @memset_32_nonconst_bytes(i8* %x, i8 %c) {
; SSE-LABEL: memset_32_nonconst_bytes:
; SSE: # %bb.0:
; SSE-NEXT: # kill: def $esi killed $esi def $rsi
; SSE-NEXT: movzbl %sil, %eax
; SSE-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101
; SSE-NEXT: imulq %rax, %rcx
; SSE-NEXT: movq %rcx, 24(%rdi)
; SSE-NEXT: movq %rcx, 16(%rdi)
; SSE-NEXT: movq %rcx, 8(%rdi)
; SSE-NEXT: movq %rcx, (%rdi)
; SSE-NEXT: retq
;
; SSE2FAST-LABEL: memset_32_nonconst_bytes:
; SSE2FAST: # %bb.0:
; SSE2FAST-NEXT: movd %esi, %xmm0
; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2FAST-NEXT: movdqu %xmm0, 16(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, (%rdi)
; SSE2FAST-NEXT: retq
;
; AVX1-LABEL: memset_32_nonconst_bytes:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %esi, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, 16(%rdi)
; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: memset_32_nonconst_bytes:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %esi, %xmm0
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: memset_32_nonconst_bytes:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovd %esi, %xmm0
; AVX512-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512-NEXT: vmovdqu %ymm0, (%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 32, i1 false)
  ret void
}

define void @memset_64_nonconst_bytes(i8* %x, i8 %c) {
; SSE-LABEL: memset_64_nonconst_bytes:
; SSE: # %bb.0:
; SSE-NEXT: # kill: def $esi killed $esi def $rsi
; SSE-NEXT: movzbl %sil, %eax
; SSE-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101
; SSE-NEXT: imulq %rax, %rcx
; SSE-NEXT: movq %rcx, 56(%rdi)
; SSE-NEXT: movq %rcx, 48(%rdi)
; SSE-NEXT: movq %rcx, 40(%rdi)
; SSE-NEXT: movq %rcx, 32(%rdi)
; SSE-NEXT: movq %rcx, 24(%rdi)
; SSE-NEXT: movq %rcx, 16(%rdi)
; SSE-NEXT: movq %rcx, 8(%rdi)
; SSE-NEXT: movq %rcx, (%rdi)
; SSE-NEXT: retq
;
; SSE2FAST-LABEL: memset_64_nonconst_bytes:
; SSE2FAST: # %bb.0:
; SSE2FAST-NEXT: movd %esi, %xmm0
; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2FAST-NEXT: movdqu %xmm0, 48(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, 32(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, 16(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, (%rdi)
; SSE2FAST-NEXT: retq
;
; AVX1-LABEL: memset_64_nonconst_bytes:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %esi, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
; AVX1-NEXT: vmovups %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: memset_64_nonconst_bytes:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %esi, %xmm0
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, 32(%rdi)
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: memset_64_nonconst_bytes:
; AVX512F: # %bb.0:
; AVX512F-NEXT: movzbl %sil, %eax
; AVX512F-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101
; AVX512F-NEXT: vpbroadcastd %eax, %zmm0
; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: memset_64_nonconst_bytes:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastb %esi, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 64, i1 false)
  ret void
}

define void @memset_128_nonconst_bytes(i8* %x, i8 %c) {
; SSE-LABEL: memset_128_nonconst_bytes:
; SSE: # %bb.0:
; SSE-NEXT: # kill: def $esi killed $esi def $rsi
; SSE-NEXT: movzbl %sil, %eax
; SSE-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101
; SSE-NEXT: imulq %rax, %rcx
; SSE-NEXT: movq %rcx, 120(%rdi)
; SSE-NEXT: movq %rcx, 112(%rdi)
; SSE-NEXT: movq %rcx, 104(%rdi)
; SSE-NEXT: movq %rcx, 96(%rdi)
; SSE-NEXT: movq %rcx, 88(%rdi)
; SSE-NEXT: movq %rcx, 80(%rdi)
; SSE-NEXT: movq %rcx, 72(%rdi)
; SSE-NEXT: movq %rcx, 64(%rdi)
; SSE-NEXT: movq %rcx, 56(%rdi)
; SSE-NEXT: movq %rcx, 48(%rdi)
; SSE-NEXT: movq %rcx, 40(%rdi)
; SSE-NEXT: movq %rcx, 32(%rdi)
; SSE-NEXT: movq %rcx, 24(%rdi)
; SSE-NEXT: movq %rcx, 16(%rdi)
; SSE-NEXT: movq %rcx, 8(%rdi)
; SSE-NEXT: movq %rcx, (%rdi)
; SSE-NEXT: retq
;
; SSE2FAST-LABEL: memset_128_nonconst_bytes:
; SSE2FAST: # %bb.0:
; SSE2FAST-NEXT: movd %esi, %xmm0
; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2FAST-NEXT: movdqu %xmm0, 112(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, 96(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, 80(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, 64(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, 48(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, 32(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, 16(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, (%rdi)
; SSE2FAST-NEXT: retq
;
; AVX1-LABEL: memset_128_nonconst_bytes:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %esi, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vmovups %ymm0, 96(%rdi)
; AVX1-NEXT: vmovups %ymm0, 64(%rdi)
; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
; AVX1-NEXT: vmovups %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: memset_128_nonconst_bytes:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %esi, %xmm0
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, 96(%rdi)
; AVX2-NEXT: vmovdqu %ymm0, 64(%rdi)
; AVX2-NEXT: vmovdqu %ymm0, 32(%rdi)
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: memset_128_nonconst_bytes:
; AVX512F: # %bb.0:
; AVX512F-NEXT: movzbl %sil, %eax
; AVX512F-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101
; AVX512F-NEXT: vpbroadcastd %eax, %zmm0
; AVX512F-NEXT: vmovdqu64 %zmm0, 64(%rdi)
; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: memset_128_nonconst_bytes:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastb %esi, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, 64(%rdi)
; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 128, i1 false)
  ret void
}

define void @memset_256_nonconst_bytes(i8* %x, i8 %c) {
; SSE-LABEL: memset_256_nonconst_bytes:
; SSE: # %bb.0:
; SSE-NEXT: movl $256, %edx # imm = 0x100
; SSE-NEXT: jmp memset@PLT # TAILCALL
;
; SSE2FAST-LABEL: memset_256_nonconst_bytes:
; SSE2FAST: # %bb.0:
; SSE2FAST-NEXT: movd %esi, %xmm0
; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2FAST-NEXT: movdqu %xmm0, 240(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, 224(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, 208(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, 192(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, 176(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, 160(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, 144(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, 128(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, 112(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, 96(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, 80(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, 64(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, 48(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, 32(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, 16(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, (%rdi)
; SSE2FAST-NEXT: retq
;
; AVX1-LABEL: memset_256_nonconst_bytes:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %esi, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vmovups %ymm0, 224(%rdi)
; AVX1-NEXT: vmovups %ymm0, 192(%rdi)
; AVX1-NEXT: vmovups %ymm0, 160(%rdi)
; AVX1-NEXT: vmovups %ymm0, 128(%rdi)
; AVX1-NEXT: vmovups %ymm0, 96(%rdi)
; AVX1-NEXT: vmovups %ymm0, 64(%rdi)
; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
; AVX1-NEXT: vmovups %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: memset_256_nonconst_bytes:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %esi, %xmm0
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, 224(%rdi)
; AVX2-NEXT: vmovdqu %ymm0, 192(%rdi)
; AVX2-NEXT: vmovdqu %ymm0, 160(%rdi)
; AVX2-NEXT: vmovdqu %ymm0, 128(%rdi)
; AVX2-NEXT: vmovdqu %ymm0, 96(%rdi)
; AVX2-NEXT: vmovdqu %ymm0, 64(%rdi)
; AVX2-NEXT: vmovdqu %ymm0, 32(%rdi)
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: memset_256_nonconst_bytes:
; AVX512F: # %bb.0:
; AVX512F-NEXT: movzbl %sil, %eax
; AVX512F-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101
; AVX512F-NEXT: vpbroadcastd %eax, %zmm0
; AVX512F-NEXT: vmovdqu64 %zmm0, 192(%rdi)
; AVX512F-NEXT: vmovdqu64 %zmm0, 128(%rdi)
; AVX512F-NEXT: vmovdqu64 %zmm0, 64(%rdi)
; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: memset_256_nonconst_bytes:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastb %esi, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, 192(%rdi)
; AVX512BW-NEXT: vmovdqu64 %zmm0, 128(%rdi)
; AVX512BW-NEXT: vmovdqu64 %zmm0, 64(%rdi)
; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 256, i1 false)
  ret void
}

declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) #1