; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mattr=+sse4.1 -show-mc-encoding | FileCheck %s --check-prefixes=SSE,X86-SSE
; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mattr=+avx -show-mc-encoding | FileCheck %s --check-prefixes=AVX,AVX1,X86-AVX1
; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl -show-mc-encoding | FileCheck %s --check-prefixes=AVX,AVX512,X86-AVX512
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=+sse4.1 -show-mc-encoding | FileCheck %s --check-prefixes=SSE,X64-SSE
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=+avx -show-mc-encoding | FileCheck %s --check-prefixes=AVX,X64-AVX,AVX1,X64-AVX1
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl -show-mc-encoding | FileCheck %s --check-prefixes=AVX,X64-AVX,AVX512,X64-AVX512

; External 16-bit global loaded by pmovzxbq_1.
@g16 = external global i16

; Insert a scalar i32 into lane 1 of a <4 x i32>. On i386 the scalar is read
; straight from the stack by (v)pinsrd, on x86-64 it comes from %edi.
define <4 x i32> @pinsrd_1(i32 %s, <4 x i32> %tmp) nounwind {
; X86-SSE-LABEL: pinsrd_1:
; X86-SSE:       ## %bb.0:
; X86-SSE-NEXT:    pinsrd $1, {{[0-9]+}}(%esp), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x22,0x44,0x24,0x04,0x01]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: pinsrd_1:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0x44,0x24,0x04,0x01]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: pinsrd_1:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0x44,0x24,0x04,0x01]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: pinsrd_1:
; X64-SSE:       ## %bb.0:
; X64-SSE-NEXT:    pinsrd $1, %edi, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x22,0xc7,0x01]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: pinsrd_1:
; X64-AVX1:       ## %bb.0:
; X64-AVX1-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x01]
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: pinsrd_1:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x01]
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
  %tmp1 = insertelement <4 x i32> %tmp, i32 %s, i32 1
  ret <4 x i32> %tmp1
}

; Insert a scalar i8 into lane 1 of a <16 x i8>; byte-sized counterpart of
; pinsrd_1, selected as (v)pinsrb.
define <16 x i8> @pinsrb_1(i8 %s, <16 x i8> %tmp) nounwind {
; X86-SSE-LABEL: pinsrb_1:
; X86-SSE:       ## %bb.0:
; X86-SSE-NEXT:    pinsrb $1, {{[0-9]+}}(%esp), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x20,0x44,0x24,0x04,0x01]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: pinsrb_1:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    vpinsrb $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0x44,0x24,0x04,0x01]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: pinsrb_1:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    vpinsrb $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0x44,0x24,0x04,0x01]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: pinsrb_1:
; X64-SSE:       ## %bb.0:
; X64-SSE-NEXT:    pinsrb $1, %edi, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x20,0xc7,0x01]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: pinsrb_1:
; X64-AVX1:       ## %bb.0:
; X64-AVX1-NEXT:    vpinsrb $1, %edi, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc7,0x01]
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: pinsrb_1:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vpinsrb $1, %edi, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc7,0x01]
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
  %tmp1 = insertelement <16 x i8> %tmp, i8 %s, i32 1
  ret <16 x i8> %tmp1
}

; Load the i16 global @g16, place it in lane 0 of an otherwise undef vector and
; feed it to the pmovzxbq intrinsic. The checks show the load folded into the
; memory operand of (v)pmovzxbq.
define <2 x i64> @pmovzxbq_1() nounwind {
; X86-SSE-LABEL: pmovzxbq_1:
; X86-SSE:       ## %bb.0: ## %entry
; X86-SSE-NEXT:    movl L_g16$non_lazy_ptr, %eax ## encoding: [0xa1,A,A,A,A]
; X86-SSE-NEXT:    ## fixup A - offset: 1, value: L_g16$non_lazy_ptr, kind: FK_Data_4
; X86-SSE-NEXT:    pmovzxbq (%eax), %xmm0 ## encoding: [0x66,0x0f,0x38,0x32,0x00]
; X86-SSE-NEXT:    ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: pmovzxbq_1:
; X86-AVX1:       ## %bb.0: ## %entry
; X86-AVX1-NEXT:    movl L_g16$non_lazy_ptr, %eax ## encoding: [0xa1,A,A,A,A]
; X86-AVX1-NEXT:    ## fixup A - offset: 1, value: L_g16$non_lazy_ptr, kind: FK_Data_4
; X86-AVX1-NEXT:    vpmovzxbq (%eax), %xmm0 ## encoding: [0xc4,0xe2,0x79,0x32,0x00]
; X86-AVX1-NEXT:    ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: pmovzxbq_1:
; X86-AVX512:       ## %bb.0: ## %entry
; X86-AVX512-NEXT:    movl L_g16$non_lazy_ptr, %eax ## encoding: [0xa1,A,A,A,A]
; X86-AVX512-NEXT:    ## fixup A - offset: 1, value: L_g16$non_lazy_ptr, kind: FK_Data_4
; X86-AVX512-NEXT:    vpmovzxbq (%eax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0x00]
; X86-AVX512-NEXT:    ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: pmovzxbq_1:
; X64-SSE:       ## %bb.0: ## %entry
; X64-SSE-NEXT:    movq _g16@{{.*}}(%rip), %rax ## encoding: [0x48,0x8b,0x05,A,A,A,A]
; X64-SSE-NEXT:    ## fixup A - offset: 3, value: _g16@GOTPCREL-4, kind: reloc_riprel_4byte_movq_load
; X64-SSE-NEXT:    pmovzxbq (%rax), %xmm0 ## encoding: [0x66,0x0f,0x38,0x32,0x00]
; X64-SSE-NEXT:    ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: pmovzxbq_1:
; X64-AVX1:       ## %bb.0: ## %entry
; X64-AVX1-NEXT:    movq _g16@{{.*}}(%rip), %rax ## encoding: [0x48,0x8b,0x05,A,A,A,A]
; X64-AVX1-NEXT:    ## fixup A - offset: 3, value: _g16@GOTPCREL-4, kind: reloc_riprel_4byte_movq_load
; X64-AVX1-NEXT:    vpmovzxbq (%rax), %xmm0 ## encoding: [0xc4,0xe2,0x79,0x32,0x00]
; X64-AVX1-NEXT:    ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: pmovzxbq_1:
; X64-AVX512:       ## %bb.0: ## %entry
; X64-AVX512-NEXT:    movq _g16@{{.*}}(%rip), %rax ## encoding: [0x48,0x8b,0x05,A,A,A,A]
; X64-AVX512-NEXT:    ## fixup A - offset: 3, value: _g16@GOTPCREL-4, kind: reloc_riprel_4byte_movq_load
; X64-AVX512-NEXT:    vpmovzxbq (%rax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0x00]
; X64-AVX512-NEXT:    ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = load i16, i16* @g16, align 2 ; <i16> [#uses=1]
  %1 = insertelement <8 x i16> undef, i16 %0, i32 0 ; <<8 x i16>> [#uses=1]
  %2 = bitcast <8 x i16> %1 to <16 x i8> ; <<16 x i8>> [#uses=1]
  %3 = tail call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %2) nounwind readnone ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %3
}

declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone

; Extract lane 3 of a <4 x float> and bitcast the scalar to i32; selected as
; (v)extractps into a GPR.
define i32 @extractps_1(<4 x float> %v) nounwind {
; SSE-LABEL: extractps_1:
; SSE:       ## %bb.0:
; SSE-NEXT:    extractps $3, %xmm0, %eax ## encoding: [0x66,0x0f,0x3a,0x17,0xc0,0x03]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: extractps_1:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vextractps $3, %xmm0, %eax ## encoding: [0xc4,0xe3,0x79,0x17,0xc0,0x03]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: extractps_1:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vextractps $3, %xmm0, %eax ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x17,0xc0,0x03]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %s = extractelement <4 x float> %v, i32 3
  %i = bitcast float %s to i32
  ret i32 %i
}

; Same result as extractps_1, but the bitcast is applied to the whole vector
; before the extract; the expected code is identical.
define i32 @extractps_2(<4 x float> %v) nounwind {
; SSE-LABEL: extractps_2:
; SSE:       ## %bb.0:
; SSE-NEXT:    extractps $3, %xmm0, %eax ## encoding: [0x66,0x0f,0x3a,0x17,0xc0,0x03]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: extractps_2:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vextractps $3, %xmm0, %eax ## encoding: [0xc4,0xe3,0x79,0x17,0xc0,0x03]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: extractps_2:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vextractps $3, %xmm0, %eax ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x17,0xc0,0x03]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %t = bitcast <4 x float> %v to <4 x i32>
  %s = extractelement <4 x i32> %t, i32 3
  ret i32 %s
}


; The non-store form of extractps puts its result into a GPR.
; This makes it suitable for an extract from a <4 x float> that
; is bitcast to i32, but unsuitable for much of anything else.
; ext_1: lane 3 of a <4 x float> is consumed as a float (by an fadd), so the
; value has to stay in an XMM register. The checks show a shuffle
; (shufps / vpermilps) rather than the GPR form of extractps.
define float @ext_1(<4 x float> %v) nounwind {
; X86-SSE-LABEL: ext_1:
; X86-SSE:       ## %bb.0:
; X86-SSE-NEXT:    pushl %eax ## encoding: [0x50]
; X86-SSE-NEXT:    shufps $255, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xff]
; X86-SSE-NEXT:    ## xmm0 = xmm0[3,3,3,3]
; X86-SSE-NEXT:    addss LCPI5_0, %xmm0 ## encoding: [0xf3,0x0f,0x58,0x05,A,A,A,A]
; X86-SSE-NEXT:    ## fixup A - offset: 4, value: LCPI5_0, kind: FK_Data_4
; X86-SSE-NEXT:    movss %xmm0, (%esp) ## encoding: [0xf3,0x0f,0x11,0x04,0x24]
; X86-SSE-NEXT:    flds (%esp) ## encoding: [0xd9,0x04,0x24]
; X86-SSE-NEXT:    popl %eax ## encoding: [0x58]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: ext_1:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    pushl %eax ## encoding: [0x50]
; X86-AVX1-NEXT:    vpermilps $255, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff]
; X86-AVX1-NEXT:    ## xmm0 = xmm0[3,3,3,3]
; X86-AVX1-NEXT:    vaddss LCPI5_0, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x58,0x05,A,A,A,A]
; X86-AVX1-NEXT:    ## fixup A - offset: 4, value: LCPI5_0, kind: FK_Data_4
; X86-AVX1-NEXT:    vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24]
; X86-AVX1-NEXT:    flds (%esp) ## encoding: [0xd9,0x04,0x24]
; X86-AVX1-NEXT:    popl %eax ## encoding: [0x58]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: ext_1:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    pushl %eax ## encoding: [0x50]
; X86-AVX512-NEXT:    vpermilps $255, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff]
; X86-AVX512-NEXT:    ## xmm0 = xmm0[3,3,3,3]
; X86-AVX512-NEXT:    vaddss LCPI5_0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x58,0x05,A,A,A,A]
; X86-AVX512-NEXT:    ## fixup A - offset: 4, value: LCPI5_0, kind: FK_Data_4
; X86-AVX512-NEXT:    vmovss %xmm0, (%esp) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x04,0x24]
; X86-AVX512-NEXT:    flds (%esp) ## encoding: [0xd9,0x04,0x24]
; X86-AVX512-NEXT:    popl %eax ## encoding: [0x58]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: ext_1:
; X64-SSE:       ## %bb.0:
; X64-SSE-NEXT:    shufps $255, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xff]
; X64-SSE-NEXT:    ## xmm0 = xmm0[3,3,3,3]
; X64-SSE-NEXT:    addss {{.*}}(%rip), %xmm0 ## encoding: [0xf3,0x0f,0x58,0x05,A,A,A,A]
; X64-SSE-NEXT:    ## fixup A - offset: 4, value: LCPI5_0-4, kind: reloc_riprel_4byte
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: ext_1:
; X64-AVX1:       ## %bb.0:
; X64-AVX1-NEXT:    vpermilps $255, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff]
; X64-AVX1-NEXT:    ## xmm0 = xmm0[3,3,3,3]
; X64-AVX1-NEXT:    vaddss {{.*}}(%rip), %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x58,0x05,A,A,A,A]
; X64-AVX1-NEXT:    ## fixup A - offset: 4, value: LCPI5_0-4, kind: reloc_riprel_4byte
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: ext_1:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vpermilps $255, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff]
; X64-AVX512-NEXT:    ## xmm0 = xmm0[3,3,3,3]
; X64-AVX512-NEXT:    vaddss {{.*}}(%rip), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x58,0x05,A,A,A,A]
; X64-AVX512-NEXT:    ## fixup A - offset: 4, value: LCPI5_0-4, kind: reloc_riprel_4byte
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
  %s = extractelement <4 x float> %v, i32 3
  %t = fadd float %s, 1.0
  ret float %t
}

; ext_2: lane 3 of a <4 x float> is returned directly as a float. As in ext_1
; the checks show a shuffle; on i386 the result is additionally moved through
; the stack into the x87 return register (flds).
define float @ext_2(<4 x float> %v) nounwind {
; X86-SSE-LABEL: ext_2:
; X86-SSE:       ## %bb.0:
; X86-SSE-NEXT:    pushl %eax ## encoding: [0x50]
; X86-SSE-NEXT:    shufps $255, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xff]
; X86-SSE-NEXT:    ## xmm0 = xmm0[3,3,3,3]
; X86-SSE-NEXT:    movss %xmm0, (%esp) ## encoding: [0xf3,0x0f,0x11,0x04,0x24]
; X86-SSE-NEXT:    flds (%esp) ## encoding: [0xd9,0x04,0x24]
; X86-SSE-NEXT:    popl %eax ## encoding: [0x58]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: ext_2:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    pushl %eax ## encoding: [0x50]
; X86-AVX1-NEXT:    vpermilps $255, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff]
; X86-AVX1-NEXT:    ## xmm0 = xmm0[3,3,3,3]
; X86-AVX1-NEXT:    vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24]
; X86-AVX1-NEXT:    flds (%esp) ## encoding: [0xd9,0x04,0x24]
; X86-AVX1-NEXT:    popl %eax ## encoding: [0x58]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: ext_2:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    pushl %eax ## encoding: [0x50]
; X86-AVX512-NEXT:    vpermilps $255, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff]
; X86-AVX512-NEXT:    ## xmm0 = xmm0[3,3,3,3]
; X86-AVX512-NEXT:    vmovss %xmm0, (%esp) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x04,0x24]
; X86-AVX512-NEXT:    flds (%esp) ## encoding: [0xd9,0x04,0x24]
; X86-AVX512-NEXT:    popl %eax ## encoding: [0x58]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: ext_2:
; X64-SSE:       ## %bb.0:
; X64-SSE-NEXT:    shufps $255, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xff]
; X64-SSE-NEXT:    ## xmm0 = xmm0[3,3,3,3]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: ext_2:
; X64-AVX1:       ## %bb.0:
; X64-AVX1-NEXT:    vpermilps $255, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff]
; X64-AVX1-NEXT:    ## xmm0 = xmm0[3,3,3,3]
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: ext_2:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vpermilps $255, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff]
; X64-AVX512-NEXT:    ## xmm0 = xmm0[3,3,3,3]
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
  %s = extractelement <4 x float> %v, i32 3
  ret float %s
}

; ext_3: lane 3 of a <4 x i32> is returned as an i32; the checks show the GPR
; form of (v)extractps being used for the integer vector as well.
define i32 @ext_3(<4 x i32> %v) nounwind {
; SSE-LABEL: ext_3:
; SSE:       ## %bb.0:
; SSE-NEXT:    extractps $3, %xmm0, %eax ## encoding: [0x66,0x0f,0x3a,0x17,0xc0,0x03]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: ext_3:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vextractps $3, %xmm0, %eax ## encoding: [0xc4,0xe3,0x79,0x17,0xc0,0x03]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: ext_3:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vextractps $3, %xmm0, %eax ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x17,0xc0,0x03]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %i = extractelement <4 x i32> %v, i32 3
  ret i32 %i
}

; Direct use of the insertps intrinsic with immediate 21; the immediate is
; expected to reach the instruction unchanged.
define <4 x float> @insertps_1(<4 x float> %t1, <4 x float> %t2) nounwind {
; SSE-LABEL: insertps_1:
; SSE:       ## %bb.0:
; SSE-NEXT:    insertps $21, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x15]
; SSE-NEXT:    ## xmm0 = zero,xmm1[0],zero,xmm0[3]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: insertps_1:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vinsertps $21, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x15]
; AVX1-NEXT:    ## xmm0 = zero,xmm1[0],zero,xmm0[3]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: insertps_1:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vinsertps $21, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x15]
; AVX512-NEXT:    ## xmm0 = zero,xmm1[0],zero,xmm0[3]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %tmp1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %t1, <4 x float> %t2, i32 21) nounwind readnone
  ret <4 x float> %tmp1
}

declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone

; When optimizing for speed, prefer blendps over insertps even if it means we have to
; generate a separate movss to load the scalar operand.
define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind {
; X86-SSE-LABEL: blendps_not_insertps_1:
; X86-SSE:       ## %bb.0:
; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x04]
; X86-SSE-NEXT:    ## xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    blendps $1, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x01]
; X86-SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: blendps_not_insertps_1:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
; X86-AVX1-NEXT:    ## xmm1 = mem[0],zero,zero,zero
; X86-AVX1-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
; X86-AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: blendps_not_insertps_1:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
; X86-AVX512-NEXT:    ## xmm1 = mem[0],zero,zero,zero
; X86-AVX512-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
; X86-AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: blendps_not_insertps_1:
; X64-SSE:       ## %bb.0:
; X64-SSE-NEXT:    blendps $1, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x01]
; X64-SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX-LABEL: blendps_not_insertps_1:
; X64-AVX:       ## %bb.0:
; X64-AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
; X64-AVX-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
; X64-AVX-NEXT:    retq ## encoding: [0xc3]
  %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
  ret <4 x float> %tmp1
}

; When optimizing for size (minsize), the scalar is merged into lane 0 with a
; register-to-register movss, whose encoding (4 bytes) is shorter than the
; blendps checked in blendps_not_insertps_1 (6 bytes).
; The i386 and x86-64 ABIs differ for the float operand: on i386 it is passed on
; the stack and is first loaded with a separate movss, on x86-64 it is already
; in xmm1.
; NOTE(review): the test name and the earlier wording of this comment expected
; an insertps with a folded load on i386 (and none on x86-64); the autogenerated
; checks below no longer show an insertps on either target.
define <4 x float> @insertps_or_blendps(<4 x float> %t1, float %t2) minsize nounwind {
; X86-SSE-LABEL: insertps_or_blendps:
; X86-SSE:       ## %bb.0:
; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x04]
; X86-SSE-NEXT:    ## xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    movss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x10,0xc1]
; X86-SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: insertps_or_blendps:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
; X86-AVX1-NEXT:    ## xmm1 = mem[0],zero,zero,zero
; X86-AVX1-NEXT:    vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1]
; X86-AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: insertps_or_blendps:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
; X86-AVX512-NEXT:    ## xmm1 = mem[0],zero,zero,zero
; X86-AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0xc1]
; X86-AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: insertps_or_blendps:
; X64-SSE:       ## %bb.0:
; X64-SSE-NEXT:    movss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x10,0xc1]
; X64-SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: insertps_or_blendps:
; X64-AVX1:       ## %bb.0:
; X64-AVX1-NEXT:    vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1]
; X64-AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: insertps_or_blendps:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0xc1]
; X64-AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
  %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
  ret <4 x float> %tmp1
}

; An insert into the low 32-bits of a vector from the low 32-bits of another vector
; is always just a blendps because blendps is never more expensive than insertps.
define <4 x float> @blendps_not_insertps_2(<4 x float> %t1, <4 x float> %t2) nounwind {
; SSE-LABEL: blendps_not_insertps_2:
; SSE:       ## %bb.0:
; SSE-NEXT:    blendps $1, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x01]
; SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX-LABEL: blendps_not_insertps_2:
; AVX:       ## %bb.0:
; AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
; AVX-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %tmp2 = extractelement <4 x float> %t2, i32 0
  %tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0
  ret <4 x float> %tmp1
}

; ptestz intrinsic: the i32 result is materialized from the flags with sete.
define i32 @ptestz_1(<2 x i64> %t1, <2 x i64> %t2) nounwind {
; SSE-LABEL: ptestz_1:
; SSE:       ## %bb.0:
; SSE-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
; SSE-NEXT:    ptest %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x17,0xc1]
; SSE-NEXT:    sete %al ## encoding: [0x0f,0x94,0xc0]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX-LABEL: ptestz_1:
; AVX:       ## %bb.0:
; AVX-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
; AVX-NEXT:    vptest %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x17,0xc1]
; AVX-NEXT:    sete %al ## encoding: [0x0f,0x94,0xc0]
; AVX-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %tmp1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
  ret i32 %tmp1
}

; Despite its name, this test exercises the ptestc intrinsic; the result is
; materialized with setb.
define i32 @ptestz_2(<2 x i64> %t1, <2 x i64> %t2) nounwind {
; SSE-LABEL: ptestz_2:
; SSE:       ## %bb.0:
; SSE-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
; SSE-NEXT:    ptest %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x17,0xc1]
; SSE-NEXT:    setb %al ## encoding: [0x0f,0x92,0xc0]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX-LABEL: ptestz_2:
; AVX:       ## %bb.0:
; AVX-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
; AVX-NEXT:    vptest %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x17,0xc1]
; AVX-NEXT:    setb %al ## encoding: [0x0f,0x92,0xc0]
; AVX-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %tmp1 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
  ret i32 %tmp1
}

; Despite its name, this test exercises the ptestnzc intrinsic; the result is
; materialized with seta.
define i32 @ptestz_3(<2 x i64> %t1, <2 x i64> %t2) nounwind {
; SSE-LABEL: ptestz_3:
; SSE:       ## %bb.0:
; SSE-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
; SSE-NEXT:    ptest %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x17,0xc1]
; SSE-NEXT:    seta %al ## encoding: [0x0f,0x97,0xc0]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX-LABEL: ptestz_3:
; AVX:       ## %bb.0:
; AVX-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
; AVX-NEXT:    vptest %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x17,0xc1]
; AVX-NEXT:    seta %al ## encoding: [0x0f,0x97,0xc0]
; AVX-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %tmp1 = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
  ret i32 %tmp1
}

declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone

; This used to compile to insertps $0 + insertps $16. insertps $0 is always
; pointless.
define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind {
; SSE-LABEL: buildvector:
; SSE:       ## %bb.0: ## %entry
; SSE-NEXT:    movshdup %xmm0, %xmm2 ## encoding: [0xf3,0x0f,0x16,0xd0]
; SSE-NEXT:    ## xmm2 = xmm0[1,1,3,3]
; SSE-NEXT:    movshdup %xmm1, %xmm3 ## encoding: [0xf3,0x0f,0x16,0xd9]
; SSE-NEXT:    ## xmm3 = xmm1[1,1,3,3]
; SSE-NEXT:    addss %xmm2, %xmm3 ## encoding: [0xf3,0x0f,0x58,0xda]
; SSE-NEXT:    addss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x58,0xc1]
; SSE-NEXT:    insertps $16, %xmm3, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc3,0x10]
; SSE-NEXT:    ## xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: buildvector:
; AVX1:       ## %bb.0: ## %entry
; AVX1-NEXT:    vmovshdup %xmm0, %xmm2 ## encoding: [0xc5,0xfa,0x16,0xd0]
; AVX1-NEXT:    ## xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT:    vmovshdup %xmm1, %xmm3 ## encoding: [0xc5,0xfa,0x16,0xd9]
; AVX1-NEXT:    ## xmm3 = xmm1[1,1,3,3]
; AVX1-NEXT:    vaddss %xmm3, %xmm2, %xmm2 ## encoding: [0xc5,0xea,0x58,0xd3]
; AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x58,0xc1]
; AVX1-NEXT:    vinsertps $16, %xmm2, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc2,0x10]
; AVX1-NEXT:    ## xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: buildvector:
; AVX512:       ## %bb.0: ## %entry
; AVX512-NEXT:    vmovshdup %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x16,0xd0]
; AVX512-NEXT:    ## xmm2 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmovshdup %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x16,0xd9]
; AVX512-NEXT:    ## xmm3 = xmm1[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xea,0x58,0xd3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x58,0xc1]
; AVX512-NEXT:    vinsertps $16, %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc2,0x10]
; AVX512-NEXT:    ## xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
entry:
  %tmp7 = extractelement <2 x float> %A, i32 0
  %tmp5 = extractelement <2 x float> %A, i32 1
  %tmp3 = extractelement <2 x float> %B, i32 0
  %tmp1 = extractelement <2 x float> %B, i32 1
  %add.r = fadd float %tmp7, %tmp3
  %add.i = fadd float %tmp5, %tmp1
  %tmp11 = insertelement <2 x float> undef, float %add.r, i32 0
  %tmp9 = insertelement <2 x float> %tmp11, float %add.i, i32 1
  ret <2 x float> %tmp9
}

; Shuffle mask <0,1,2,4> with the second operand loaded from memory: element 0
; of the loaded vector goes into lane 3 of %a. The checks show a full vector
; load followed by a register-form (v)insertps $48.
define <4 x float> @insertps_from_shufflevector_1(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
; X86-SSE-LABEL: insertps_from_shufflevector_1:
; X86-SSE:       ## %bb.0: ## %entry
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    movaps (%eax), %xmm1 ## encoding: [0x0f,0x28,0x08]
; X86-SSE-NEXT:    insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30]
; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: insertps_from_shufflevector_1:
; X86-AVX1:       ## %bb.0: ## %entry
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vmovaps (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x08]
; X86-AVX1-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: insertps_from_shufflevector_1:
; X86-AVX512:       ## %bb.0: ## %entry
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vmovaps (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x08]
; X86-AVX512-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: insertps_from_shufflevector_1:
; X64-SSE:       ## %bb.0: ## %entry
; X64-SSE-NEXT:    movaps (%rdi), %xmm1 ## encoding: [0x0f,0x28,0x0f]
; X64-SSE-NEXT:    insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30]
; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: insertps_from_shufflevector_1:
; X64-AVX1:       ## %bb.0: ## %entry
; X64-AVX1-NEXT:    vmovaps (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0f]
; X64-AVX1-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: insertps_from_shufflevector_1:
; X64-AVX512:       ## %bb.0: ## %entry
; X64-AVX512-NEXT:    vmovaps (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0f]
; X64-AVX512-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = load <4 x float>, <4 x float>* %pb, align 16
  %vecinit6 = shufflevector <4 x float> %a, <4 x float> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %vecinit6
}

; Shuffle mask <0,1,5,3> on two register operands: element 1 of %b goes into
; lane 2 of %a, selected as a single (v)insertps $96.
define <4 x float> @insertps_from_shufflevector_2(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insertps_from_shufflevector_2:
; SSE:       ## %bb.0: ## %entry
; SSE-NEXT:    insertps $96, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x60]
; SSE-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: insertps_from_shufflevector_2:
; AVX1:       ## %bb.0: ## %entry
; AVX1-NEXT:    vinsertps $96, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60]
; AVX1-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: insertps_from_shufflevector_2:
; AVX512:       ## %bb.0: ## %entry
; AVX512-NEXT:    vinsertps $96, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60]
; AVX512-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
entry:
  %vecinit6 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
  ret <4 x float> %vecinit6
}

; Integer variant of insertps_from_shufflevector_1: element 0 of a loaded
; <4 x i32> is placed in lane 3 of %a.
; NOTE(review): the test name (and the earlier wording of this comment) expect a
; pinsrd instead of an insertps here; the autogenerated checks below show
; neither. They show a splat of the loaded element (pshufd / vpermilps /
; vbroadcastss) followed by a blend (pblendw / vblendps).
define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, <4 x i32>* nocapture readonly %pb) {
; X86-SSE-LABEL: pinsrd_from_shufflevector_i32:
; X86-SSE:       ## %bb.0: ## %entry
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    pshufd $0, (%eax), %xmm1 ## encoding: [0x66,0x0f,0x70,0x08,0x00]
; X86-SSE-NEXT:    ## xmm1 = mem[0,0,0,0]
; X86-SSE-NEXT:    pblendw $192, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0xc0]
; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: pinsrd_from_shufflevector_i32:
; X86-AVX1:       ## %bb.0: ## %entry
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vpermilps $0, (%eax), %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0x08,0x00]
; X86-AVX1-NEXT:    ## xmm1 = mem[0,0,0,0]
; X86-AVX1-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: pinsrd_from_shufflevector_i32:
; X86-AVX512:       ## %bb.0: ## %entry
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vbroadcastss (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x08]
; X86-AVX512-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: pinsrd_from_shufflevector_i32:
; X64-SSE:       ## %bb.0: ## %entry
; X64-SSE-NEXT:    pshufd $0, (%rdi), %xmm1 ## encoding: [0x66,0x0f,0x70,0x0f,0x00]
; X64-SSE-NEXT:    ## xmm1 = mem[0,0,0,0]
; X64-SSE-NEXT:    pblendw $192, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0xc0]
; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: pinsrd_from_shufflevector_i32:
; X64-AVX1:       ## %bb.0: ## %entry
; X64-AVX1-NEXT:    vpermilps $0, (%rdi), %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0x0f,0x00]
; X64-AVX1-NEXT:    ## xmm1 = mem[0,0,0,0]
; X64-AVX1-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: pinsrd_from_shufflevector_i32:
; X64-AVX512:       ## %bb.0: ## %entry
; X64-AVX512-NEXT:    vbroadcastss (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x0f]
; X64-AVX512-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = load <4 x i32>, <4 x i32>* %pb, align 16
  %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %vecinit6
}

; Shuffle mask <0,7,2,3> on two <4 x i32> registers: element 3 of %b goes into
; lane 1 of %a. The checks show a shuffle of %b followed by a blend rather than
; an insertps.
define <4 x i32> @insertps_from_shufflevector_i32_2(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: insertps_from_shufflevector_i32_2:
; SSE:       ## %bb.0: ## %entry
; SSE-NEXT:    pshufd $238, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0xee]
; SSE-NEXT:    ## xmm1 = xmm1[2,3,2,3]
; SSE-NEXT:    pblendw $12, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0x0c]
; SSE-NEXT:    ## xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: insertps_from_shufflevector_i32_2:
; AVX1:       ## %bb.0: ## %entry
; AVX1-NEXT:    vpermilps $238, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee]
; AVX1-NEXT:    ## xmm1 = xmm1[2,3,2,3]
; AVX1-NEXT:    vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02]
; AVX1-NEXT:    ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: insertps_from_shufflevector_i32_2:
; AVX512:       ## %bb.0: ## %entry
; AVX512-NEXT:    vpermilps $238, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee]
; AVX512-NEXT:    ## xmm1 = xmm1[2,3,2,3]
; AVX512-NEXT:    vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02]
; AVX512-NEXT:    ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
entry:
  %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
  ret <4 x i32> %vecinit6
}

define <4 x float> @insertps_from_load_ins_elt_undef(<4 x float> %a, float* %b) {
; X86-SSE-LABEL: insertps_from_load_ins_elt_undef:
; X86-SSE:       ## %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    insertps $16, (%eax), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x00,0x10]
; X86-SSE-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: insertps_from_load_ins_elt_undef:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vinsertps $16, (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x00,0x10]
; X86-AVX1-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: insertps_from_load_ins_elt_undef:
; X86-AVX512:
## %bb.0: 736; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 737; X86-AVX512-NEXT: vinsertps $16, (%eax), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x00,0x10] 738; X86-AVX512-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[2,3] 739; X86-AVX512-NEXT: retl ## encoding: [0xc3] 740; 741; X64-SSE-LABEL: insertps_from_load_ins_elt_undef: 742; X64-SSE: ## %bb.0: 743; X64-SSE-NEXT: insertps $16, (%rdi), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x07,0x10] 744; X64-SSE-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[2,3] 745; X64-SSE-NEXT: retq ## encoding: [0xc3] 746; 747; X64-AVX1-LABEL: insertps_from_load_ins_elt_undef: 748; X64-AVX1: ## %bb.0: 749; X64-AVX1-NEXT: vinsertps $16, (%rdi), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x07,0x10] 750; X64-AVX1-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[2,3] 751; X64-AVX1-NEXT: retq ## encoding: [0xc3] 752; 753; X64-AVX512-LABEL: insertps_from_load_ins_elt_undef: 754; X64-AVX512: ## %bb.0: 755; X64-AVX512-NEXT: vinsertps $16, (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x07,0x10] 756; X64-AVX512-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[2,3] 757; X64-AVX512-NEXT: retq ## encoding: [0xc3] 758 %1 = load float, float* %b, align 4 759 %2 = insertelement <4 x float> undef, float %1, i32 0 760 %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 3> 761 ret <4 x float> %result 762} 763 764; TODO: As in pinsrd_from_shufflevector_i32, remove this mov instruction. NOTE(review): the only mov in the assertions below is the i386 movl that reads the %b pointer argument from the stack, and the x86_64 assertions contain no mov -- confirm what this TODO still refers to. 765define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) { 766; X86-SSE-LABEL: insertps_from_load_ins_elt_undef_i32: 767; X86-SSE: ## %bb.0: 768; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 769; X86-SSE-NEXT: pinsrd $2, (%eax), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x22,0x00,0x02] 770; X86-SSE-NEXT: retl ## encoding: [0xc3] 771; 772; X86-AVX1-LABEL: insertps_from_load_ins_elt_undef_i32: 773; X86-AVX1: ## %bb.0: 774; X86-AVX1-NEXT:
movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 775; X86-AVX1-NEXT: vpinsrd $2, (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0x00,0x02] 776; X86-AVX1-NEXT: retl ## encoding: [0xc3] 777; 778; X86-AVX512-LABEL: insertps_from_load_ins_elt_undef_i32: 779; X86-AVX512: ## %bb.0: 780; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 781; X86-AVX512-NEXT: vpinsrd $2, (%eax), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0x00,0x02] 782; X86-AVX512-NEXT: retl ## encoding: [0xc3] 783; 784; X64-SSE-LABEL: insertps_from_load_ins_elt_undef_i32: 785; X64-SSE: ## %bb.0: 786; X64-SSE-NEXT: pinsrd $2, (%rdi), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x22,0x07,0x02] 787; X64-SSE-NEXT: retq ## encoding: [0xc3] 788; 789; X64-AVX1-LABEL: insertps_from_load_ins_elt_undef_i32: 790; X64-AVX1: ## %bb.0: 791; X64-AVX1-NEXT: vpinsrd $2, (%rdi), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0x07,0x02] 792; X64-AVX1-NEXT: retq ## encoding: [0xc3] 793; 794; X64-AVX512-LABEL: insertps_from_load_ins_elt_undef_i32: 795; X64-AVX512: ## %bb.0: 796; X64-AVX512-NEXT: vpinsrd $2, (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0x07,0x02] 797; X64-AVX512-NEXT: retq ## encoding: [0xc3] 798 %1 = load i32, i32* %b, align 4 799 %2 = insertelement <4 x i32> undef, i32 %1, i32 0 800 %result = shufflevector <4 x i32> %a, <4 x i32> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 3> 801 ret <4 x i32> %result 802} 803 804;;;;;; Shuffles optimizable with a single insertps or blend instruction 805define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) { 806; SSE-LABEL: shuf_XYZ0: 807; SSE: ## %bb.0: 808; SSE-NEXT: xorps %xmm1, %xmm1 ## encoding: [0x0f,0x57,0xc9] 809; SSE-NEXT: blendps $8, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x08] 810; SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] 811; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 812; 813; AVX1-LABEL: shuf_XYZ0: 814; AVX1: ## %bb.0: 815; AVX1-NEXT: vxorps 
%xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] 816; AVX1-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] 817; AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] 818; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 819; 820; AVX512-LABEL: shuf_XYZ0: 821; AVX512: ## %bb.0: 822; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] 823; AVX512-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] 824; AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] 825; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 826 %vecext = extractelement <4 x float> %x, i32 0 827 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 828 %vecext1 = extractelement <4 x float> %x, i32 1 829 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1 830 %vecext3 = extractelement <4 x float> %x, i32 2 831 %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2 832 %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3 833 ret <4 x float> %vecinit5 834} 835 836define <4 x float> @shuf_XY00(<4 x float> %x, <4 x float> %a) { 837; SSE-LABEL: shuf_XY00: 838; SSE: ## %bb.0: 839; SSE-NEXT: movq %xmm0, %xmm0 ## encoding: [0xf3,0x0f,0x7e,0xc0] 840; SSE-NEXT: ## xmm0 = xmm0[0],zero 841; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 842; 843; AVX1-LABEL: shuf_XY00: 844; AVX1: ## %bb.0: 845; AVX1-NEXT: vmovq %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x7e,0xc0] 846; AVX1-NEXT: ## xmm0 = xmm0[0],zero 847; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 848; 849; AVX512-LABEL: shuf_XY00: 850; AVX512: ## %bb.0: 851; AVX512-NEXT: vmovq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0xc0] 852; AVX512-NEXT: ## xmm0 = xmm0[0],zero 853; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 854 %vecext = extractelement <4 x float> %x, i32 0 855 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 856 %vecext1 = extractelement <4 x float> %x, i32 1 857 %vecinit2 = 
insertelement <4 x float> %vecinit, float %vecext1, i32 1 858 %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2 859 %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3 860 ret <4 x float> %vecinit4 861} 862 863define <4 x float> @shuf_XYY0(<4 x float> %x, <4 x float> %a) { 864; SSE-LABEL: shuf_XYY0: 865; SSE: ## %bb.0: 866; SSE-NEXT: insertps $104, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc0,0x68] 867; SSE-NEXT: ## xmm0 = xmm0[0,1,1],zero 868; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 869; 870; AVX1-LABEL: shuf_XYY0: 871; AVX1: ## %bb.0: 872; AVX1-NEXT: vinsertps $104, %xmm0, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc0,0x68] 873; AVX1-NEXT: ## xmm0 = xmm0[0,1,1],zero 874; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 875; 876; AVX512-LABEL: shuf_XYY0: 877; AVX512: ## %bb.0: 878; AVX512-NEXT: vinsertps $104, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc0,0x68] 879; AVX512-NEXT: ## xmm0 = xmm0[0,1,1],zero 880; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 881 %vecext = extractelement <4 x float> %x, i32 0 882 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 883 %vecext1 = extractelement <4 x float> %x, i32 1 884 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1 885 %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext1, i32 2 886 %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3 887 ret <4 x float> %vecinit5 888} 889 890define <4 x float> @shuf_XYW0(<4 x float> %x, <4 x float> %a) { 891; SSE-LABEL: shuf_XYW0: 892; SSE: ## %bb.0: 893; SSE-NEXT: insertps $232, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc0,0xe8] 894; SSE-NEXT: ## xmm0 = xmm0[0,1,3],zero 895; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 896; 897; AVX1-LABEL: shuf_XYW0: 898; AVX1: ## %bb.0: 899; AVX1-NEXT: vinsertps $232, %xmm0, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc0,0xe8] 900; AVX1-NEXT: ## xmm0 = xmm0[0,1,3],zero 901; AVX1-NEXT: ret{{[l|q]}} ## 
encoding: [0xc3] 902; 903; AVX512-LABEL: shuf_XYW0: 904; AVX512: ## %bb.0: 905; AVX512-NEXT: vinsertps $232, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc0,0xe8] 906; AVX512-NEXT: ## xmm0 = xmm0[0,1,3],zero 907; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 908 %vecext = extractelement <4 x float> %x, i32 0 909 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 910 %vecext1 = extractelement <4 x float> %x, i32 1 911 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1 912 %vecext2 = extractelement <4 x float> %x, i32 3 913 %vecinit3 = insertelement <4 x float> %vecinit2, float %vecext2, i32 2 914 %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3 915 ret <4 x float> %vecinit4 916} 917 918define <4 x float> @shuf_W00W(<4 x float> %x, <4 x float> %a) { 919; SSE-LABEL: shuf_W00W: 920; SSE: ## %bb.0: 921; SSE-NEXT: insertps $198, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc0,0xc6] 922; SSE-NEXT: ## xmm0 = xmm0[3],zero,zero,xmm0[3] 923; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 924; 925; AVX1-LABEL: shuf_W00W: 926; AVX1: ## %bb.0: 927; AVX1-NEXT: vinsertps $198, %xmm0, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc0,0xc6] 928; AVX1-NEXT: ## xmm0 = xmm0[3],zero,zero,xmm0[3] 929; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 930; 931; AVX512-LABEL: shuf_W00W: 932; AVX512: ## %bb.0: 933; AVX512-NEXT: vinsertps $198, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc0,0xc6] 934; AVX512-NEXT: ## xmm0 = xmm0[3],zero,zero,xmm0[3] 935; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 936 %vecext = extractelement <4 x float> %x, i32 3 937 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 938 %vecinit2 = insertelement <4 x float> %vecinit, float 0.0, i32 1 939 %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2 940 %vecinit4 = insertelement <4 x float> %vecinit3, float %vecext, i32 3 941 ret <4 x float> %vecinit4 942} 943 944define <4 
x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) { 945; SSE-LABEL: shuf_X00A: 946; SSE: ## %bb.0: 947; SSE-NEXT: insertps $54, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x36] 948; SSE-NEXT: ## xmm0 = xmm0[0],zero,zero,xmm1[0] 949; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 950; 951; AVX1-LABEL: shuf_X00A: 952; AVX1: ## %bb.0: 953; AVX1-NEXT: vinsertps $54, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x36] 954; AVX1-NEXT: ## xmm0 = xmm0[0],zero,zero,xmm1[0] 955; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 956; 957; AVX512-LABEL: shuf_X00A: 958; AVX512: ## %bb.0: 959; AVX512-NEXT: vinsertps $54, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x36] 960; AVX512-NEXT: ## xmm0 = xmm0[0],zero,zero,xmm1[0] 961; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 962 %vecext = extractelement <4 x float> %x, i32 0 963 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 964 %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1 965 %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2 966 %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4> 967 ret <4 x float> %vecinit4 968} 969 970define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) { 971; SSE-LABEL: shuf_X00X: 972; SSE: ## %bb.0: 973; SSE-NEXT: insertps $54, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc0,0x36] 974; SSE-NEXT: ## xmm0 = xmm0[0],zero,zero,xmm0[0] 975; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 976; 977; AVX1-LABEL: shuf_X00X: 978; AVX1: ## %bb.0: 979; AVX1-NEXT: vinsertps $54, %xmm0, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc0,0x36] 980; AVX1-NEXT: ## xmm0 = xmm0[0],zero,zero,xmm0[0] 981; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 982; 983; AVX512-LABEL: shuf_X00X: 984; AVX512: ## %bb.0: 985; AVX512-NEXT: vinsertps $54, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc0,0x36] 986; AVX512-NEXT: ## xmm0 = 
xmm0[0],zero,zero,xmm0[0] 987; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 988 %vecext = extractelement <4 x float> %x, i32 0 989 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 990 %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1 991 %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2 992 %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4> 993 ret <4 x float> %vecinit4 994} 995 996define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) { 997; SSE-LABEL: shuf_X0YC: 998; SSE: ## %bb.0: 999; SSE-NEXT: xorps %xmm2, %xmm2 ## encoding: [0x0f,0x57,0xd2] 1000; SSE-NEXT: unpcklps %xmm2, %xmm0 ## encoding: [0x0f,0x14,0xc2] 1001; SSE-NEXT: ## xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 1002; SSE-NEXT: insertps $176, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xb0] 1003; SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[2] 1004; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1005; 1006; AVX1-LABEL: shuf_X0YC: 1007; AVX1: ## %bb.0: 1008; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2] 1009; AVX1-NEXT: vunpcklps %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x14,0xc2] 1010; AVX1-NEXT: ## xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 1011; AVX1-NEXT: vinsertps $176, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb0] 1012; AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[2] 1013; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1014; 1015; AVX512-LABEL: shuf_X0YC: 1016; AVX512: ## %bb.0: 1017; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x57,0xd2] 1018; AVX512-NEXT: vunpcklps %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xc2] 1019; AVX512-NEXT: ## xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 1020; AVX512-NEXT: vinsertps $176, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb0] 1021; AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[2] 1022; AVX512-NEXT: ret{{[l|q]}} ## encoding: 
[0xc3] 1023 %vecext = extractelement <4 x float> %x, i32 0 1024 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 1025 %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1 1026 %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef> 1027 %vecinit5 = shufflevector <4 x float> %vecinit3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6> 1028 ret <4 x float> %vecinit5 1029} 1030 1031define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) { 1032; SSE-LABEL: i32_shuf_XYZ0: 1033; SSE: ## %bb.0: 1034; SSE-NEXT: xorps %xmm1, %xmm1 ## encoding: [0x0f,0x57,0xc9] 1035; SSE-NEXT: blendps $8, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x08] 1036; SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] 1037; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1038; 1039; AVX1-LABEL: i32_shuf_XYZ0: 1040; AVX1: ## %bb.0: 1041; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] 1042; AVX1-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] 1043; AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] 1044; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1045; 1046; AVX512-LABEL: i32_shuf_XYZ0: 1047; AVX512: ## %bb.0: 1048; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] 1049; AVX512-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] 1050; AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] 1051; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1052 %vecext = extractelement <4 x i32> %x, i32 0 1053 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 1054 %vecext1 = extractelement <4 x i32> %x, i32 1 1055 %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1 1056 %vecext3 = extractelement <4 x i32> %x, i32 2 1057 %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext3, i32 2 1058 %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3 1059 ret <4 x i32> %vecinit5 1060} 1061 
1062define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) { 1063; SSE-LABEL: i32_shuf_XY00: 1064; SSE: ## %bb.0: 1065; SSE-NEXT: movq %xmm0, %xmm0 ## encoding: [0xf3,0x0f,0x7e,0xc0] 1066; SSE-NEXT: ## xmm0 = xmm0[0],zero 1067; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1068; 1069; AVX1-LABEL: i32_shuf_XY00: 1070; AVX1: ## %bb.0: 1071; AVX1-NEXT: vmovq %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x7e,0xc0] 1072; AVX1-NEXT: ## xmm0 = xmm0[0],zero 1073; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1074; 1075; AVX512-LABEL: i32_shuf_XY00: 1076; AVX512: ## %bb.0: 1077; AVX512-NEXT: vmovq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0xc0] 1078; AVX512-NEXT: ## xmm0 = xmm0[0],zero 1079; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1080 %vecext = extractelement <4 x i32> %x, i32 0 1081 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 1082 %vecext1 = extractelement <4 x i32> %x, i32 1 1083 %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1 1084 %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2 1085 %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3 1086 ret <4 x i32> %vecinit4 1087} 1088 1089define <4 x i32> @i32_shuf_XYY0(<4 x i32> %x, <4 x i32> %a) { 1090; SSE-LABEL: i32_shuf_XYY0: 1091; SSE: ## %bb.0: 1092; SSE-NEXT: pshufd $212, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc8,0xd4] 1093; SSE-NEXT: ## xmm1 = xmm0[0,1,1,3] 1094; SSE-NEXT: pxor %xmm0, %xmm0 ## encoding: [0x66,0x0f,0xef,0xc0] 1095; SSE-NEXT: pblendw $63, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0x3f] 1096; SSE-NEXT: ## xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] 1097; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1098; 1099; AVX1-LABEL: i32_shuf_XYY0: 1100; AVX1: ## %bb.0: 1101; AVX1-NEXT: vpermilps $212, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xd4] 1102; AVX1-NEXT: ## xmm0 = xmm0[0,1,1,3] 1103; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] 1104; AVX1-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: 
[0xc4,0xe3,0x79,0x0c,0xc1,0x08] 1105; AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] 1106; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1107; 1108; AVX512-LABEL: i32_shuf_XYY0: 1109; AVX512: ## %bb.0: 1110; AVX512-NEXT: vpermilps $212, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xd4] 1111; AVX512-NEXT: ## xmm0 = xmm0[0,1,1,3] 1112; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] 1113; AVX512-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] 1114; AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] 1115; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1116 %vecext = extractelement <4 x i32> %x, i32 0 1117 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 1118 %vecext1 = extractelement <4 x i32> %x, i32 1 1119 %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1 1120 %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext1, i32 2 1121 %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3 1122 ret <4 x i32> %vecinit5 1123} 1124 1125define <4 x i32> @i32_shuf_XYW0(<4 x i32> %x, <4 x i32> %a) { 1126; SSE-LABEL: i32_shuf_XYW0: 1127; SSE: ## %bb.0: 1128; SSE-NEXT: pshufd $244, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc8,0xf4] 1129; SSE-NEXT: ## xmm1 = xmm0[0,1,3,3] 1130; SSE-NEXT: pxor %xmm0, %xmm0 ## encoding: [0x66,0x0f,0xef,0xc0] 1131; SSE-NEXT: pblendw $63, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0x3f] 1132; SSE-NEXT: ## xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] 1133; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1134; 1135; AVX1-LABEL: i32_shuf_XYW0: 1136; AVX1: ## %bb.0: 1137; AVX1-NEXT: vpermilps $244, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xf4] 1138; AVX1-NEXT: ## xmm0 = xmm0[0,1,3,3] 1139; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] 1140; AVX1-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] 1141; AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] 1142; 
AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1143; 1144; AVX512-LABEL: i32_shuf_XYW0: 1145; AVX512: ## %bb.0: 1146; AVX512-NEXT: vpermilps $244, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xf4] 1147; AVX512-NEXT: ## xmm0 = xmm0[0,1,3,3] 1148; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] 1149; AVX512-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] 1150; AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] 1151; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1152 %vecext = extractelement <4 x i32> %x, i32 0 1153 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 1154 %vecext1 = extractelement <4 x i32> %x, i32 1 1155 %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1 1156 %vecext2 = extractelement <4 x i32> %x, i32 3 1157 %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %vecext2, i32 2 1158 %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3 1159 ret <4 x i32> %vecinit4 1160} 1161 1162define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) { 1163; SSE-LABEL: i32_shuf_W00W: 1164; SSE: ## %bb.0: 1165; SSE-NEXT: pshufd $255, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc8,0xff] 1166; SSE-NEXT: ## xmm1 = xmm0[3,3,3,3] 1167; SSE-NEXT: pxor %xmm0, %xmm0 ## encoding: [0x66,0x0f,0xef,0xc0] 1168; SSE-NEXT: pblendw $195, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0xc3] 1169; SSE-NEXT: ## xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] 1170; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1171; 1172; AVX1-LABEL: i32_shuf_W00W: 1173; AVX1: ## %bb.0: 1174; AVX1-NEXT: vpermilps $255, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff] 1175; AVX1-NEXT: ## xmm0 = xmm0[3,3,3,3] 1176; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] 1177; AVX1-NEXT: vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06] 1178; AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3] 1179; AVX1-NEXT: ret{{[l|q]}} ## 
encoding: [0xc3] 1180; 1181; AVX512-LABEL: i32_shuf_W00W: 1182; AVX512: ## %bb.0: 1183; AVX512-NEXT: vpermilps $255, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff] 1184; AVX512-NEXT: ## xmm0 = xmm0[3,3,3,3] 1185; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] 1186; AVX512-NEXT: vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06] 1187; AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3] 1188; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1189 %vecext = extractelement <4 x i32> %x, i32 3 1190 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 1191 %vecinit2 = insertelement <4 x i32> %vecinit, i32 0, i32 1 1192 %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2 1193 %vecinit4 = insertelement <4 x i32> %vecinit3, i32 %vecext, i32 3 1194 ret <4 x i32> %vecinit4 1195} 1196 1197define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) { 1198; SSE-LABEL: i32_shuf_X00A: 1199; SSE: ## %bb.0: 1200; SSE-NEXT: pxor %xmm2, %xmm2 ## encoding: [0x66,0x0f,0xef,0xd2] 1201; SSE-NEXT: pblendw $252, %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc2,0xfc] 1202; SSE-NEXT: ## xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] 1203; SSE-NEXT: pshufd $0, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0x00] 1204; SSE-NEXT: ## xmm1 = xmm1[0,0,0,0] 1205; SSE-NEXT: pblendw $192, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0xc0] 1206; SSE-NEXT: ## xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] 1207; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1208; 1209; AVX1-LABEL: i32_shuf_X00A: 1210; AVX1: ## %bb.0: 1211; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2] 1212; AVX1-NEXT: vblendps $1, %xmm0, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x0c,0xc0,0x01] 1213; AVX1-NEXT: ## xmm0 = xmm0[0],xmm2[1,2,3] 1214; AVX1-NEXT: vpermilps $0, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0x00] 1215; AVX1-NEXT: ## xmm1 = xmm1[0,0,0,0] 1216; AVX1-NEXT: vblendps $8, %xmm1, 
%xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] 1217; AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] 1218; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1219; 1220; AVX512-LABEL: i32_shuf_X00A: 1221; AVX512: ## %bb.0: 1222; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2] 1223; AVX512-NEXT: vblendps $1, %xmm0, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x0c,0xc0,0x01] 1224; AVX512-NEXT: ## xmm0 = xmm0[0],xmm2[1,2,3] 1225; AVX512-NEXT: vbroadcastss %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc9] 1226; AVX512-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] 1227; AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] 1228; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1229 %vecext = extractelement <4 x i32> %x, i32 0 1230 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 1231 %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1 1232 %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2 1233 %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4> 1234 ret <4 x i32> %vecinit4 1235} 1236 1237define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) { 1238; SSE-LABEL: i32_shuf_X00X: 1239; SSE: ## %bb.0: 1240; SSE-NEXT: pxor %xmm1, %xmm1 ## encoding: [0x66,0x0f,0xef,0xc9] 1241; SSE-NEXT: pshufd $0, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x70,0xc0,0x00] 1242; SSE-NEXT: ## xmm0 = xmm0[0,0,0,0] 1243; SSE-NEXT: pblendw $60, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0x3c] 1244; SSE-NEXT: ## xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7] 1245; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1246; 1247; AVX1-LABEL: i32_shuf_X00X: 1248; AVX1: ## %bb.0: 1249; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] 1250; AVX1-NEXT: vpermilps $0, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00] 1251; AVX1-NEXT: ## xmm0 = xmm0[0,0,0,0] 1252; AVX1-NEXT: vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: 
[0xc4,0xe3,0x79,0x0c,0xc1,0x06] 1253; AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3] 1254; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1255; 1256; AVX512-LABEL: i32_shuf_X00X: 1257; AVX512: ## %bb.0: 1258; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] 1259; AVX512-NEXT: vbroadcastss %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0] 1260; AVX512-NEXT: vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06] 1261; AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3] 1262; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1263 %vecext = extractelement <4 x i32> %x, i32 0 1264 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 1265 %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1 1266 %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2 1267 %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4> 1268 ret <4 x i32> %vecinit4 1269} 1270 1271define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) { 1272; SSE-LABEL: i32_shuf_X0YC: 1273; SSE: ## %bb.0: 1274; SSE-NEXT: pmovzxdq %xmm0, %xmm2 ## encoding: [0x66,0x0f,0x38,0x35,0xd0] 1275; SSE-NEXT: ## xmm2 = xmm0[0],zero,xmm0[1],zero 1276; SSE-NEXT: pshufd $170, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x70,0xc1,0xaa] 1277; SSE-NEXT: ## xmm0 = xmm1[2,2,2,2] 1278; SSE-NEXT: pblendw $63, %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc2,0x3f] 1279; SSE-NEXT: ## xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7] 1280; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1281; 1282; AVX1-LABEL: i32_shuf_X0YC: 1283; AVX1: ## %bb.0: 1284; AVX1-NEXT: vpmovzxdq %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x35,0xc0] 1285; AVX1-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero 1286; AVX1-NEXT: vpshufd $170, %xmm1, %xmm1 ## encoding: [0xc5,0xf9,0x70,0xc9,0xaa] 1287; AVX1-NEXT: ## xmm1 = xmm1[2,2,2,2] 1288; AVX1-NEXT: vpblendw $192, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0xc0] 1289; 
AVX1-NEXT: ## xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] 1290; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1291; 1292; AVX512-LABEL: i32_shuf_X0YC: 1293; AVX512: ## %bb.0: 1294; AVX512-NEXT: vpmovzxdq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x35,0xc0] 1295; AVX512-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero 1296; AVX512-NEXT: vpshufd $170, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x70,0xc9,0xaa] 1297; AVX512-NEXT: ## xmm1 = xmm1[2,2,2,2] 1298; AVX512-NEXT: vpblendd $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x08] 1299; AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] 1300; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1301 %vecext = extractelement <4 x i32> %x, i32 0 1302 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 1303 %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1 1304 %vecinit3 = shufflevector <4 x i32> %vecinit1, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef> 1305 %vecinit5 = shufflevector <4 x i32> %vecinit3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6> 1306 ret <4 x i32> %vecinit5 1307} 1308 1309;; Test for a bug in the first implementation of LowerBuildVectorv4X86: every lane of the built vector <x[0], x[1], x[2], 0.0> is explicitly written (no lane is left undef), and the result is then selected against %x through fcmp olt + select. The assertions expect a blend with a zeroed register followed by (v)maxps. 1310define < 4 x float> @test_insertps_no_undef(<4 x float> %x) { 1311; SSE-LABEL: test_insertps_no_undef: 1312; SSE: ## %bb.0: 1313; SSE-NEXT: xorps %xmm1, %xmm1 ## encoding: [0x0f,0x57,0xc9] 1314; SSE-NEXT: blendps $7, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc8,0x07] 1315; SSE-NEXT: ## xmm1 = xmm0[0,1,2],xmm1[3] 1316; SSE-NEXT: maxps %xmm1, %xmm0 ## encoding: [0x0f,0x5f,0xc1] 1317; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1318; 1319; AVX1-LABEL: test_insertps_no_undef: 1320; AVX1: ## %bb.0: 1321; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] 1322; AVX1-NEXT: vblendps $8, %xmm1, %xmm0, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc9,0x08] 1323; AVX1-NEXT: ## xmm1 = xmm0[0,1,2],xmm1[3] 1324; AVX1-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x5f,0xc1] 1325;
AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1326; 1327; AVX512-LABEL: test_insertps_no_undef: 1328; AVX512: ## %bb.0: 1329; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] 1330; AVX512-NEXT: vblendps $8, %xmm1, %xmm0, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc9,0x08] 1331; AVX512-NEXT: ## xmm1 = xmm0[0,1,2],xmm1[3] 1332; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5f,0xc1] 1333; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1334 %vecext = extractelement <4 x float> %x, i32 0 1335 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 1336 %vecext1 = extractelement <4 x float> %x, i32 1 1337 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1 1338 %vecext3 = extractelement <4 x float> %x, i32 2 1339 %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2 1340 %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3 1341 %mask = fcmp olt <4 x float> %vecinit5, %x 1342 %res = select <4 x i1> %mask, <4 x float> %x, <4 x float>%vecinit5 1343 ret <4 x float> %res 1344} 1345 1346define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) { 1347; SSE-LABEL: blendvb_fallback: 1348; SSE: ## %bb.0: 1349; SSE-NEXT: psllw $15, %xmm0 ## encoding: [0x66,0x0f,0x71,0xf0,0x0f] 1350; SSE-NEXT: psraw $15, %xmm0 ## encoding: [0x66,0x0f,0x71,0xe0,0x0f] 1351; SSE-NEXT: pblendvb %xmm0, %xmm1, %xmm2 ## encoding: [0x66,0x0f,0x38,0x10,0xd1] 1352; SSE-NEXT: movdqa %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x6f,0xc2] 1353; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1354; 1355; AVX1-LABEL: blendvb_fallback: 1356; AVX1: ## %bb.0: 1357; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x71,0xf0,0x0f] 1358; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x71,0xe0,0x0f] 1359; AVX1-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x4c,0xc1,0x00] 1360; AVX1-NEXT: ret{{[l|q]}} ## encoding: 
[0xc3] 1361; 1362; AVX512-LABEL: blendvb_fallback: 1363; AVX512: ## %bb.0: 1364; AVX512-NEXT: vpsllw $15, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x71,0xf0,0x0f] 1365; AVX512-NEXT: vpmovw2m %xmm0, %k1 ## encoding: [0x62,0xf2,0xfe,0x08,0x29,0xc8] 1366; AVX512-NEXT: vpblendmw %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x66,0xc1] 1367; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1368 %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y 1369 ret <8 x i16> %ret 1370} 1371 1372; On X86, account for the argument's move to registers 1373define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) { 1374; X86-SSE-LABEL: insertps_from_vector_load: 1375; X86-SSE: ## %bb.0: 1376; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1377; X86-SSE-NEXT: movaps (%eax), %xmm1 ## encoding: [0x0f,0x28,0x08] 1378; X86-SSE-NEXT: insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30] 1379; X86-SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] 1380; X86-SSE-NEXT: retl ## encoding: [0xc3] 1381; 1382; X86-AVX1-LABEL: insertps_from_vector_load: 1383; X86-AVX1: ## %bb.0: 1384; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1385; X86-AVX1-NEXT: vmovaps (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x08] 1386; X86-AVX1-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30] 1387; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] 1388; X86-AVX1-NEXT: retl ## encoding: [0xc3] 1389; 1390; X86-AVX512-LABEL: insertps_from_vector_load: 1391; X86-AVX512: ## %bb.0: 1392; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1393; X86-AVX512-NEXT: vmovaps (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x08] 1394; X86-AVX512-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30] 1395; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] 1396; 
X86-AVX512-NEXT: retl ## encoding: [0xc3] 1397; 1398; X64-SSE-LABEL: insertps_from_vector_load: 1399; X64-SSE: ## %bb.0: 1400; X64-SSE-NEXT: movaps (%rdi), %xmm1 ## encoding: [0x0f,0x28,0x0f] 1401; X64-SSE-NEXT: insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30] 1402; X64-SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] 1403; X64-SSE-NEXT: retq ## encoding: [0xc3] 1404; 1405; X64-AVX1-LABEL: insertps_from_vector_load: 1406; X64-AVX1: ## %bb.0: 1407; X64-AVX1-NEXT: vmovaps (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0f] 1408; X64-AVX1-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30] 1409; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] 1410; X64-AVX1-NEXT: retq ## encoding: [0xc3] 1411; 1412; X64-AVX512-LABEL: insertps_from_vector_load: 1413; X64-AVX512: ## %bb.0: 1414; X64-AVX512-NEXT: vmovaps (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0f] 1415; X64-AVX512-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30] 1416; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] 1417; X64-AVX512-NEXT: retq ## encoding: [0xc3] 1418 %1 = load <4 x float>, <4 x float>* %pb, align 16 1419 %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48) 1420 ret <4 x float> %2 1421} 1422 1423;; Use a non-zero CountS for insertps 1424;; Try to match a bit more of the instr, since we need the load's offset. 
;; Register-form insertps fed by a full, 16-byte-aligned vector load.
;; The immediate 96 (0x60) carries a non-zero source-select field; the decoded
;; shuffle recorded in the assertions below is xmm0[0,1],xmm1[1],xmm0[3], i.e.
;; element 1 of the loaded vector lands in lane 2 of %a and no lane is zeroed.
;; In every run configuration the expected output keeps the load as a separate
;; (v)movaps and uses the register form of (v)insertps.
1425define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) { 1426; X86-SSE-LABEL: insertps_from_vector_load_offset: 1427; X86-SSE: ## %bb.0: 1428; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1429; X86-SSE-NEXT: movaps (%eax), %xmm1 ## encoding: [0x0f,0x28,0x08] 1430; X86-SSE-NEXT: insertps $96, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x60] 1431; X86-SSE-NEXT: ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3] 1432; X86-SSE-NEXT: retl ## encoding: [0xc3] 1433; 1434; X86-AVX1-LABEL: insertps_from_vector_load_offset: 1435; X86-AVX1: ## %bb.0: 1436; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1437; X86-AVX1-NEXT: vmovaps (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x08] 1438; X86-AVX1-NEXT: vinsertps $96, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60] 1439; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3] 1440; X86-AVX1-NEXT: retl ## encoding: [0xc3] 1441; 1442; X86-AVX512-LABEL: insertps_from_vector_load_offset: 1443; X86-AVX512: ## %bb.0: 1444; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1445; X86-AVX512-NEXT: vmovaps (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x08] 1446; X86-AVX512-NEXT: vinsertps $96, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60] 1447; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3] 1448; X86-AVX512-NEXT: retl ## encoding: [0xc3] 1449; 1450; X64-SSE-LABEL: insertps_from_vector_load_offset: 1451; X64-SSE: ## %bb.0: 1452; X64-SSE-NEXT: movaps (%rdi), %xmm1 ## encoding: [0x0f,0x28,0x0f] 1453; X64-SSE-NEXT: insertps $96, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x60] 1454; X64-SSE-NEXT: ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3] 1455; X64-SSE-NEXT: retq ## encoding: [0xc3] 1456; 1457; X64-AVX1-LABEL: insertps_from_vector_load_offset: 1458; X64-AVX1: ## %bb.0: 1459; X64-AVX1-NEXT: vmovaps (%rdi), %xmm1 ## 
encoding: [0xc5,0xf8,0x28,0x0f] 1460; X64-AVX1-NEXT: vinsertps $96, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60] 1461; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3] 1462; X64-AVX1-NEXT: retq ## encoding: [0xc3] 1463; 1464; X64-AVX512-LABEL: insertps_from_vector_load_offset: 1465; X64-AVX512: ## %bb.0: 1466; X64-AVX512-NEXT: vmovaps (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0f] 1467; X64-AVX512-NEXT: vinsertps $96, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60] 1468; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3] 1469; X64-AVX512-NEXT: retq ## encoding: [0xc3] 1470 %1 = load <4 x float>, <4 x float>* %pb, align 16 1471 %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96) 1472 ret <4 x float> %2 1473} 1474 1475;; Try to match a bit more of the instr, since we need the load's offset. 1476define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) { 1477; X86-SSE-LABEL: insertps_from_vector_load_offset_2: 1478; X86-SSE: ## %bb.0: 1479; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1480; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08] 1481; X86-SSE-NEXT: shll $4, %ecx ## encoding: [0xc1,0xe1,0x04] 1482; X86-SSE-NEXT: movaps (%eax,%ecx), %xmm1 ## encoding: [0x0f,0x28,0x0c,0x08] 1483; X86-SSE-NEXT: insertps $192, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xc0] 1484; X86-SSE-NEXT: ## xmm0 = xmm1[3],xmm0[1,2,3] 1485; X86-SSE-NEXT: retl ## encoding: [0xc3] 1486; 1487; X86-AVX1-LABEL: insertps_from_vector_load_offset_2: 1488; X86-AVX1: ## %bb.0: 1489; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1490; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08] 1491; X86-AVX1-NEXT: shll $4, %ecx ## encoding: [0xc1,0xe1,0x04] 1492; X86-AVX1-NEXT: vmovaps 
(%eax,%ecx), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0c,0x08] 1493; X86-AVX1-NEXT: vinsertps $192, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xc0] 1494; X86-AVX1-NEXT: ## xmm0 = xmm1[3],xmm0[1,2,3] 1495; X86-AVX1-NEXT: retl ## encoding: [0xc3] 1496; 1497; X86-AVX512-LABEL: insertps_from_vector_load_offset_2: 1498; X86-AVX512: ## %bb.0: 1499; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1500; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08] 1501; X86-AVX512-NEXT: shll $4, %ecx ## encoding: [0xc1,0xe1,0x04] 1502; X86-AVX512-NEXT: vmovaps (%eax,%ecx), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0c,0x08] 1503; X86-AVX512-NEXT: vinsertps $192, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xc0] 1504; X86-AVX512-NEXT: ## xmm0 = xmm1[3],xmm0[1,2,3] 1505; X86-AVX512-NEXT: retl ## encoding: [0xc3] 1506; 1507; X64-SSE-LABEL: insertps_from_vector_load_offset_2: 1508; X64-SSE: ## %bb.0: 1509; X64-SSE-NEXT: shlq $4, %rsi ## encoding: [0x48,0xc1,0xe6,0x04] 1510; X64-SSE-NEXT: movaps (%rdi,%rsi), %xmm1 ## encoding: [0x0f,0x28,0x0c,0x37] 1511; X64-SSE-NEXT: insertps $192, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xc0] 1512; X64-SSE-NEXT: ## xmm0 = xmm1[3],xmm0[1,2,3] 1513; X64-SSE-NEXT: retq ## encoding: [0xc3] 1514; 1515; X64-AVX1-LABEL: insertps_from_vector_load_offset_2: 1516; X64-AVX1: ## %bb.0: 1517; X64-AVX1-NEXT: shlq $4, %rsi ## encoding: [0x48,0xc1,0xe6,0x04] 1518; X64-AVX1-NEXT: vmovaps (%rdi,%rsi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0c,0x37] 1519; X64-AVX1-NEXT: vinsertps $192, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xc0] 1520; X64-AVX1-NEXT: ## xmm0 = xmm1[3],xmm0[1,2,3] 1521; X64-AVX1-NEXT: retq ## encoding: [0xc3] 1522; 1523; X64-AVX512-LABEL: insertps_from_vector_load_offset_2: 1524; X64-AVX512: ## %bb.0: 1525; X64-AVX512-NEXT: shlq $4, %rsi ## encoding: [0x48,0xc1,0xe6,0x04] 1526; X64-AVX512-NEXT: 
vmovaps (%rdi,%rsi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0c,0x37] 1527; X64-AVX512-NEXT: vinsertps $192, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xc0] 1528; X64-AVX512-NEXT: ## xmm0 = xmm1[3],xmm0[1,2,3] 1529; X64-AVX512-NEXT: retq ## encoding: [0xc3] 1530 %1 = getelementptr inbounds <4 x float>, <4 x float>* %pb, i64 %index 1531 %2 = load <4 x float>, <4 x float>* %1, align 16 1532 %3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192) 1533 ret <4 x float> %3 1534} 1535 1536define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) { 1537; X86-SSE-LABEL: insertps_from_broadcast_loadf32: 1538; X86-SSE: ## %bb.0: 1539; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08] 1540; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04] 1541; X86-SSE-NEXT: insertps $48, (%ecx,%eax,4), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x04,0x81,0x30] 1542; X86-SSE-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] 1543; X86-SSE-NEXT: retl ## encoding: [0xc3] 1544; 1545; X86-AVX1-LABEL: insertps_from_broadcast_loadf32: 1546; X86-AVX1: ## %bb.0: 1547; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08] 1548; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04] 1549; X86-AVX1-NEXT: vinsertps $48, (%ecx,%eax,4), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x04,0x81,0x30] 1550; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] 1551; X86-AVX1-NEXT: retl ## encoding: [0xc3] 1552; 1553; X86-AVX512-LABEL: insertps_from_broadcast_loadf32: 1554; X86-AVX512: ## %bb.0: 1555; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08] 1556; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04] 1557; X86-AVX512-NEXT: vinsertps $48, (%ecx,%eax,4), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: 
[0xc4,0xe3,0x79,0x21,0x04,0x81,0x30] 1558; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] 1559; X86-AVX512-NEXT: retl ## encoding: [0xc3] 1560; 1561; X64-SSE-LABEL: insertps_from_broadcast_loadf32: 1562; X64-SSE: ## %bb.0: 1563; X64-SSE-NEXT: insertps $48, (%rdi,%rsi,4), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x04,0xb7,0x30] 1564; X64-SSE-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] 1565; X64-SSE-NEXT: retq ## encoding: [0xc3] 1566; 1567; X64-AVX1-LABEL: insertps_from_broadcast_loadf32: 1568; X64-AVX1: ## %bb.0: 1569; X64-AVX1-NEXT: vinsertps $48, (%rdi,%rsi,4), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x04,0xb7,0x30] 1570; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] 1571; X64-AVX1-NEXT: retq ## encoding: [0xc3] 1572; 1573; X64-AVX512-LABEL: insertps_from_broadcast_loadf32: 1574; X64-AVX512: ## %bb.0: 1575; X64-AVX512-NEXT: vinsertps $48, (%rdi,%rsi,4), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x04,0xb7,0x30] 1576; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] 1577; X64-AVX512-NEXT: retq ## encoding: [0xc3] 1578 %1 = getelementptr inbounds float, float* %fb, i64 %index 1579 %2 = load float, float* %1, align 4 1580 %3 = insertelement <4 x float> undef, float %2, i32 0 1581 %4 = insertelement <4 x float> %3, float %2, i32 1 1582 %5 = insertelement <4 x float> %4, float %2, i32 2 1583 %6 = insertelement <4 x float> %5, float %2, i32 3 1584 %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48) 1585 ret <4 x float> %7 1586} 1587 1588define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) { 1589; X86-SSE-LABEL: insertps_from_broadcast_loadv4f32: 1590; X86-SSE: ## %bb.0: 1591; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1592; X86-SSE-NEXT: movups (%eax), %xmm1 ## encoding: [0x0f,0x10,0x08] 1593; X86-SSE-NEXT: insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30] 1594; X86-SSE-NEXT: ## xmm0 = 
xmm0[0,1,2],xmm1[0] 1595; X86-SSE-NEXT: retl ## encoding: [0xc3] 1596; 1597; X86-AVX1-LABEL: insertps_from_broadcast_loadv4f32: 1598; X86-AVX1: ## %bb.0: 1599; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1600; X86-AVX1-NEXT: vinsertps $48, (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x00,0x30] 1601; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] 1602; X86-AVX1-NEXT: retl ## encoding: [0xc3] 1603; 1604; X86-AVX512-LABEL: insertps_from_broadcast_loadv4f32: 1605; X86-AVX512: ## %bb.0: 1606; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1607; X86-AVX512-NEXT: vinsertps $48, (%eax), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x00,0x30] 1608; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] 1609; X86-AVX512-NEXT: retl ## encoding: [0xc3] 1610; 1611; X64-SSE-LABEL: insertps_from_broadcast_loadv4f32: 1612; X64-SSE: ## %bb.0: 1613; X64-SSE-NEXT: movups (%rdi), %xmm1 ## encoding: [0x0f,0x10,0x0f] 1614; X64-SSE-NEXT: insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30] 1615; X64-SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] 1616; X64-SSE-NEXT: retq ## encoding: [0xc3] 1617; 1618; X64-AVX1-LABEL: insertps_from_broadcast_loadv4f32: 1619; X64-AVX1: ## %bb.0: 1620; X64-AVX1-NEXT: vinsertps $48, (%rdi), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x07,0x30] 1621; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] 1622; X64-AVX1-NEXT: retq ## encoding: [0xc3] 1623; 1624; X64-AVX512-LABEL: insertps_from_broadcast_loadv4f32: 1625; X64-AVX512: ## %bb.0: 1626; X64-AVX512-NEXT: vinsertps $48, (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x07,0x30] 1627; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] 1628; X64-AVX512-NEXT: retq ## encoding: [0xc3] 1629 %1 = load <4 x float>, <4 x float>* %b, align 4 1630 %2 = extractelement <4 x float> %1, i32 0 1631 %3 = insertelement <4 x float> undef, float %2, i32 0 1632 %4 = insertelement <4 x float> %3, 
float %2, i32 1 1633 %5 = insertelement <4 x float> %4, float %2, i32 2 1634 %6 = insertelement <4 x float> %5, float %2, i32 3 1635 %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48) 1636 ret <4 x float> %7 1637} 1638 1639define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) { 1640; X86-SSE-LABEL: insertps_from_broadcast_multiple_use: 1641; X86-SSE: ## %bb.0: 1642; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08] 1643; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04] 1644; X86-SSE-NEXT: movss (%ecx,%eax,4), %xmm4 ## encoding: [0xf3,0x0f,0x10,0x24,0x81] 1645; X86-SSE-NEXT: ## xmm4 = mem[0],zero,zero,zero 1646; X86-SSE-NEXT: insertps $48, %xmm4, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc4,0x30] 1647; X86-SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[0] 1648; X86-SSE-NEXT: insertps $48, %xmm4, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x21,0xcc,0x30] 1649; X86-SSE-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[0] 1650; X86-SSE-NEXT: addps %xmm1, %xmm0 ## encoding: [0x0f,0x58,0xc1] 1651; X86-SSE-NEXT: insertps $48, %xmm4, %xmm2 ## encoding: [0x66,0x0f,0x3a,0x21,0xd4,0x30] 1652; X86-SSE-NEXT: ## xmm2 = xmm2[0,1,2],xmm4[0] 1653; X86-SSE-NEXT: insertps $48, %xmm4, %xmm3 ## encoding: [0x66,0x0f,0x3a,0x21,0xdc,0x30] 1654; X86-SSE-NEXT: ## xmm3 = xmm3[0,1,2],xmm4[0] 1655; X86-SSE-NEXT: addps %xmm2, %xmm3 ## encoding: [0x0f,0x58,0xda] 1656; X86-SSE-NEXT: addps %xmm3, %xmm0 ## encoding: [0x0f,0x58,0xc3] 1657; X86-SSE-NEXT: retl ## encoding: [0xc3] 1658; 1659; X86-AVX1-LABEL: insertps_from_broadcast_multiple_use: 1660; X86-AVX1: ## %bb.0: 1661; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08] 1662; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04] 1663; X86-AVX1-NEXT: vbroadcastss (%ecx,%eax,4), %xmm4 ## encoding: [0xc4,0xe2,0x79,0x18,0x24,0x81] 
1664; X86-AVX1-NEXT: vinsertps $48, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30] 1665; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[0] 1666; X86-AVX1-NEXT: vinsertps $48, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30] 1667; X86-AVX1-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[0] 1668; X86-AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1] 1669; X86-AVX1-NEXT: vinsertps $48, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30] 1670; X86-AVX1-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[0] 1671; X86-AVX1-NEXT: vinsertps $48, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30] 1672; X86-AVX1-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[0] 1673; X86-AVX1-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x58,0xca] 1674; X86-AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1] 1675; X86-AVX1-NEXT: retl ## encoding: [0xc3] 1676; 1677; X86-AVX512-LABEL: insertps_from_broadcast_multiple_use: 1678; X86-AVX512: ## %bb.0: 1679; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08] 1680; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04] 1681; X86-AVX512-NEXT: vbroadcastss (%ecx,%eax,4), %xmm4 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x24,0x81] 1682; X86-AVX512-NEXT: vinsertps $48, %xmm4, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30] 1683; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[0] 1684; X86-AVX512-NEXT: vinsertps $48, %xmm4, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30] 1685; X86-AVX512-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[0] 1686; X86-AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] 1687; X86-AVX512-NEXT: vinsertps $48, %xmm4, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30] 1688; X86-AVX512-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[0] 1689; X86-AVX512-NEXT: vinsertps $48, %xmm4, %xmm3, 
%xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30] 1690; X86-AVX512-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[0] 1691; X86-AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] 1692; X86-AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] 1693; X86-AVX512-NEXT: retl ## encoding: [0xc3] 1694; 1695; X64-SSE-LABEL: insertps_from_broadcast_multiple_use: 1696; X64-SSE: ## %bb.0: 1697; X64-SSE-NEXT: movss (%rdi,%rsi,4), %xmm4 ## encoding: [0xf3,0x0f,0x10,0x24,0xb7] 1698; X64-SSE-NEXT: ## xmm4 = mem[0],zero,zero,zero 1699; X64-SSE-NEXT: insertps $48, %xmm4, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc4,0x30] 1700; X64-SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[0] 1701; X64-SSE-NEXT: insertps $48, %xmm4, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x21,0xcc,0x30] 1702; X64-SSE-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[0] 1703; X64-SSE-NEXT: addps %xmm1, %xmm0 ## encoding: [0x0f,0x58,0xc1] 1704; X64-SSE-NEXT: insertps $48, %xmm4, %xmm2 ## encoding: [0x66,0x0f,0x3a,0x21,0xd4,0x30] 1705; X64-SSE-NEXT: ## xmm2 = xmm2[0,1,2],xmm4[0] 1706; X64-SSE-NEXT: insertps $48, %xmm4, %xmm3 ## encoding: [0x66,0x0f,0x3a,0x21,0xdc,0x30] 1707; X64-SSE-NEXT: ## xmm3 = xmm3[0,1,2],xmm4[0] 1708; X64-SSE-NEXT: addps %xmm2, %xmm3 ## encoding: [0x0f,0x58,0xda] 1709; X64-SSE-NEXT: addps %xmm3, %xmm0 ## encoding: [0x0f,0x58,0xc3] 1710; X64-SSE-NEXT: retq ## encoding: [0xc3] 1711; 1712; X64-AVX1-LABEL: insertps_from_broadcast_multiple_use: 1713; X64-AVX1: ## %bb.0: 1714; X64-AVX1-NEXT: vbroadcastss (%rdi,%rsi,4), %xmm4 ## encoding: [0xc4,0xe2,0x79,0x18,0x24,0xb7] 1715; X64-AVX1-NEXT: vinsertps $48, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30] 1716; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[0] 1717; X64-AVX1-NEXT: vinsertps $48, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30] 1718; X64-AVX1-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[0] 1719; X64-AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: 
[0xc5,0xf8,0x58,0xc1] 1720; X64-AVX1-NEXT: vinsertps $48, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30] 1721; X64-AVX1-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[0] 1722; X64-AVX1-NEXT: vinsertps $48, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30] 1723; X64-AVX1-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[0] 1724; X64-AVX1-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x58,0xca] 1725; X64-AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1] 1726; X64-AVX1-NEXT: retq ## encoding: [0xc3] 1727; 1728; X64-AVX512-LABEL: insertps_from_broadcast_multiple_use: 1729; X64-AVX512: ## %bb.0: 1730; X64-AVX512-NEXT: vbroadcastss (%rdi,%rsi,4), %xmm4 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x24,0xb7] 1731; X64-AVX512-NEXT: vinsertps $48, %xmm4, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30] 1732; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[0] 1733; X64-AVX512-NEXT: vinsertps $48, %xmm4, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30] 1734; X64-AVX512-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[0] 1735; X64-AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] 1736; X64-AVX512-NEXT: vinsertps $48, %xmm4, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30] 1737; X64-AVX512-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[0] 1738; X64-AVX512-NEXT: vinsertps $48, %xmm4, %xmm3, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30] 1739; X64-AVX512-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[0] 1740; X64-AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] 1741; X64-AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] 1742; X64-AVX512-NEXT: retq ## encoding: [0xc3] 1743 %1 = getelementptr inbounds float, float* %fb, i64 %index 1744 %2 = load float, float* %1, align 4 1745 %3 = insertelement <4 x float> 
undef, float %2, i32 0 1746 %4 = insertelement <4 x float> %3, float %2, i32 1 1747 %5 = insertelement <4 x float> %4, float %2, i32 2 1748 %6 = insertelement <4 x float> %5, float %2, i32 3 1749 %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48) 1750 %8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48) 1751 %9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48) 1752 %10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48) 1753 %11 = fadd <4 x float> %7, %8 1754 %12 = fadd <4 x float> %9, %10 1755 %13 = fadd <4 x float> %11, %12 1756 ret <4 x float> %13 1757} 1758 1759define <4 x float> @insertps_with_undefs(<4 x float> %a, float* %b) { 1760; X86-SSE-LABEL: insertps_with_undefs: 1761; X86-SSE: ## %bb.0: 1762; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1763; X86-SSE-NEXT: movss (%eax), %xmm1 ## encoding: [0xf3,0x0f,0x10,0x08] 1764; X86-SSE-NEXT: ## xmm1 = mem[0],zero,zero,zero 1765; X86-SSE-NEXT: movlhps %xmm0, %xmm1 ## encoding: [0x0f,0x16,0xc8] 1766; X86-SSE-NEXT: ## xmm1 = xmm1[0],xmm0[0] 1767; X86-SSE-NEXT: movaps %xmm1, %xmm0 ## encoding: [0x0f,0x28,0xc1] 1768; X86-SSE-NEXT: retl ## encoding: [0xc3] 1769; 1770; X86-AVX1-LABEL: insertps_with_undefs: 1771; X86-AVX1: ## %bb.0: 1772; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1773; X86-AVX1-NEXT: vmovss (%eax), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x08] 1774; X86-AVX1-NEXT: ## xmm1 = mem[0],zero,zero,zero 1775; X86-AVX1-NEXT: vmovlhps %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf0,0x16,0xc0] 1776; X86-AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[0] 1777; X86-AVX1-NEXT: retl ## encoding: [0xc3] 1778; 1779; X86-AVX512-LABEL: insertps_with_undefs: 1780; X86-AVX512: ## %bb.0: 1781; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1782; X86-AVX512-NEXT: vmovss (%eax), %xmm1 ## EVEX TO 
VEX Compression encoding: [0xc5,0xfa,0x10,0x08] 1783; X86-AVX512-NEXT: ## xmm1 = mem[0],zero,zero,zero 1784; X86-AVX512-NEXT: vmovlhps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x16,0xc0] 1785; X86-AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[0] 1786; X86-AVX512-NEXT: retl ## encoding: [0xc3] 1787; 1788; X64-SSE-LABEL: insertps_with_undefs: 1789; X64-SSE: ## %bb.0: 1790; X64-SSE-NEXT: movss (%rdi), %xmm1 ## encoding: [0xf3,0x0f,0x10,0x0f] 1791; X64-SSE-NEXT: ## xmm1 = mem[0],zero,zero,zero 1792; X64-SSE-NEXT: movlhps %xmm0, %xmm1 ## encoding: [0x0f,0x16,0xc8] 1793; X64-SSE-NEXT: ## xmm1 = xmm1[0],xmm0[0] 1794; X64-SSE-NEXT: movaps %xmm1, %xmm0 ## encoding: [0x0f,0x28,0xc1] 1795; X64-SSE-NEXT: retq ## encoding: [0xc3] 1796; 1797; X64-AVX1-LABEL: insertps_with_undefs: 1798; X64-AVX1: ## %bb.0: 1799; X64-AVX1-NEXT: vmovss (%rdi), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x0f] 1800; X64-AVX1-NEXT: ## xmm1 = mem[0],zero,zero,zero 1801; X64-AVX1-NEXT: vmovlhps %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf0,0x16,0xc0] 1802; X64-AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[0] 1803; X64-AVX1-NEXT: retq ## encoding: [0xc3] 1804; 1805; X64-AVX512-LABEL: insertps_with_undefs: 1806; X64-AVX512: ## %bb.0: 1807; X64-AVX512-NEXT: vmovss (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x0f] 1808; X64-AVX512-NEXT: ## xmm1 = mem[0],zero,zero,zero 1809; X64-AVX512-NEXT: vmovlhps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x16,0xc0] 1810; X64-AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[0] 1811; X64-AVX512-NEXT: retq ## encoding: [0xc3] 1812 %1 = load float, float* %b, align 4 1813 %2 = insertelement <4 x float> undef, float %1, i32 0 1814 %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 4, i32 undef, i32 0, i32 7> 1815 ret <4 x float> %result 1816} 1817 1818; Test for a bug in X86ISelLowering.cpp:getINSERTPS where we were using 1819; the destination index to change the load, instead of the source index. 
; The shuffle mask <4, undef, 6, 2> over (%load, %a) produces
; { %a[0], undef, %a[2], %load[2] }. The assertions below expect a single
; (v)insertps with immediate 178 (0xb2), decoded there as
; xmm0[0],zero,xmm0[2],xmm1[2] - element 2 of the loaded vector goes into
; lane 3 and the undef lane 1 is zeroed - with the load kept as a separate
; (v)movaps in every run configuration.
1820define <4 x float> @pr20087(<4 x float> %a, <4 x float> *%ptr) { 1821; X86-SSE-LABEL: pr20087: 1822; X86-SSE: ## %bb.0: 1823; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1824; X86-SSE-NEXT: movaps (%eax), %xmm1 ## encoding: [0x0f,0x28,0x08] 1825; X86-SSE-NEXT: insertps $178, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xb2] 1826; X86-SSE-NEXT: ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2] 1827; X86-SSE-NEXT: retl ## encoding: [0xc3] 1828; 1829; X86-AVX1-LABEL: pr20087: 1830; X86-AVX1: ## %bb.0: 1831; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1832; X86-AVX1-NEXT: vmovaps (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x08] 1833; X86-AVX1-NEXT: vinsertps $178, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb2] 1834; X86-AVX1-NEXT: ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2] 1835; X86-AVX1-NEXT: retl ## encoding: [0xc3] 1836; 1837; X86-AVX512-LABEL: pr20087: 1838; X86-AVX512: ## %bb.0: 1839; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1840; X86-AVX512-NEXT: vmovaps (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x08] 1841; X86-AVX512-NEXT: vinsertps $178, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb2] 1842; X86-AVX512-NEXT: ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2] 1843; X86-AVX512-NEXT: retl ## encoding: [0xc3] 1844; 1845; X64-SSE-LABEL: pr20087: 1846; X64-SSE: ## %bb.0: 1847; X64-SSE-NEXT: movaps (%rdi), %xmm1 ## encoding: [0x0f,0x28,0x0f] 1848; X64-SSE-NEXT: insertps $178, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xb2] 1849; X64-SSE-NEXT: ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2] 1850; X64-SSE-NEXT: retq ## encoding: [0xc3] 1851; 1852; X64-AVX1-LABEL: pr20087: 1853; X64-AVX1: ## %bb.0: 1854; X64-AVX1-NEXT: vmovaps (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0f] 1855; X64-AVX1-NEXT: vinsertps $178, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb2] 1856; X64-AVX1-NEXT: ## 
xmm0 = xmm0[0],zero,xmm0[2],xmm1[2] 1857; X64-AVX1-NEXT: retq ## encoding: [0xc3] 1858; 1859; X64-AVX512-LABEL: pr20087: 1860; X64-AVX512: ## %bb.0: 1861; X64-AVX512-NEXT: vmovaps (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0f] 1862; X64-AVX512-NEXT: vinsertps $178, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb2] 1863; X64-AVX512-NEXT: ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2] 1864; X64-AVX512-NEXT: retq ## encoding: [0xc3] 1865 %load = load <4 x float> , <4 x float> *%ptr 1866 %ret = shufflevector <4 x float> %load, <4 x float> %a, <4 x i32> <i32 4, i32 undef, i32 6, i32 2> 1867 ret <4 x float> %ret 1868} 1869 1870; Edge case for insertps where we end up with a shuffle with mask=<0, 7, -1, -1> 1871define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, i32* noalias nocapture %RET) #1 { 1872; X86-SSE-LABEL: insertps_pr20411: 1873; X86-SSE: ## %bb.0: 1874; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1875; X86-SSE-NEXT: pshufd $238, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0xee] 1876; X86-SSE-NEXT: ## xmm1 = xmm1[2,3,2,3] 1877; X86-SSE-NEXT: pblendw $243, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc8,0xf3] 1878; X86-SSE-NEXT: ## xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] 1879; X86-SSE-NEXT: movdqu %xmm1, (%eax) ## encoding: [0xf3,0x0f,0x7f,0x08] 1880; X86-SSE-NEXT: retl ## encoding: [0xc3] 1881; 1882; X86-AVX1-LABEL: insertps_pr20411: 1883; X86-AVX1: ## %bb.0: 1884; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1885; X86-AVX1-NEXT: vpermilps $238, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee] 1886; X86-AVX1-NEXT: ## xmm1 = xmm1[2,3,2,3] 1887; X86-AVX1-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02] 1888; X86-AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 1889; X86-AVX1-NEXT: vmovups %xmm0, (%eax) ## encoding: [0xc5,0xf8,0x11,0x00] 1890; X86-AVX1-NEXT: retl ## 
encoding: [0xc3] 1891; 1892; X86-AVX512-LABEL: insertps_pr20411: 1893; X86-AVX512: ## %bb.0: 1894; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1895; X86-AVX512-NEXT: vpermilps $238, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee] 1896; X86-AVX512-NEXT: ## xmm1 = xmm1[2,3,2,3] 1897; X86-AVX512-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02] 1898; X86-AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 1899; X86-AVX512-NEXT: vmovups %xmm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x00] 1900; X86-AVX512-NEXT: retl ## encoding: [0xc3] 1901; 1902; X64-SSE-LABEL: insertps_pr20411: 1903; X64-SSE: ## %bb.0: 1904; X64-SSE-NEXT: pshufd $238, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0xee] 1905; X64-SSE-NEXT: ## xmm1 = xmm1[2,3,2,3] 1906; X64-SSE-NEXT: pblendw $243, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc8,0xf3] 1907; X64-SSE-NEXT: ## xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] 1908; X64-SSE-NEXT: movdqu %xmm1, (%rdi) ## encoding: [0xf3,0x0f,0x7f,0x0f] 1909; X64-SSE-NEXT: retq ## encoding: [0xc3] 1910; 1911; X64-AVX1-LABEL: insertps_pr20411: 1912; X64-AVX1: ## %bb.0: 1913; X64-AVX1-NEXT: vpermilps $238, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee] 1914; X64-AVX1-NEXT: ## xmm1 = xmm1[2,3,2,3] 1915; X64-AVX1-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02] 1916; X64-AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 1917; X64-AVX1-NEXT: vmovups %xmm0, (%rdi) ## encoding: [0xc5,0xf8,0x11,0x07] 1918; X64-AVX1-NEXT: retq ## encoding: [0xc3] 1919; 1920; X64-AVX512-LABEL: insertps_pr20411: 1921; X64-AVX512: ## %bb.0: 1922; X64-AVX512-NEXT: vpermilps $238, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee] 1923; X64-AVX512-NEXT: ## xmm1 = xmm1[2,3,2,3] 1924; X64-AVX512-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02] 1925; X64-AVX512-NEXT: ## 
xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 1926; X64-AVX512-NEXT: vmovups %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07] 1927; X64-AVX512-NEXT: retq ## encoding: [0xc3] 1928 %shuffle117 = shufflevector <4 x i32> %shuffle109, <4 x i32> %shuffle116, <4 x i32> <i32 0, i32 7, i32 undef, i32 undef> 1929 %ptrcast = bitcast i32* %RET to <4 x i32>* 1930 store <4 x i32> %shuffle117, <4 x i32>* %ptrcast, align 4 1931 ret void 1932} 1933 1934define <4 x float> @insertps_4(<4 x float> %A, <4 x float> %B) { 1935; SSE-LABEL: insertps_4: 1936; SSE: ## %bb.0: 1937; SSE-NEXT: insertps $170, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xaa] 1938; SSE-NEXT: ## xmm0 = xmm0[0],zero,xmm1[2],zero 1939; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1940; 1941; AVX1-LABEL: insertps_4: 1942; AVX1: ## %bb.0: 1943; AVX1-NEXT: vinsertps $170, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xaa] 1944; AVX1-NEXT: ## xmm0 = xmm0[0],zero,xmm1[2],zero 1945; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1946; 1947; AVX512-LABEL: insertps_4: 1948; AVX512: ## %bb.0: 1949; AVX512-NEXT: vinsertps $170, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xaa] 1950; AVX512-NEXT: ## xmm0 = xmm0[0],zero,xmm1[2],zero 1951; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1952 %vecext = extractelement <4 x float> %A, i32 0 1953 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 1954 %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1 1955 %vecext2 = extractelement <4 x float> %B, i32 2 1956 %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2 1957 %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3 1958 ret <4 x float> %vecinit4 1959} 1960 1961define <4 x float> @insertps_5(<4 x float> %A, <4 x float> %B) { 1962; SSE-LABEL: insertps_5: 1963; SSE: ## %bb.0: 1964; SSE-NEXT: insertps $92, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x5c] 1965; SSE-NEXT: ## xmm0 = 
xmm0[0],xmm1[1],zero,zero 1966; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1967; 1968; AVX1-LABEL: insertps_5: 1969; AVX1: ## %bb.0: 1970; AVX1-NEXT: vinsertps $92, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x5c] 1971; AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1],zero,zero 1972; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1973; 1974; AVX512-LABEL: insertps_5: 1975; AVX512: ## %bb.0: 1976; AVX512-NEXT: vinsertps $92, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x5c] 1977; AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1],zero,zero 1978; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1979 %vecext = extractelement <4 x float> %A, i32 0 1980 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 1981 %vecext1 = extractelement <4 x float> %B, i32 1 1982 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1 1983 %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2 1984 %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3 1985 ret <4 x float> %vecinit4 1986} 1987 1988define <4 x float> @insertps_6(<4 x float> %A, <4 x float> %B) { 1989; SSE-LABEL: insertps_6: 1990; SSE: ## %bb.0: 1991; SSE-NEXT: insertps $169, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xa9] 1992; SSE-NEXT: ## xmm0 = zero,xmm0[1],xmm1[2],zero 1993; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1994; 1995; AVX1-LABEL: insertps_6: 1996; AVX1: ## %bb.0: 1997; AVX1-NEXT: vinsertps $169, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xa9] 1998; AVX1-NEXT: ## xmm0 = zero,xmm0[1],xmm1[2],zero 1999; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 2000; 2001; AVX512-LABEL: insertps_6: 2002; AVX512: ## %bb.0: 2003; AVX512-NEXT: vinsertps $169, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xa9] 2004; AVX512-NEXT: ## xmm0 = zero,xmm0[1],xmm1[2],zero 2005; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 2006 %vecext = extractelement <4 x float> %A, i32 1 2007 
%vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1 2008 %vecext1 = extractelement <4 x float> %B, i32 2 2009 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2 2010 %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3 2011 ret <4 x float> %vecinit3 2012} 2013 2014define <4 x float> @insertps_7(<4 x float> %A, <4 x float> %B) { 2015; SSE-LABEL: insertps_7: 2016; SSE: ## %bb.0: 2017; SSE-NEXT: insertps $106, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x6a] 2018; SSE-NEXT: ## xmm0 = xmm0[0],zero,xmm1[1],zero 2019; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 2020; 2021; AVX1-LABEL: insertps_7: 2022; AVX1: ## %bb.0: 2023; AVX1-NEXT: vinsertps $106, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x6a] 2024; AVX1-NEXT: ## xmm0 = xmm0[0],zero,xmm1[1],zero 2025; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 2026; 2027; AVX512-LABEL: insertps_7: 2028; AVX512: ## %bb.0: 2029; AVX512-NEXT: vinsertps $106, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x6a] 2030; AVX512-NEXT: ## xmm0 = xmm0[0],zero,xmm1[1],zero 2031; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 2032 %vecext = extractelement <4 x float> %A, i32 0 2033 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 2034 %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1 2035 %vecext2 = extractelement <4 x float> %B, i32 1 2036 %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2 2037 %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3 2038 ret <4 x float> %vecinit4 2039} 2040 2041define <4 x float> @insertps_8(<4 x float> %A, <4 x float> %B) { 2042; SSE-LABEL: insertps_8: 2043; SSE: ## %bb.0: 2044; SSE-NEXT: insertps $28, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x1c] 2045; SSE-NEXT: ## xmm0 = xmm0[0],xmm1[0],zero,zero 2046; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 2047; 
2048; AVX1-LABEL: insertps_8: 2049; AVX1: ## %bb.0: 2050; AVX1-NEXT: vinsertps $28, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x1c] 2051; AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[0],zero,zero 2052; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 2053; 2054; AVX512-LABEL: insertps_8: 2055; AVX512: ## %bb.0: 2056; AVX512-NEXT: vinsertps $28, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x1c] 2057; AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[0],zero,zero 2058; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 2059 %vecext = extractelement <4 x float> %A, i32 0 2060 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 2061 %vecext1 = extractelement <4 x float> %B, i32 0 2062 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1 2063 %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2 2064 %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3 2065 ret <4 x float> %vecinit4 2066} 2067 2068define <4 x float> @insertps_9(<4 x float> %A, <4 x float> %B) { 2069; SSE-LABEL: insertps_9: 2070; SSE: ## %bb.0: 2071; SSE-NEXT: insertps $25, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x21,0xc8,0x19] 2072; SSE-NEXT: ## xmm1 = zero,xmm0[0],xmm1[2],zero 2073; SSE-NEXT: movaps %xmm1, %xmm0 ## encoding: [0x0f,0x28,0xc1] 2074; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 2075; 2076; AVX1-LABEL: insertps_9: 2077; AVX1: ## %bb.0: 2078; AVX1-NEXT: vinsertps $25, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x21,0xc0,0x19] 2079; AVX1-NEXT: ## xmm0 = zero,xmm0[0],xmm1[2],zero 2080; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 2081; 2082; AVX512-LABEL: insertps_9: 2083; AVX512: ## %bb.0: 2084; AVX512-NEXT: vinsertps $25, %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xc0,0x19] 2085; AVX512-NEXT: ## xmm0 = zero,xmm0[0],xmm1[2],zero 2086; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 2087 %vecext = extractelement <4 x float> %A, i32 0 2088 %vecinit = 
insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1 2089 %vecext1 = extractelement <4 x float> %B, i32 2 2090 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2 2091 %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3 2092 ret <4 x float> %vecinit3 2093} 2094 2095define <4 x float> @insertps_10(<4 x float> %A) { 2096; SSE-LABEL: insertps_10: 2097; SSE: ## %bb.0: 2098; SSE-NEXT: insertps $42, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc0,0x2a] 2099; SSE-NEXT: ## xmm0 = xmm0[0],zero,xmm0[0],zero 2100; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 2101; 2102; AVX1-LABEL: insertps_10: 2103; AVX1: ## %bb.0: 2104; AVX1-NEXT: vinsertps $42, %xmm0, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc0,0x2a] 2105; AVX1-NEXT: ## xmm0 = xmm0[0],zero,xmm0[0],zero 2106; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 2107; 2108; AVX512-LABEL: insertps_10: 2109; AVX512: ## %bb.0: 2110; AVX512-NEXT: vinsertps $42, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc0,0x2a] 2111; AVX512-NEXT: ## xmm0 = xmm0[0],zero,xmm0[0],zero 2112; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 2113 %vecext = extractelement <4 x float> %A, i32 0 2114 %vecbuild1 = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %vecext, i32 0 2115 %vecbuild2 = insertelement <4 x float> %vecbuild1, float %vecext, i32 2 2116 ret <4 x float> %vecbuild2 2117} 2118 2119define <4 x float> @build_vector_to_shuffle_1(<4 x float> %A) { 2120; SSE-LABEL: build_vector_to_shuffle_1: 2121; SSE: ## %bb.0: 2122; SSE-NEXT: xorps %xmm1, %xmm1 ## encoding: [0x0f,0x57,0xc9] 2123; SSE-NEXT: blendps $5, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x05] 2124; SSE-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] 2125; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 2126; 2127; AVX1-LABEL: build_vector_to_shuffle_1: 2128; AVX1: ## %bb.0: 2129; 
AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] 2130; AVX1-NEXT: vblendps $10, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x0a] 2131; AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] 2132; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 2133; 2134; AVX512-LABEL: build_vector_to_shuffle_1: 2135; AVX512: ## %bb.0: 2136; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] 2137; AVX512-NEXT: vblendps $10, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x0a] 2138; AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] 2139; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 2140 %vecext = extractelement <4 x float> %A, i32 1 2141 %vecinit = insertelement <4 x float> zeroinitializer, float %vecext, i32 1 2142 %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 2 2143 %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %A, <4 x i32> <i32 0, i32 1, i32 2, i32 7> 2144 ret <4 x float> %vecinit3 2145} 2146 2147define <4 x float> @build_vector_to_shuffle_2(<4 x float> %A) { 2148; SSE-LABEL: build_vector_to_shuffle_2: 2149; SSE: ## %bb.0: 2150; SSE-NEXT: xorps %xmm1, %xmm1 ## encoding: [0x0f,0x57,0xc9] 2151; SSE-NEXT: blendps $13, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x0d] 2152; SSE-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 2153; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 2154; 2155; AVX1-LABEL: build_vector_to_shuffle_2: 2156; AVX1: ## %bb.0: 2157; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] 2158; AVX1-NEXT: vblendps $2, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x02] 2159; AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 2160; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 2161; 2162; AVX512-LABEL: build_vector_to_shuffle_2: 2163; AVX512: ## %bb.0: 2164; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] 2165; AVX512-NEXT: vblendps $2, %xmm0, %xmm1, %xmm0 ## 
encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x02] 2166; AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 2167; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 2168 %vecext = extractelement <4 x float> %A, i32 1 2169 %vecinit = insertelement <4 x float> zeroinitializer, float %vecext, i32 1 2170 %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 2 2171 ret <4 x float> %vecinit1 2172} 2173