1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F 7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL 8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW 9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW 10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512VBMI2 11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVBMI2 12; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1 13; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2 14 15; Just one 32-bit run to make sure we do reasonable things for i64 cases. 
16; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86-SSE2 17 18declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>) 19declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) 20declare <8 x i16> @llvm.fshr.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) 21declare <16 x i8> @llvm.fshr.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) 22 23; 24; Variable Shifts 25; 26 27define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind { 28; SSE2-LABEL: var_funnnel_v2i64: 29; SSE2: # %bb.0: 30; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [63,63] 31; SSE2-NEXT: pxor %xmm3, %xmm3 32; SSE2-NEXT: psubq %xmm1, %xmm3 33; SSE2-NEXT: pand %xmm2, %xmm1 34; SSE2-NEXT: movdqa %xmm0, %xmm4 35; SSE2-NEXT: psrlq %xmm1, %xmm4 36; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 37; SSE2-NEXT: movdqa %xmm0, %xmm5 38; SSE2-NEXT: psrlq %xmm1, %xmm5 39; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] 40; SSE2-NEXT: pand %xmm2, %xmm3 41; SSE2-NEXT: movdqa %xmm0, %xmm1 42; SSE2-NEXT: psllq %xmm3, %xmm1 43; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] 44; SSE2-NEXT: psllq %xmm2, %xmm0 45; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 46; SSE2-NEXT: orpd %xmm5, %xmm0 47; SSE2-NEXT: retq 48; 49; SSE41-LABEL: var_funnnel_v2i64: 50; SSE41: # %bb.0: 51; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [63,63] 52; SSE41-NEXT: pxor %xmm3, %xmm3 53; SSE41-NEXT: psubq %xmm1, %xmm3 54; SSE41-NEXT: pand %xmm2, %xmm1 55; SSE41-NEXT: movdqa %xmm0, %xmm4 56; SSE41-NEXT: psrlq %xmm1, %xmm4 57; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 58; SSE41-NEXT: movdqa %xmm0, %xmm5 59; SSE41-NEXT: psrlq %xmm1, %xmm5 60; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm4[0,1,2,3],xmm5[4,5,6,7] 61; SSE41-NEXT: pand %xmm2, %xmm3 62; SSE41-NEXT: movdqa %xmm0, %xmm1 63; SSE41-NEXT: psllq %xmm3, %xmm1 64; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] 65; SSE41-NEXT: psllq %xmm2, %xmm0 66; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] 67; SSE41-NEXT: por %xmm5, 
%xmm0 68; SSE41-NEXT: retq 69; 70; AVX1-LABEL: var_funnnel_v2i64: 71; AVX1: # %bb.0: 72; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63] 73; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 74; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm4 75; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] 76; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm3 77; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] 78; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 79; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1 80; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 81; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2 82; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 83; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 84; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] 85; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0 86; AVX1-NEXT: retq 87; 88; AVX2-LABEL: var_funnnel_v2i64: 89; AVX2: # %bb.0: 90; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63] 91; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3 92; AVX2-NEXT: vpsrlvq %xmm3, %xmm0, %xmm3 93; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 94; AVX2-NEXT: vpsubq %xmm1, %xmm4, %xmm1 95; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 96; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 97; AVX2-NEXT: vpor %xmm0, %xmm3, %xmm0 98; AVX2-NEXT: retq 99; 100; AVX512F-LABEL: var_funnnel_v2i64: 101; AVX512F: # %bb.0: 102; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 103; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 104; AVX512F-NEXT: vprorvq %zmm1, %zmm0, %zmm0 105; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 106; AVX512F-NEXT: vzeroupper 107; AVX512F-NEXT: retq 108; 109; AVX512VL-LABEL: var_funnnel_v2i64: 110; AVX512VL: # %bb.0: 111; AVX512VL-NEXT: vprorvq %xmm1, %xmm0, %xmm0 112; AVX512VL-NEXT: retq 113; 114; AVX512BW-LABEL: var_funnnel_v2i64: 115; AVX512BW: # %bb.0: 116; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 117; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 118; AVX512BW-NEXT: vprorvq %zmm1, %zmm0, %zmm0 119; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 120; AVX512BW-NEXT: 
vzeroupper 121; AVX512BW-NEXT: retq 122; 123; AVX512VLBW-LABEL: var_funnnel_v2i64: 124; AVX512VLBW: # %bb.0: 125; AVX512VLBW-NEXT: vprorvq %xmm1, %xmm0, %xmm0 126; AVX512VLBW-NEXT: retq 127; 128; AVX512VBMI2-LABEL: var_funnnel_v2i64: 129; AVX512VBMI2: # %bb.0: 130; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 131; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 132; AVX512VBMI2-NEXT: vprorvq %zmm1, %zmm0, %zmm0 133; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 134; AVX512VBMI2-NEXT: vzeroupper 135; AVX512VBMI2-NEXT: retq 136; 137; AVX512VLVBMI2-LABEL: var_funnnel_v2i64: 138; AVX512VLVBMI2: # %bb.0: 139; AVX512VLVBMI2-NEXT: vprorvq %xmm1, %xmm0, %xmm0 140; AVX512VLVBMI2-NEXT: retq 141; 142; XOP-LABEL: var_funnnel_v2i64: 143; XOP: # %bb.0: 144; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 145; XOP-NEXT: vpsubq %xmm1, %xmm2, %xmm1 146; XOP-NEXT: vprotq %xmm1, %xmm0, %xmm0 147; XOP-NEXT: retq 148; 149; X86-SSE2-LABEL: var_funnnel_v2i64: 150; X86-SSE2: # %bb.0: 151; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [63,0,63,0] 152; X86-SSE2-NEXT: pxor %xmm3, %xmm3 153; X86-SSE2-NEXT: psubq %xmm1, %xmm3 154; X86-SSE2-NEXT: pand %xmm2, %xmm1 155; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 156; X86-SSE2-NEXT: psrlq %xmm1, %xmm4 157; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 158; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 159; X86-SSE2-NEXT: psrlq %xmm1, %xmm5 160; X86-SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] 161; X86-SSE2-NEXT: pand %xmm2, %xmm3 162; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 163; X86-SSE2-NEXT: psllq %xmm3, %xmm1 164; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] 165; X86-SSE2-NEXT: psllq %xmm2, %xmm0 166; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 167; X86-SSE2-NEXT: orpd %xmm5, %xmm0 168; X86-SSE2-NEXT: retl 169 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> %amt) 170 ret <2 x i64> %res 171} 172 173define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %amt) nounwind { 174; SSE2-LABEL: 
var_funnnel_v4i32: 175; SSE2: # %bb.0: 176; SSE2-NEXT: pxor %xmm2, %xmm2 177; SSE2-NEXT: psubd %xmm1, %xmm2 178; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 179; SSE2-NEXT: pslld $23, %xmm2 180; SSE2-NEXT: paddd {{.*}}(%rip), %xmm2 181; SSE2-NEXT: cvttps2dq %xmm2, %xmm1 182; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 183; SSE2-NEXT: pmuludq %xmm1, %xmm0 184; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] 185; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 186; SSE2-NEXT: pmuludq %xmm2, %xmm1 187; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] 188; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 189; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 190; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 191; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 192; SSE2-NEXT: por %xmm3, %xmm0 193; SSE2-NEXT: retq 194; 195; SSE41-LABEL: var_funnnel_v4i32: 196; SSE41: # %bb.0: 197; SSE41-NEXT: pxor %xmm2, %xmm2 198; SSE41-NEXT: psubd %xmm1, %xmm2 199; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 200; SSE41-NEXT: pslld $23, %xmm2 201; SSE41-NEXT: paddd {{.*}}(%rip), %xmm2 202; SSE41-NEXT: cvttps2dq %xmm2, %xmm1 203; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 204; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 205; SSE41-NEXT: pmuludq %xmm2, %xmm3 206; SSE41-NEXT: pmuludq %xmm1, %xmm0 207; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] 208; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] 209; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] 210; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 211; SSE41-NEXT: por %xmm1, %xmm0 212; SSE41-NEXT: retq 213; 214; AVX1-LABEL: var_funnnel_v4i32: 215; AVX1: # %bb.0: 216; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 217; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 218; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 219; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 220; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1 221; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 222; 
AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 223; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 224; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 225; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 226; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] 227; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 228; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] 229; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 230; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 231; AVX1-NEXT: retq 232; 233; AVX2-LABEL: var_funnnel_v4i32: 234; AVX2: # %bb.0: 235; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 236; AVX2-NEXT: vpsubd %xmm1, %xmm2, %xmm1 237; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] 238; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 239; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm2 240; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32] 241; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1 242; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 243; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 244; AVX2-NEXT: retq 245; 246; AVX512F-LABEL: var_funnnel_v4i32: 247; AVX512F: # %bb.0: 248; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 249; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 250; AVX512F-NEXT: vprorvd %zmm1, %zmm0, %zmm0 251; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 252; AVX512F-NEXT: vzeroupper 253; AVX512F-NEXT: retq 254; 255; AVX512VL-LABEL: var_funnnel_v4i32: 256; AVX512VL: # %bb.0: 257; AVX512VL-NEXT: vprorvd %xmm1, %xmm0, %xmm0 258; AVX512VL-NEXT: retq 259; 260; AVX512BW-LABEL: var_funnnel_v4i32: 261; AVX512BW: # %bb.0: 262; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 263; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 264; AVX512BW-NEXT: vprorvd %zmm1, %zmm0, %zmm0 265; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 266; AVX512BW-NEXT: vzeroupper 267; AVX512BW-NEXT: retq 268; 269; AVX512VLBW-LABEL: var_funnnel_v4i32: 270; AVX512VLBW: # %bb.0: 271; AVX512VLBW-NEXT: vprorvd %xmm1, %xmm0, %xmm0 
272; AVX512VLBW-NEXT: retq 273; 274; AVX512VBMI2-LABEL: var_funnnel_v4i32: 275; AVX512VBMI2: # %bb.0: 276; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 277; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 278; AVX512VBMI2-NEXT: vprorvd %zmm1, %zmm0, %zmm0 279; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 280; AVX512VBMI2-NEXT: vzeroupper 281; AVX512VBMI2-NEXT: retq 282; 283; AVX512VLVBMI2-LABEL: var_funnnel_v4i32: 284; AVX512VLVBMI2: # %bb.0: 285; AVX512VLVBMI2-NEXT: vprorvd %xmm1, %xmm0, %xmm0 286; AVX512VLVBMI2-NEXT: retq 287; 288; XOP-LABEL: var_funnnel_v4i32: 289; XOP: # %bb.0: 290; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 291; XOP-NEXT: vpsubd %xmm1, %xmm2, %xmm1 292; XOP-NEXT: vprotd %xmm1, %xmm0, %xmm0 293; XOP-NEXT: retq 294; 295; X86-SSE2-LABEL: var_funnnel_v4i32: 296; X86-SSE2: # %bb.0: 297; X86-SSE2-NEXT: pxor %xmm2, %xmm2 298; X86-SSE2-NEXT: psubd %xmm1, %xmm2 299; X86-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm2 300; X86-SSE2-NEXT: pslld $23, %xmm2 301; X86-SSE2-NEXT: paddd {{\.LCPI.*}}, %xmm2 302; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm1 303; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 304; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0 305; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] 306; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 307; X86-SSE2-NEXT: pmuludq %xmm2, %xmm1 308; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] 309; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 310; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 311; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 312; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 313; X86-SSE2-NEXT: por %xmm3, %xmm0 314; X86-SSE2-NEXT: retl 315 %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> %amt) 316 ret <4 x i32> %res 317} 318 319define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind { 320; SSE2-LABEL: var_funnnel_v8i16: 321; SSE2: # %bb.0: 322; 
SSE2-NEXT: pxor %xmm2, %xmm2 323; SSE2-NEXT: psubw %xmm1, %xmm2 324; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 325; SSE2-NEXT: movdqa %xmm2, %xmm1 326; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 327; SSE2-NEXT: pslld $23, %xmm1 328; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] 329; SSE2-NEXT: paddd %xmm3, %xmm1 330; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 331; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 332; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] 333; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 334; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] 335; SSE2-NEXT: pslld $23, %xmm2 336; SSE2-NEXT: paddd %xmm3, %xmm2 337; SSE2-NEXT: cvttps2dq %xmm2, %xmm2 338; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] 339; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] 340; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 341; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] 342; SSE2-NEXT: movdqa %xmm0, %xmm1 343; SSE2-NEXT: pmulhuw %xmm2, %xmm1 344; SSE2-NEXT: pmullw %xmm2, %xmm0 345; SSE2-NEXT: por %xmm1, %xmm0 346; SSE2-NEXT: retq 347; 348; SSE41-LABEL: var_funnnel_v8i16: 349; SSE41: # %bb.0: 350; SSE41-NEXT: pxor %xmm2, %xmm2 351; SSE41-NEXT: psubw %xmm1, %xmm2 352; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 353; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero 354; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] 355; SSE41-NEXT: pslld $23, %xmm2 356; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] 357; SSE41-NEXT: paddd %xmm3, %xmm2 358; SSE41-NEXT: cvttps2dq %xmm2, %xmm2 359; SSE41-NEXT: pslld $23, %xmm1 360; SSE41-NEXT: paddd %xmm3, %xmm1 361; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 362; SSE41-NEXT: packusdw %xmm2, %xmm1 363; SSE41-NEXT: movdqa %xmm0, %xmm2 364; SSE41-NEXT: pmulhuw %xmm1, %xmm2 365; SSE41-NEXT: pmullw %xmm1, %xmm0 366; SSE41-NEXT: por %xmm2, %xmm0 367; SSE41-NEXT: retq 368; 
369; AVX1-LABEL: var_funnnel_v8i16: 370; AVX1: # %bb.0: 371; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 372; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1 373; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 374; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4,4,5,5,6,6,7,7] 375; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 376; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] 377; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 378; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 379; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 380; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 381; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 382; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 383; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 384; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 385; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 386; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 387; AVX1-NEXT: retq 388; 389; AVX2-LABEL: var_funnnel_v8i16: 390; AVX2: # %bb.0: 391; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 392; AVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm1 393; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 394; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16] 395; AVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm2 396; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 397; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 398; AVX2-NEXT: vpsrlvd %ymm2, %ymm0, %ymm2 399; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 400; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 401; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 402; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 403; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 404; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 405; AVX2-NEXT: vpor 
%xmm2, %xmm0, %xmm0 406; AVX2-NEXT: vzeroupper 407; AVX2-NEXT: retq 408; 409; AVX512F-LABEL: var_funnnel_v8i16: 410; AVX512F: # %bb.0: 411; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 412; AVX512F-NEXT: vpsubw %xmm1, %xmm2, %xmm1 413; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 414; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 415; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 416; AVX512F-NEXT: vpsllvd %ymm2, %ymm0, %ymm2 417; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 418; AVX512F-NEXT: vpsubw %xmm1, %xmm3, %xmm1 419; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 420; AVX512F-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 421; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0 422; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 423; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 424; AVX512F-NEXT: vzeroupper 425; AVX512F-NEXT: retq 426; 427; AVX512VL-LABEL: var_funnnel_v8i16: 428; AVX512VL: # %bb.0: 429; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 430; AVX512VL-NEXT: vpsubw %xmm1, %xmm2, %xmm1 431; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 432; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 433; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 434; AVX512VL-NEXT: vpsllvd %ymm2, %ymm0, %ymm2 435; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 436; AVX512VL-NEXT: vpsubw %xmm1, %xmm3, %xmm1 437; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 438; AVX512VL-NEXT: vpsrlvd %ymm1, 
%ymm0, %ymm0 439; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0 440; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 441; AVX512VL-NEXT: vzeroupper 442; AVX512VL-NEXT: retq 443; 444; AVX512BW-LABEL: var_funnnel_v8i16: 445; AVX512BW: # %bb.0: 446; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 447; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 448; AVX512BW-NEXT: vpsubw %xmm1, %xmm2, %xmm1 449; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 450; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2 451; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 452; AVX512BW-NEXT: vpsubw %xmm1, %xmm3, %xmm1 453; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 454; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0 455; AVX512BW-NEXT: vzeroupper 456; AVX512BW-NEXT: retq 457; 458; AVX512VLBW-LABEL: var_funnnel_v8i16: 459; AVX512VLBW: # %bb.0: 460; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 461; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm2, %xmm1 462; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 463; AVX512VLBW-NEXT: vpsllvw %xmm1, %xmm0, %xmm2 464; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 465; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm3, %xmm1 466; AVX512VLBW-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0 467; AVX512VLBW-NEXT: vpor %xmm0, %xmm2, %xmm0 468; AVX512VLBW-NEXT: retq 469; 470; AVX512VBMI2-LABEL: var_funnnel_v8i16: 471; AVX512VBMI2: # %bb.0: 472; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 473; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 474; AVX512VBMI2-NEXT: vpshrdvw %zmm1, %zmm0, %zmm0 475; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 476; AVX512VBMI2-NEXT: vzeroupper 477; AVX512VBMI2-NEXT: retq 478; 479; AVX512VLVBMI2-LABEL: var_funnnel_v8i16: 480; AVX512VLVBMI2: # %bb.0: 481; AVX512VLVBMI2-NEXT: vpshrdvw %xmm1, %xmm0, %xmm0 482; AVX512VLVBMI2-NEXT: retq 483; 484; XOP-LABEL: var_funnnel_v8i16: 485; XOP: # %bb.0: 486; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 487; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1 488; XOP-NEXT: vprotw %xmm1, %xmm0, %xmm0 
489; XOP-NEXT: retq 490; 491; X86-SSE2-LABEL: var_funnnel_v8i16: 492; X86-SSE2: # %bb.0: 493; X86-SSE2-NEXT: pxor %xmm2, %xmm2 494; X86-SSE2-NEXT: psubw %xmm1, %xmm2 495; X86-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm2 496; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 497; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 498; X86-SSE2-NEXT: pslld $23, %xmm1 499; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] 500; X86-SSE2-NEXT: paddd %xmm3, %xmm1 501; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 502; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 503; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] 504; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 505; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] 506; X86-SSE2-NEXT: pslld $23, %xmm2 507; X86-SSE2-NEXT: paddd %xmm3, %xmm2 508; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm2 509; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] 510; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] 511; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 512; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] 513; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 514; X86-SSE2-NEXT: pmulhuw %xmm2, %xmm1 515; X86-SSE2-NEXT: pmullw %xmm2, %xmm0 516; X86-SSE2-NEXT: por %xmm1, %xmm0 517; X86-SSE2-NEXT: retl 518 %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %x, <8 x i16> %amt) 519 ret <8 x i16> %res 520} 521 522define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind { 523; SSE2-LABEL: var_funnnel_v16i8: 524; SSE2: # %bb.0: 525; SSE2-NEXT: movdqa %xmm0, %xmm2 526; SSE2-NEXT: pxor %xmm0, %xmm0 527; SSE2-NEXT: pxor %xmm3, %xmm3 528; SSE2-NEXT: psubb %xmm1, %xmm3 529; SSE2-NEXT: psllw $5, %xmm3 530; SSE2-NEXT: pxor %xmm1, %xmm1 531; SSE2-NEXT: pcmpgtb %xmm3, %xmm1 532; SSE2-NEXT: movdqa %xmm2, %xmm4 533; SSE2-NEXT: psrlw $4, %xmm4 534; SSE2-NEXT: pand {{.*}}(%rip), %xmm4 535; SSE2-NEXT: movdqa %xmm2, %xmm5 536; SSE2-NEXT: 
psllw $4, %xmm5 537; SSE2-NEXT: pand {{.*}}(%rip), %xmm5 538; SSE2-NEXT: por %xmm4, %xmm5 539; SSE2-NEXT: pand %xmm1, %xmm5 540; SSE2-NEXT: pandn %xmm2, %xmm1 541; SSE2-NEXT: por %xmm5, %xmm1 542; SSE2-NEXT: movdqa %xmm1, %xmm2 543; SSE2-NEXT: psrlw $6, %xmm2 544; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 545; SSE2-NEXT: movdqa %xmm1, %xmm4 546; SSE2-NEXT: psllw $2, %xmm4 547; SSE2-NEXT: pand {{.*}}(%rip), %xmm4 548; SSE2-NEXT: por %xmm2, %xmm4 549; SSE2-NEXT: paddb %xmm3, %xmm3 550; SSE2-NEXT: pxor %xmm2, %xmm2 551; SSE2-NEXT: pcmpgtb %xmm3, %xmm2 552; SSE2-NEXT: pand %xmm2, %xmm4 553; SSE2-NEXT: pandn %xmm1, %xmm2 554; SSE2-NEXT: por %xmm4, %xmm2 555; SSE2-NEXT: movdqa %xmm2, %xmm1 556; SSE2-NEXT: paddb %xmm2, %xmm1 557; SSE2-NEXT: movdqa %xmm2, %xmm4 558; SSE2-NEXT: psrlw $7, %xmm4 559; SSE2-NEXT: pand {{.*}}(%rip), %xmm4 560; SSE2-NEXT: por %xmm1, %xmm4 561; SSE2-NEXT: paddb %xmm3, %xmm3 562; SSE2-NEXT: pcmpgtb %xmm3, %xmm0 563; SSE2-NEXT: pand %xmm0, %xmm4 564; SSE2-NEXT: pandn %xmm2, %xmm0 565; SSE2-NEXT: por %xmm4, %xmm0 566; SSE2-NEXT: retq 567; 568; SSE41-LABEL: var_funnnel_v16i8: 569; SSE41: # %bb.0: 570; SSE41-NEXT: movdqa %xmm0, %xmm2 571; SSE41-NEXT: psrlw $4, %xmm0 572; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 573; SSE41-NEXT: movdqa %xmm2, %xmm3 574; SSE41-NEXT: psllw $4, %xmm3 575; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 576; SSE41-NEXT: por %xmm0, %xmm3 577; SSE41-NEXT: pxor %xmm0, %xmm0 578; SSE41-NEXT: psubb %xmm1, %xmm0 579; SSE41-NEXT: psllw $5, %xmm0 580; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 581; SSE41-NEXT: movdqa %xmm2, %xmm1 582; SSE41-NEXT: psrlw $6, %xmm1 583; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 584; SSE41-NEXT: movdqa %xmm2, %xmm3 585; SSE41-NEXT: psllw $2, %xmm3 586; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 587; SSE41-NEXT: por %xmm1, %xmm3 588; SSE41-NEXT: paddb %xmm0, %xmm0 589; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 590; SSE41-NEXT: movdqa %xmm2, %xmm1 591; SSE41-NEXT: paddb %xmm2, %xmm1 592; SSE41-NEXT: movdqa %xmm2, %xmm3 593; SSE41-NEXT: 
psrlw $7, %xmm3 594; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 595; SSE41-NEXT: por %xmm1, %xmm3 596; SSE41-NEXT: paddb %xmm0, %xmm0 597; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 598; SSE41-NEXT: movdqa %xmm2, %xmm0 599; SSE41-NEXT: retq 600; 601; AVX-LABEL: var_funnnel_v16i8: 602; AVX: # %bb.0: 603; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2 604; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 605; AVX-NEXT: vpsllw $4, %xmm0, %xmm3 606; AVX-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 607; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 608; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 609; AVX-NEXT: vpsubb %xmm1, %xmm3, %xmm1 610; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 611; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 612; AVX-NEXT: vpsrlw $6, %xmm0, %xmm2 613; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 614; AVX-NEXT: vpsllw $2, %xmm0, %xmm3 615; AVX-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 616; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 617; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 618; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 619; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2 620; AVX-NEXT: vpsrlw $7, %xmm0, %xmm3 621; AVX-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 622; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 623; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 624; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 625; AVX-NEXT: retq 626; 627; AVX512F-LABEL: var_funnnel_v16i8: 628; AVX512F: # %bb.0: 629; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 630; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3 631; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero,xmm3[8],zero,zero,zero,xmm3[9],zero,zero,zero,xmm3[10],zero,zero,zero,xmm3[11],zero,zero,zero,xmm3[12],zero,zero,zero,xmm3[13],zero,zero,zero,xmm3[14],zero,zero,zero,xmm3[15],zero,zero,zero 632; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 633; AVX512F-NEXT: vpsrlvd %zmm3, %zmm0, %zmm3 634; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4 635; AVX512F-NEXT: vpsubb %xmm1, %xmm4, %xmm1 636; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1 637; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 638; AVX512F-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 639; AVX512F-NEXT: vpord %zmm0, %zmm3, %zmm0 640; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 641; AVX512F-NEXT: vzeroupper 642; AVX512F-NEXT: retq 643; 644; AVX512VL-LABEL: var_funnnel_v16i8: 645; AVX512VL: # %bb.0: 646; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 647; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm3 648; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero,xmm3[8],zero,zero,zero,xmm3[9],zero,zero,zero,xmm3[10],zero,zero,zero,xmm3[11],zero,zero,zero,xmm3[12],zero,zero,zero,xmm3[13],zero,zero,zero,xmm3[14],zero,zero,zero,xmm3[15],zero,zero,zero 649; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 650; AVX512VL-NEXT: vpsrlvd %zmm3, %zmm0, %zmm3 651; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 652; AVX512VL-NEXT: vpsubb %xmm1, %xmm4, %xmm1 653; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1 654; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 655; AVX512VL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 656; AVX512VL-NEXT: vpord %zmm0, %zmm3, %zmm0 657; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 658; AVX512VL-NEXT: vzeroupper 659; AVX512VL-NEXT: retq 660; 661; AVX512BW-LABEL: var_funnnel_v16i8: 662; AVX512BW: # %bb.0: 663; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 664; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 665; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3 666; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero 667; AVX512BW-NEXT: vpsrlvw %zmm3, %zmm0, %zmm3 668; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4 669; AVX512BW-NEXT: 
vpsubb %xmm1, %xmm4, %xmm1 670; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1 671; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 672; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 673; AVX512BW-NEXT: vpor %ymm0, %ymm3, %ymm0 674; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 675; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 676; AVX512BW-NEXT: vzeroupper 677; AVX512BW-NEXT: retq 678; 679; AVX512VLBW-LABEL: var_funnnel_v16i8: 680; AVX512VLBW: # %bb.0: 681; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 682; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3 683; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero 684; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 685; AVX512VLBW-NEXT: vpsrlvw %ymm3, %ymm0, %ymm3 686; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4 687; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm4, %xmm1 688; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1 689; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 690; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 691; AVX512VLBW-NEXT: vpor %ymm0, %ymm3, %ymm0 692; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0 693; AVX512VLBW-NEXT: vzeroupper 694; AVX512VLBW-NEXT: retq 695; 696; AVX512VBMI2-LABEL: 
var_funnnel_v16i8: 697; AVX512VBMI2: # %bb.0: 698; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 699; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 700; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3 701; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero 702; AVX512VBMI2-NEXT: vpsrlvw %zmm3, %zmm0, %zmm3 703; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4 704; AVX512VBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1 705; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1 706; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 707; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 708; AVX512VBMI2-NEXT: vpor %ymm0, %ymm3, %ymm0 709; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0 710; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 711; AVX512VBMI2-NEXT: vzeroupper 712; AVX512VBMI2-NEXT: retq 713; 714; AVX512VLVBMI2-LABEL: var_funnnel_v16i8: 715; AVX512VLVBMI2: # %bb.0: 716; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 717; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3 718; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero 719; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 720; AVX512VLVBMI2-NEXT: vpsrlvw %ymm3, %ymm0, %ymm3 721; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4 722; AVX512VLVBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1 723; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1 724; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 725; AVX512VLVBMI2-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 726; AVX512VLVBMI2-NEXT: vpor %ymm0, %ymm3, %ymm0 727; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0 728; AVX512VLVBMI2-NEXT: vzeroupper 729; AVX512VLVBMI2-NEXT: retq 730; 731; XOP-LABEL: var_funnnel_v16i8: 732; XOP: # %bb.0: 733; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 734; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 735; XOP-NEXT: vprotb %xmm1, %xmm0, %xmm0 736; XOP-NEXT: retq 737; 738; X86-SSE2-LABEL: var_funnnel_v16i8: 739; X86-SSE2: # %bb.0: 740; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 741; X86-SSE2-NEXT: pxor %xmm0, %xmm0 742; X86-SSE2-NEXT: pxor %xmm3, %xmm3 743; X86-SSE2-NEXT: psubb %xmm1, %xmm3 744; X86-SSE2-NEXT: psllw $5, %xmm3 745; X86-SSE2-NEXT: pxor %xmm1, %xmm1 746; X86-SSE2-NEXT: pcmpgtb %xmm3, %xmm1 747; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 748; X86-SSE2-NEXT: psrlw $4, %xmm4 749; X86-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm4 750; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 751; X86-SSE2-NEXT: psllw $4, %xmm5 752; X86-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm5 753; X86-SSE2-NEXT: por %xmm4, %xmm5 754; X86-SSE2-NEXT: pand %xmm1, %xmm5 755; X86-SSE2-NEXT: pandn %xmm2, %xmm1 756; X86-SSE2-NEXT: por %xmm5, %xmm1 757; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 758; X86-SSE2-NEXT: psrlw $6, %xmm2 759; X86-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm2 760; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 
761; X86-SSE2-NEXT: psllw $2, %xmm4 762; X86-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm4 763; X86-SSE2-NEXT: por %xmm2, %xmm4 764; X86-SSE2-NEXT: paddb %xmm3, %xmm3 765; X86-SSE2-NEXT: pxor %xmm2, %xmm2 766; X86-SSE2-NEXT: pcmpgtb %xmm3, %xmm2 767; X86-SSE2-NEXT: pand %xmm2, %xmm4 768; X86-SSE2-NEXT: pandn %xmm1, %xmm2 769; X86-SSE2-NEXT: por %xmm4, %xmm2 770; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 771; X86-SSE2-NEXT: paddb %xmm2, %xmm1 772; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 773; X86-SSE2-NEXT: psrlw $7, %xmm4 774; X86-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm4 775; X86-SSE2-NEXT: por %xmm1, %xmm4 776; X86-SSE2-NEXT: paddb %xmm3, %xmm3 777; X86-SSE2-NEXT: pcmpgtb %xmm3, %xmm0 778; X86-SSE2-NEXT: pand %xmm0, %xmm4 779; X86-SSE2-NEXT: pandn %xmm2, %xmm0 780; X86-SSE2-NEXT: por %xmm4, %xmm0 781; X86-SSE2-NEXT: retl 782 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %x, <16 x i8> %amt) 783 ret <16 x i8> %res 784} 785 786; 787; Uniform Variable Shifts 788; 789 790define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind { 791; SSE-LABEL: splatvar_funnnel_v2i64: 792; SSE: # %bb.0: 793; SSE-NEXT: movdqa {{.*#+}} xmm2 = [63,63] 794; SSE-NEXT: pxor %xmm3, %xmm3 795; SSE-NEXT: psubq %xmm1, %xmm3 796; SSE-NEXT: pand %xmm2, %xmm1 797; SSE-NEXT: movdqa %xmm0, %xmm4 798; SSE-NEXT: psrlq %xmm1, %xmm4 799; SSE-NEXT: pand %xmm2, %xmm3 800; SSE-NEXT: psllq %xmm3, %xmm0 801; SSE-NEXT: por %xmm4, %xmm0 802; SSE-NEXT: retq 803; 804; AVX-LABEL: splatvar_funnnel_v2i64: 805; AVX: # %bb.0: 806; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63] 807; AVX-NEXT: vpand %xmm2, %xmm1, %xmm3 808; AVX-NEXT: vpsrlq %xmm3, %xmm0, %xmm3 809; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 810; AVX-NEXT: vpsubq %xmm1, %xmm4, %xmm1 811; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 812; AVX-NEXT: vpsllq %xmm1, %xmm0, %xmm0 813; AVX-NEXT: vpor %xmm0, %xmm3, %xmm0 814; AVX-NEXT: retq 815; 816; AVX512F-LABEL: splatvar_funnnel_v2i64: 817; AVX512F: # %bb.0: 818; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 
819; AVX512F-NEXT: vpbroadcastq %xmm1, %xmm1 820; AVX512F-NEXT: vprorvq %zmm1, %zmm0, %zmm0 821; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 822; AVX512F-NEXT: vzeroupper 823; AVX512F-NEXT: retq 824; 825; AVX512VL-LABEL: splatvar_funnnel_v2i64: 826; AVX512VL: # %bb.0: 827; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1 828; AVX512VL-NEXT: vprorvq %xmm1, %xmm0, %xmm0 829; AVX512VL-NEXT: retq 830; 831; AVX512BW-LABEL: splatvar_funnnel_v2i64: 832; AVX512BW: # %bb.0: 833; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 834; AVX512BW-NEXT: vpbroadcastq %xmm1, %xmm1 835; AVX512BW-NEXT: vprorvq %zmm1, %zmm0, %zmm0 836; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 837; AVX512BW-NEXT: vzeroupper 838; AVX512BW-NEXT: retq 839; 840; AVX512VLBW-LABEL: splatvar_funnnel_v2i64: 841; AVX512VLBW: # %bb.0: 842; AVX512VLBW-NEXT: vpbroadcastq %xmm1, %xmm1 843; AVX512VLBW-NEXT: vprorvq %xmm1, %xmm0, %xmm0 844; AVX512VLBW-NEXT: retq 845; 846; AVX512VBMI2-LABEL: splatvar_funnnel_v2i64: 847; AVX512VBMI2: # %bb.0: 848; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 849; AVX512VBMI2-NEXT: vpbroadcastq %xmm1, %xmm1 850; AVX512VBMI2-NEXT: vprorvq %zmm1, %zmm0, %zmm0 851; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 852; AVX512VBMI2-NEXT: vzeroupper 853; AVX512VBMI2-NEXT: retq 854; 855; AVX512VLVBMI2-LABEL: splatvar_funnnel_v2i64: 856; AVX512VLVBMI2: # %bb.0: 857; AVX512VLVBMI2-NEXT: vpbroadcastq %xmm1, %xmm1 858; AVX512VLVBMI2-NEXT: vprorvq %xmm1, %xmm0, %xmm0 859; AVX512VLVBMI2-NEXT: retq 860; 861; XOPAVX1-LABEL: splatvar_funnnel_v2i64: 862; XOPAVX1: # %bb.0: 863; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 864; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1 865; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] 866; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0 867; XOPAVX1-NEXT: retq 868; 869; XOPAVX2-LABEL: splatvar_funnnel_v2i64: 870; XOPAVX2: # %bb.0: 871; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 872; XOPAVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm1 
873; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 874; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0 875; XOPAVX2-NEXT: retq 876; 877; X86-SSE2-LABEL: splatvar_funnnel_v2i64: 878; X86-SSE2: # %bb.0: 879; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] 880; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [63,0,63,0] 881; X86-SSE2-NEXT: pxor %xmm3, %xmm3 882; X86-SSE2-NEXT: psubq %xmm1, %xmm3 883; X86-SSE2-NEXT: pand %xmm2, %xmm1 884; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 885; X86-SSE2-NEXT: psrlq %xmm1, %xmm4 886; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 887; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 888; X86-SSE2-NEXT: psrlq %xmm1, %xmm5 889; X86-SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] 890; X86-SSE2-NEXT: pand %xmm2, %xmm3 891; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 892; X86-SSE2-NEXT: psllq %xmm3, %xmm1 893; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] 894; X86-SSE2-NEXT: psllq %xmm2, %xmm0 895; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 896; X86-SSE2-NEXT: orpd %xmm5, %xmm0 897; X86-SSE2-NEXT: retl 898 %splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer 899 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> %splat) 900 ret <2 x i64> %res 901} 902 903define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %amt) nounwind { 904; SSE2-LABEL: splatvar_funnnel_v4i32: 905; SSE2: # %bb.0: 906; SSE2-NEXT: movd %xmm1, %eax 907; SSE2-NEXT: negl %eax 908; SSE2-NEXT: andl $31, %eax 909; SSE2-NEXT: movd %eax, %xmm1 910; SSE2-NEXT: movdqa %xmm0, %xmm2 911; SSE2-NEXT: pslld %xmm1, %xmm2 912; SSE2-NEXT: movl $32, %ecx 913; SSE2-NEXT: subl %eax, %ecx 914; SSE2-NEXT: movd %ecx, %xmm1 915; SSE2-NEXT: psrld %xmm1, %xmm0 916; SSE2-NEXT: por %xmm2, %xmm0 917; SSE2-NEXT: retq 918; 919; SSE41-LABEL: splatvar_funnnel_v4i32: 920; SSE41: # %bb.0: 921; SSE41-NEXT: pxor %xmm2, %xmm2 922; SSE41-NEXT: psubd %xmm1, %xmm2 923; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 924; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = 
xmm2[0],zero,xmm2[1],zero 925; SSE41-NEXT: movdqa %xmm0, %xmm3 926; SSE41-NEXT: pslld %xmm1, %xmm3 927; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [32,32,32,32] 928; SSE41-NEXT: psubd %xmm2, %xmm1 929; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 930; SSE41-NEXT: psrld %xmm1, %xmm0 931; SSE41-NEXT: por %xmm3, %xmm0 932; SSE41-NEXT: retq 933; 934; AVX1-LABEL: splatvar_funnnel_v4i32: 935; AVX1: # %bb.0: 936; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 937; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 938; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 939; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero 940; AVX1-NEXT: vpslld %xmm2, %xmm0, %xmm2 941; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32] 942; AVX1-NEXT: vpsubd %xmm1, %xmm3, %xmm1 943; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 944; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 945; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 946; AVX1-NEXT: retq 947; 948; AVX2-LABEL: splatvar_funnnel_v4i32: 949; AVX2: # %bb.0: 950; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 951; AVX2-NEXT: vpsubd %xmm1, %xmm2, %xmm1 952; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] 953; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 954; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero 955; AVX2-NEXT: vpslld %xmm2, %xmm0, %xmm2 956; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32] 957; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1 958; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 959; AVX2-NEXT: vpsrld %xmm1, %xmm0, %xmm0 960; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 961; AVX2-NEXT: retq 962; 963; AVX512F-LABEL: splatvar_funnnel_v4i32: 964; AVX512F: # %bb.0: 965; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 966; AVX512F-NEXT: vpbroadcastd %xmm1, %xmm1 967; AVX512F-NEXT: vprorvd %zmm1, %zmm0, %zmm0 968; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 969; AVX512F-NEXT: vzeroupper 970; AVX512F-NEXT: retq 971; 972; AVX512VL-LABEL: splatvar_funnnel_v4i32: 973; AVX512VL: # %bb.0: 974; 
AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1 975; AVX512VL-NEXT: vprorvd %xmm1, %xmm0, %xmm0 976; AVX512VL-NEXT: retq 977; 978; AVX512BW-LABEL: splatvar_funnnel_v4i32: 979; AVX512BW: # %bb.0: 980; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 981; AVX512BW-NEXT: vpbroadcastd %xmm1, %xmm1 982; AVX512BW-NEXT: vprorvd %zmm1, %zmm0, %zmm0 983; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 984; AVX512BW-NEXT: vzeroupper 985; AVX512BW-NEXT: retq 986; 987; AVX512VLBW-LABEL: splatvar_funnnel_v4i32: 988; AVX512VLBW: # %bb.0: 989; AVX512VLBW-NEXT: vpbroadcastd %xmm1, %xmm1 990; AVX512VLBW-NEXT: vprorvd %xmm1, %xmm0, %xmm0 991; AVX512VLBW-NEXT: retq 992; 993; AVX512VBMI2-LABEL: splatvar_funnnel_v4i32: 994; AVX512VBMI2: # %bb.0: 995; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 996; AVX512VBMI2-NEXT: vpbroadcastd %xmm1, %xmm1 997; AVX512VBMI2-NEXT: vprorvd %zmm1, %zmm0, %zmm0 998; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 999; AVX512VBMI2-NEXT: vzeroupper 1000; AVX512VBMI2-NEXT: retq 1001; 1002; AVX512VLVBMI2-LABEL: splatvar_funnnel_v4i32: 1003; AVX512VLVBMI2: # %bb.0: 1004; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm1, %xmm1 1005; AVX512VLVBMI2-NEXT: vprorvd %xmm1, %xmm0, %xmm0 1006; AVX512VLVBMI2-NEXT: retq 1007; 1008; XOPAVX1-LABEL: splatvar_funnnel_v4i32: 1009; XOPAVX1: # %bb.0: 1010; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1011; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 1012; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 1013; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0 1014; XOPAVX1-NEXT: retq 1015; 1016; XOPAVX2-LABEL: splatvar_funnnel_v4i32: 1017; XOPAVX2: # %bb.0: 1018; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 1019; XOPAVX2-NEXT: vpsubd %xmm1, %xmm2, %xmm1 1020; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1 1021; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0 1022; XOPAVX2-NEXT: retq 1023; 1024; X86-SSE2-LABEL: splatvar_funnnel_v4i32: 1025; X86-SSE2: # %bb.0: 1026; X86-SSE2-NEXT: movd %xmm1, %eax 1027; X86-SSE2-NEXT: negl %eax 1028; 
X86-SSE2-NEXT: andl $31, %eax 1029; X86-SSE2-NEXT: movd %eax, %xmm1 1030; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 1031; X86-SSE2-NEXT: pslld %xmm1, %xmm2 1032; X86-SSE2-NEXT: movl $32, %ecx 1033; X86-SSE2-NEXT: subl %eax, %ecx 1034; X86-SSE2-NEXT: movd %ecx, %xmm1 1035; X86-SSE2-NEXT: psrld %xmm1, %xmm0 1036; X86-SSE2-NEXT: por %xmm2, %xmm0 1037; X86-SSE2-NEXT: retl 1038 %splat = shufflevector <4 x i32> %amt, <4 x i32> undef, <4 x i32> zeroinitializer 1039 %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> %splat) 1040 ret <4 x i32> %res 1041} 1042 1043define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind { 1044; SSE2-LABEL: splatvar_funnnel_v8i16: 1045; SSE2: # %bb.0: 1046; SSE2-NEXT: pxor %xmm2, %xmm2 1047; SSE2-NEXT: psubw %xmm1, %xmm2 1048; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 1049; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0] 1050; SSE2-NEXT: pand %xmm2, %xmm1 1051; SSE2-NEXT: movdqa %xmm0, %xmm3 1052; SSE2-NEXT: psllw %xmm1, %xmm3 1053; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16] 1054; SSE2-NEXT: psubw %xmm2, %xmm1 1055; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] 1056; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1057; SSE2-NEXT: psrlw %xmm1, %xmm0 1058; SSE2-NEXT: por %xmm3, %xmm0 1059; SSE2-NEXT: retq 1060; 1061; SSE41-LABEL: splatvar_funnnel_v8i16: 1062; SSE41: # %bb.0: 1063; SSE41-NEXT: pxor %xmm2, %xmm2 1064; SSE41-NEXT: psubw %xmm1, %xmm2 1065; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 1066; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 1067; SSE41-NEXT: movdqa %xmm0, %xmm3 1068; SSE41-NEXT: psllw %xmm1, %xmm3 1069; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16] 1070; SSE41-NEXT: psubw %xmm2, %xmm1 1071; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1072; SSE41-NEXT: 
psrlw %xmm1, %xmm0 1073; SSE41-NEXT: por %xmm3, %xmm0 1074; SSE41-NEXT: retq 1075; 1076; AVX-LABEL: splatvar_funnnel_v8i16: 1077; AVX: # %bb.0: 1078; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 1079; AVX-NEXT: vpsubw %xmm1, %xmm2, %xmm1 1080; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 1081; AVX-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1082; AVX-NEXT: vpsllw %xmm2, %xmm0, %xmm2 1083; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 1084; AVX-NEXT: vpsubw %xmm1, %xmm3, %xmm1 1085; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1086; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 1087; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 1088; AVX-NEXT: retq 1089; 1090; AVX512F-LABEL: splatvar_funnnel_v8i16: 1091; AVX512F: # %bb.0: 1092; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 1093; AVX512F-NEXT: vpsubw %xmm1, %xmm2, %xmm1 1094; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 1095; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1096; AVX512F-NEXT: vpsllw %xmm2, %xmm0, %xmm2 1097; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 1098; AVX512F-NEXT: vpsubw %xmm1, %xmm3, %xmm1 1099; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1100; AVX512F-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 1101; AVX512F-NEXT: vpor %xmm0, %xmm2, %xmm0 1102; AVX512F-NEXT: retq 1103; 1104; AVX512VL-LABEL: splatvar_funnnel_v8i16: 1105; AVX512VL: # %bb.0: 1106; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 1107; AVX512VL-NEXT: vpsubw %xmm1, %xmm2, %xmm1 1108; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 1109; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1110; AVX512VL-NEXT: vpsllw %xmm2, %xmm0, %xmm2 1111; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 1112; AVX512VL-NEXT: vpsubw %xmm1, %xmm3, %xmm1 1113; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1114; AVX512VL-NEXT: 
vpsrlw %xmm1, %xmm0, %xmm0 1115; AVX512VL-NEXT: vpor %xmm0, %xmm2, %xmm0 1116; AVX512VL-NEXT: retq 1117; 1118; AVX512BW-LABEL: splatvar_funnnel_v8i16: 1119; AVX512BW: # %bb.0: 1120; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 1121; AVX512BW-NEXT: vpsubw %xmm1, %xmm2, %xmm1 1122; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 1123; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1124; AVX512BW-NEXT: vpsllw %xmm2, %xmm0, %xmm2 1125; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 1126; AVX512BW-NEXT: vpsubw %xmm1, %xmm3, %xmm1 1127; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1128; AVX512BW-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 1129; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0 1130; AVX512BW-NEXT: retq 1131; 1132; AVX512VLBW-LABEL: splatvar_funnnel_v8i16: 1133; AVX512VLBW: # %bb.0: 1134; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 1135; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm2, %xmm1 1136; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 1137; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1138; AVX512VLBW-NEXT: vpsllw %xmm2, %xmm0, %xmm2 1139; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 1140; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm3, %xmm1 1141; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1142; AVX512VLBW-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 1143; AVX512VLBW-NEXT: vpor %xmm0, %xmm2, %xmm0 1144; AVX512VLBW-NEXT: retq 1145; 1146; AVX512VBMI2-LABEL: splatvar_funnnel_v8i16: 1147; AVX512VBMI2: # %bb.0: 1148; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1149; AVX512VBMI2-NEXT: vpbroadcastw %xmm1, %xmm1 1150; AVX512VBMI2-NEXT: vpshrdvw %zmm1, %zmm0, %zmm0 1151; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1152; AVX512VBMI2-NEXT: vzeroupper 1153; AVX512VBMI2-NEXT: retq 1154; 1155; AVX512VLVBMI2-LABEL: splatvar_funnnel_v8i16: 1156; AVX512VLVBMI2: # %bb.0: 1157; 
AVX512VLVBMI2-NEXT: vpbroadcastw %xmm1, %xmm1 1158; AVX512VLVBMI2-NEXT: vpshrdvw %xmm1, %xmm0, %xmm0 1159; AVX512VLVBMI2-NEXT: retq 1160; 1161; XOPAVX1-LABEL: splatvar_funnnel_v8i16: 1162; XOPAVX1: # %bb.0: 1163; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1164; XOPAVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1 1165; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] 1166; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 1167; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0 1168; XOPAVX1-NEXT: retq 1169; 1170; XOPAVX2-LABEL: splatvar_funnnel_v8i16: 1171; XOPAVX2: # %bb.0: 1172; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 1173; XOPAVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm1 1174; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1 1175; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0 1176; XOPAVX2-NEXT: retq 1177; 1178; X86-SSE2-LABEL: splatvar_funnnel_v8i16: 1179; X86-SSE2: # %bb.0: 1180; X86-SSE2-NEXT: pxor %xmm2, %xmm2 1181; X86-SSE2-NEXT: psubw %xmm1, %xmm2 1182; X86-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm2 1183; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0] 1184; X86-SSE2-NEXT: pand %xmm2, %xmm1 1185; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 1186; X86-SSE2-NEXT: psllw %xmm1, %xmm3 1187; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16] 1188; X86-SSE2-NEXT: psubw %xmm2, %xmm1 1189; X86-SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] 1190; X86-SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1191; X86-SSE2-NEXT: psrlw %xmm1, %xmm0 1192; X86-SSE2-NEXT: por %xmm3, %xmm0 1193; X86-SSE2-NEXT: retl 1194 %splat = shufflevector <8 x i16> %amt, <8 x i16> undef, <8 x i32> zeroinitializer 1195 %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %x, <8 x i16> %splat) 1196 ret <8 x i16> %res 1197} 1198 1199define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind { 1200; SSE2-LABEL: splatvar_funnnel_v16i8: 1201; SSE2: # %bb.0: 1202; 
SSE2-NEXT: pxor %xmm2, %xmm2 1203; SSE2-NEXT: psubb %xmm1, %xmm2 1204; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 1205; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 1206; SSE2-NEXT: psubb %xmm2, %xmm3 1207; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] 1208; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1209; SSE2-NEXT: movdqa %xmm0, %xmm1 1210; SSE2-NEXT: psllw %xmm2, %xmm1 1211; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 1212; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 1213; SSE2-NEXT: psllw %xmm2, %xmm5 1214; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1215; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,0,0,0,4,5,6,7] 1216; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 1217; SSE2-NEXT: pand %xmm2, %xmm1 1218; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0] 1219; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1220; SSE2-NEXT: psrlw %xmm3, %xmm0 1221; SSE2-NEXT: psrlw %xmm3, %xmm4 1222; SSE2-NEXT: psrlw $8, %xmm4 1223; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1224; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,0,0,0,4,5,6,7] 1225; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 1226; SSE2-NEXT: pand %xmm0, %xmm2 1227; SSE2-NEXT: por %xmm2, %xmm1 1228; SSE2-NEXT: movdqa %xmm1, %xmm0 1229; SSE2-NEXT: retq 1230; 1231; SSE41-LABEL: splatvar_funnnel_v16i8: 1232; SSE41: # %bb.0: 1233; SSE41-NEXT: pxor %xmm2, %xmm2 1234; SSE41-NEXT: pxor %xmm3, %xmm3 1235; SSE41-NEXT: psubb %xmm1, %xmm3 1236; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 1237; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero 1238; SSE41-NEXT: movdqa %xmm0, %xmm1 1239; SSE41-NEXT: psllw %xmm4, %xmm1 1240; 
SSE41-NEXT: pcmpeqd %xmm5, %xmm5 1241; SSE41-NEXT: pcmpeqd %xmm6, %xmm6 1242; SSE41-NEXT: psllw %xmm4, %xmm6 1243; SSE41-NEXT: pshufb %xmm2, %xmm6 1244; SSE41-NEXT: pand %xmm6, %xmm1 1245; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 1246; SSE41-NEXT: psubb %xmm3, %xmm2 1247; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero 1248; SSE41-NEXT: psrlw %xmm2, %xmm0 1249; SSE41-NEXT: psrlw %xmm2, %xmm5 1250; SSE41-NEXT: pshufb {{.*#+}} xmm5 = xmm5[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 1251; SSE41-NEXT: pand %xmm0, %xmm5 1252; SSE41-NEXT: por %xmm5, %xmm1 1253; SSE41-NEXT: movdqa %xmm1, %xmm0 1254; SSE41-NEXT: retq 1255; 1256; AVX1-LABEL: splatvar_funnnel_v16i8: 1257; AVX1: # %bb.0: 1258; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1259; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 1260; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 1261; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1262; AVX1-NEXT: vpsllw %xmm3, %xmm0, %xmm4 1263; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 1264; AVX1-NEXT: vpsllw %xmm3, %xmm5, %xmm3 1265; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 1266; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm2 1267; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 1268; AVX1-NEXT: vpsubb %xmm1, %xmm3, %xmm1 1269; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1270; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 1271; AVX1-NEXT: vpsrlw %xmm1, %xmm5, %xmm1 1272; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 1273; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 1274; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 1275; AVX1-NEXT: retq 1276; 1277; AVX2-LABEL: splatvar_funnnel_v16i8: 1278; AVX2: # %bb.0: 1279; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 1280; AVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 1281; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 1282; 
AVX2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1283; AVX2-NEXT: vpsllw %xmm2, %xmm0, %xmm3 1284; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 1285; AVX2-NEXT: vpsllw %xmm2, %xmm4, %xmm2 1286; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 1287; AVX2-NEXT: vpand %xmm2, %xmm3, %xmm2 1288; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 1289; AVX2-NEXT: vpsubb %xmm1, %xmm3, %xmm1 1290; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1291; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 1292; AVX2-NEXT: vpsrlw %xmm1, %xmm4, %xmm1 1293; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 1294; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 1295; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 1296; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 1297; AVX2-NEXT: retq 1298; 1299; AVX512F-LABEL: splatvar_funnnel_v16i8: 1300; AVX512F: # %bb.0: 1301; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 1302; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3 1303; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero 1304; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 1305; AVX512F-NEXT: vpsrld %xmm3, %zmm0, %zmm3 1306; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4 1307; AVX512F-NEXT: vpsubb %xmm1, %xmm4, %xmm1 1308; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1 1309; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1310; AVX512F-NEXT: vpslld %xmm1, %zmm0, %zmm0 1311; 
AVX512F-NEXT: vpord %zmm0, %zmm3, %zmm0 1312; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 1313; AVX512F-NEXT: vzeroupper 1314; AVX512F-NEXT: retq 1315; 1316; AVX512VL-LABEL: splatvar_funnnel_v16i8: 1317; AVX512VL: # %bb.0: 1318; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 1319; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm3 1320; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero 1321; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 1322; AVX512VL-NEXT: vpsrld %xmm3, %zmm0, %zmm3 1323; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 1324; AVX512VL-NEXT: vpsubb %xmm1, %xmm4, %xmm1 1325; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1 1326; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1327; AVX512VL-NEXT: vpslld %xmm1, %zmm0, %zmm0 1328; AVX512VL-NEXT: vpord %zmm0, %zmm3, %zmm0 1329; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 1330; AVX512VL-NEXT: vzeroupper 1331; AVX512VL-NEXT: retq 1332; 1333; AVX512BW-LABEL: splatvar_funnnel_v16i8: 1334; AVX512BW: # %bb.0: 1335; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 1336; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3 1337; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero 1338; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1339; AVX512BW-NEXT: vpsrlw %xmm3, %ymm0, %ymm3 1340; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4 1341; AVX512BW-NEXT: vpsubb %xmm1, %xmm4, %xmm1 1342; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1 1343; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1344; AVX512BW-NEXT: vpsllw %xmm1, %ymm0, %ymm0 1345; AVX512BW-NEXT: vpor %ymm0, %ymm3, %ymm0 1346; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1347; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1348; AVX512BW-NEXT: vzeroupper 1349; AVX512BW-NEXT: retq 1350; 1351; AVX512VLBW-LABEL: splatvar_funnnel_v16i8: 1352; AVX512VLBW: # %bb.0: 1353; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 1354; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3 1355; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero 1356; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1357; AVX512VLBW-NEXT: vpsrlw %xmm3, %ymm0, %ymm3 1358; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4 1359; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm4, %xmm1 1360; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1 1361; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1362; AVX512VLBW-NEXT: vpsllw %xmm1, %ymm0, %ymm0 1363; AVX512VLBW-NEXT: vpor %ymm0, %ymm3, %ymm0 1364; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0 1365; AVX512VLBW-NEXT: vzeroupper 1366; AVX512VLBW-NEXT: retq 1367; 1368; AVX512VBMI2-LABEL: splatvar_funnnel_v16i8: 
1369; AVX512VBMI2: # %bb.0: 1370; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 1371; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3 1372; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero 1373; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1374; AVX512VBMI2-NEXT: vpsrlw %xmm3, %ymm0, %ymm3 1375; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4 1376; AVX512VBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1 1377; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1 1378; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1379; AVX512VBMI2-NEXT: vpsllw %xmm1, %ymm0, %ymm0 1380; AVX512VBMI2-NEXT: vpor %ymm0, %ymm3, %ymm0 1381; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0 1382; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1383; AVX512VBMI2-NEXT: vzeroupper 1384; AVX512VBMI2-NEXT: retq 1385; 1386; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8: 1387; AVX512VLVBMI2: # %bb.0: 1388; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 1389; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3 1390; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero 1391; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1392; AVX512VLVBMI2-NEXT: vpsrlw %xmm3, %ymm0, %ymm3 1393; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4 1394; AVX512VLVBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1 1395; AVX512VLVBMI2-NEXT: vpand %xmm2, 
%xmm1, %xmm1 1396; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1397; AVX512VLVBMI2-NEXT: vpsllw %xmm1, %ymm0, %ymm0 1398; AVX512VLVBMI2-NEXT: vpor %ymm0, %ymm3, %ymm0 1399; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0 1400; AVX512VLVBMI2-NEXT: vzeroupper 1401; AVX512VLVBMI2-NEXT: retq 1402; 1403; XOPAVX1-LABEL: splatvar_funnnel_v16i8: 1404; XOPAVX1: # %bb.0: 1405; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1406; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 1407; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1408; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0 1409; XOPAVX1-NEXT: retq 1410; 1411; XOPAVX2-LABEL: splatvar_funnnel_v16i8: 1412; XOPAVX2: # %bb.0: 1413; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 1414; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 1415; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 1416; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0 1417; XOPAVX2-NEXT: retq 1418; 1419; X86-SSE2-LABEL: splatvar_funnnel_v16i8: 1420; X86-SSE2: # %bb.0: 1421; X86-SSE2-NEXT: pxor %xmm2, %xmm2 1422; X86-SSE2-NEXT: psubb %xmm1, %xmm2 1423; X86-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm2 1424; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 1425; X86-SSE2-NEXT: psubb %xmm2, %xmm3 1426; X86-SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] 1427; X86-SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1428; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 1429; X86-SSE2-NEXT: psllw %xmm2, %xmm1 1430; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm4 1431; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm5 1432; X86-SSE2-NEXT: psllw %xmm2, %xmm5 1433; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1434; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,0,0,0,4,5,6,7] 1435; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 1436; X86-SSE2-NEXT: pand %xmm2, %xmm1 1437; X86-SSE2-NEXT: 
pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0] 1438; X86-SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1439; X86-SSE2-NEXT: psrlw %xmm3, %xmm0 1440; X86-SSE2-NEXT: psrlw %xmm3, %xmm4 1441; X86-SSE2-NEXT: psrlw $8, %xmm4 1442; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1443; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,0,0,0,4,5,6,7] 1444; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 1445; X86-SSE2-NEXT: pand %xmm0, %xmm2 1446; X86-SSE2-NEXT: por %xmm2, %xmm1 1447; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 1448; X86-SSE2-NEXT: retl 1449 %splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer 1450 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %x, <16 x i8> %splat) 1451 ret <16 x i8> %res 1452} 1453 1454; 1455; Constant Shifts 1456; 1457 1458define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x) nounwind { 1459; SSE2-LABEL: constant_funnnel_v2i64: 1460; SSE2: # %bb.0: 1461; SSE2-NEXT: movdqa %xmm0, %xmm1 1462; SSE2-NEXT: psllq $60, %xmm1 1463; SSE2-NEXT: movdqa %xmm0, %xmm2 1464; SSE2-NEXT: psllq $50, %xmm2 1465; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 1466; SSE2-NEXT: movdqa %xmm0, %xmm1 1467; SSE2-NEXT: psrlq $4, %xmm1 1468; SSE2-NEXT: psrlq $14, %xmm0 1469; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 1470; SSE2-NEXT: orpd %xmm2, %xmm0 1471; SSE2-NEXT: retq 1472; 1473; SSE41-LABEL: constant_funnnel_v2i64: 1474; SSE41: # %bb.0: 1475; SSE41-NEXT: movdqa %xmm0, %xmm1 1476; SSE41-NEXT: psllq $50, %xmm1 1477; SSE41-NEXT: movdqa %xmm0, %xmm2 1478; SSE41-NEXT: psllq $60, %xmm2 1479; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] 1480; SSE41-NEXT: movdqa %xmm0, %xmm1 1481; SSE41-NEXT: psrlq $14, %xmm1 1482; SSE41-NEXT: psrlq $4, %xmm0 1483; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 1484; SSE41-NEXT: por %xmm2, 
%xmm0 1485; SSE41-NEXT: retq 1486; 1487; AVX1-LABEL: constant_funnnel_v2i64: 1488; AVX1: # %bb.0: 1489; AVX1-NEXT: vpsllq $50, %xmm0, %xmm1 1490; AVX1-NEXT: vpsllq $60, %xmm0, %xmm2 1491; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] 1492; AVX1-NEXT: vpsrlq $14, %xmm0, %xmm2 1493; AVX1-NEXT: vpsrlq $4, %xmm0, %xmm0 1494; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] 1495; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 1496; AVX1-NEXT: retq 1497; 1498; AVX2-LABEL: constant_funnnel_v2i64: 1499; AVX2: # %bb.0: 1500; AVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm1 1501; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 1502; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 1503; AVX2-NEXT: retq 1504; 1505; AVX512F-LABEL: constant_funnnel_v2i64: 1506; AVX512F: # %bb.0: 1507; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1508; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14] 1509; AVX512F-NEXT: vprorvq %zmm1, %zmm0, %zmm0 1510; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1511; AVX512F-NEXT: vzeroupper 1512; AVX512F-NEXT: retq 1513; 1514; AVX512VL-LABEL: constant_funnnel_v2i64: 1515; AVX512VL: # %bb.0: 1516; AVX512VL-NEXT: vprorvq {{.*}}(%rip), %xmm0, %xmm0 1517; AVX512VL-NEXT: retq 1518; 1519; AVX512BW-LABEL: constant_funnnel_v2i64: 1520; AVX512BW: # %bb.0: 1521; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1522; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14] 1523; AVX512BW-NEXT: vprorvq %zmm1, %zmm0, %zmm0 1524; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1525; AVX512BW-NEXT: vzeroupper 1526; AVX512BW-NEXT: retq 1527; 1528; AVX512VLBW-LABEL: constant_funnnel_v2i64: 1529; AVX512VLBW: # %bb.0: 1530; AVX512VLBW-NEXT: vprorvq {{.*}}(%rip), %xmm0, %xmm0 1531; AVX512VLBW-NEXT: retq 1532; 1533; AVX512VBMI2-LABEL: constant_funnnel_v2i64: 1534; AVX512VBMI2: # %bb.0: 1535; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1536; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14] 1537; AVX512VBMI2-NEXT: vprorvq %zmm1, %zmm0, 
%zmm0 1538; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1539; AVX512VBMI2-NEXT: vzeroupper 1540; AVX512VBMI2-NEXT: retq 1541; 1542; AVX512VLVBMI2-LABEL: constant_funnnel_v2i64: 1543; AVX512VLVBMI2: # %bb.0: 1544; AVX512VLVBMI2-NEXT: vprorvq {{.*}}(%rip), %xmm0, %xmm0 1545; AVX512VLVBMI2-NEXT: retq 1546; 1547; XOP-LABEL: constant_funnnel_v2i64: 1548; XOP: # %bb.0: 1549; XOP-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm0 1550; XOP-NEXT: retq 1551; 1552; X86-SSE2-LABEL: constant_funnnel_v2i64: 1553; X86-SSE2: # %bb.0: 1554; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [63,0,63,0] 1555; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = <4,u,14,u> 1556; X86-SSE2-NEXT: pxor %xmm3, %xmm3 1557; X86-SSE2-NEXT: psubq %xmm2, %xmm3 1558; X86-SSE2-NEXT: pand %xmm1, %xmm2 1559; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 1560; X86-SSE2-NEXT: psrlq %xmm2, %xmm4 1561; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] 1562; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 1563; X86-SSE2-NEXT: psrlq %xmm2, %xmm5 1564; X86-SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] 1565; X86-SSE2-NEXT: pand %xmm1, %xmm3 1566; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 1567; X86-SSE2-NEXT: psllq %xmm3, %xmm1 1568; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] 1569; X86-SSE2-NEXT: psllq %xmm2, %xmm0 1570; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 1571; X86-SSE2-NEXT: orpd %xmm5, %xmm0 1572; X86-SSE2-NEXT: retl 1573 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> <i64 4, i64 14>) 1574 ret <2 x i64> %res 1575} 1576 1577define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind { 1578; SSE2-LABEL: constant_funnnel_v4i32: 1579; SSE2: # %bb.0: 1580; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [268435456,134217728,67108864,33554432] 1581; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 1582; SSE2-NEXT: pmuludq %xmm1, %xmm0 1583; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] 1584; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 1585; SSE2-NEXT: pmuludq %xmm2, %xmm1 1586; SSE2-NEXT: pshufd {{.*#+}} 
xmm2 = xmm1[1,3,2,3] 1587; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 1588; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1589; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1590; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1591; SSE2-NEXT: por %xmm3, %xmm0 1592; SSE2-NEXT: retq 1593; 1594; SSE41-LABEL: constant_funnnel_v4i32: 1595; SSE41: # %bb.0: 1596; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [268435456,134217728,67108864,33554432] 1597; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 1598; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 1599; SSE41-NEXT: pmuludq %xmm2, %xmm3 1600; SSE41-NEXT: pmuludq %xmm1, %xmm0 1601; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] 1602; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] 1603; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] 1604; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 1605; SSE41-NEXT: por %xmm1, %xmm0 1606; SSE41-NEXT: retq 1607; 1608; AVX1-LABEL: constant_funnnel_v4i32: 1609; AVX1: # %bb.0: 1610; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [268435456,134217728,67108864,33554432] 1611; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 1612; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 1613; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 1614; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 1615; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] 1616; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 1617; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] 1618; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 1619; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 1620; AVX1-NEXT: retq 1621; 1622; AVX2-LABEL: constant_funnnel_v4i32: 1623; AVX2: # %bb.0: 1624; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 1625; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 1626; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 1627; AVX2-NEXT: retq 1628; 1629; AVX512F-LABEL: constant_funnnel_v4i32: 1630; AVX512F: 
# %bb.0: 1631; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1632; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7] 1633; AVX512F-NEXT: vprorvd %zmm1, %zmm0, %zmm0 1634; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1635; AVX512F-NEXT: vzeroupper 1636; AVX512F-NEXT: retq 1637; 1638; AVX512VL-LABEL: constant_funnnel_v4i32: 1639; AVX512VL: # %bb.0: 1640; AVX512VL-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0 1641; AVX512VL-NEXT: retq 1642; 1643; AVX512BW-LABEL: constant_funnnel_v4i32: 1644; AVX512BW: # %bb.0: 1645; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1646; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7] 1647; AVX512BW-NEXT: vprorvd %zmm1, %zmm0, %zmm0 1648; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1649; AVX512BW-NEXT: vzeroupper 1650; AVX512BW-NEXT: retq 1651; 1652; AVX512VLBW-LABEL: constant_funnnel_v4i32: 1653; AVX512VLBW: # %bb.0: 1654; AVX512VLBW-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0 1655; AVX512VLBW-NEXT: retq 1656; 1657; AVX512VBMI2-LABEL: constant_funnnel_v4i32: 1658; AVX512VBMI2: # %bb.0: 1659; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1660; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7] 1661; AVX512VBMI2-NEXT: vprorvd %zmm1, %zmm0, %zmm0 1662; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1663; AVX512VBMI2-NEXT: vzeroupper 1664; AVX512VBMI2-NEXT: retq 1665; 1666; AVX512VLVBMI2-LABEL: constant_funnnel_v4i32: 1667; AVX512VLVBMI2: # %bb.0: 1668; AVX512VLVBMI2-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0 1669; AVX512VLVBMI2-NEXT: retq 1670; 1671; XOP-LABEL: constant_funnnel_v4i32: 1672; XOP: # %bb.0: 1673; XOP-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0 1674; XOP-NEXT: retq 1675; 1676; X86-SSE2-LABEL: constant_funnnel_v4i32: 1677; X86-SSE2: # %bb.0: 1678; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [268435456,134217728,67108864,33554432] 1679; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 1680; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0 1681; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = 
xmm0[1,3,2,3] 1682; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 1683; X86-SSE2-NEXT: pmuludq %xmm2, %xmm1 1684; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] 1685; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 1686; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1687; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1688; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1689; X86-SSE2-NEXT: por %xmm3, %xmm0 1690; X86-SSE2-NEXT: retl 1691 %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> <i32 4, i32 5, i32 6, i32 7>) 1692 ret <4 x i32> %res 1693} 1694 1695define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x) nounwind { 1696; SSE-LABEL: constant_funnnel_v8i16: 1697; SSE: # %bb.0: 1698; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,32768,16384,8192,4096,2048,1024,512] 1699; SSE-NEXT: movdqa %xmm0, %xmm2 1700; SSE-NEXT: pmulhuw %xmm1, %xmm2 1701; SSE-NEXT: pmullw %xmm1, %xmm0 1702; SSE-NEXT: por %xmm2, %xmm0 1703; SSE-NEXT: retq 1704; 1705; AVX-LABEL: constant_funnnel_v8i16: 1706; AVX: # %bb.0: 1707; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1,32768,16384,8192,4096,2048,1024,512] 1708; AVX-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 1709; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1710; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0 1711; AVX-NEXT: retq 1712; 1713; AVX512F-LABEL: constant_funnnel_v8i16: 1714; AVX512F: # %bb.0: 1715; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [1,32768,16384,8192,4096,2048,1024,512] 1716; AVX512F-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 1717; AVX512F-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1718; AVX512F-NEXT: vpor %xmm2, %xmm0, %xmm0 1719; AVX512F-NEXT: retq 1720; 1721; AVX512VL-LABEL: constant_funnnel_v8i16: 1722; AVX512VL: # %bb.0: 1723; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,32768,16384,8192,4096,2048,1024,512] 1724; AVX512VL-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 1725; AVX512VL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1726; AVX512VL-NEXT: vpor %xmm2, %xmm0, %xmm0 1727; AVX512VL-NEXT: retq 1728; 1729; 
AVX512BW-LABEL: constant_funnnel_v8i16: 1730; AVX512BW: # %bb.0: 1731; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1732; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [16,1,2,3,4,5,6,7] 1733; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm1 1734; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,15,14,13,12,11,10,9] 1735; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 1736; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 1737; AVX512BW-NEXT: vzeroupper 1738; AVX512BW-NEXT: retq 1739; 1740; AVX512VLBW-LABEL: constant_funnnel_v8i16: 1741; AVX512VLBW: # %bb.0: 1742; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %xmm0, %xmm1 1743; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0 1744; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 1745; AVX512VLBW-NEXT: retq 1746; 1747; AVX512VBMI2-LABEL: constant_funnnel_v8i16: 1748; AVX512VBMI2: # %bb.0: 1749; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1750; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] 1751; AVX512VBMI2-NEXT: vpshrdvw %zmm1, %zmm0, %zmm0 1752; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1753; AVX512VBMI2-NEXT: vzeroupper 1754; AVX512VBMI2-NEXT: retq 1755; 1756; AVX512VLVBMI2-LABEL: constant_funnnel_v8i16: 1757; AVX512VLVBMI2: # %bb.0: 1758; AVX512VLVBMI2-NEXT: vpshrdvw {{.*}}(%rip), %xmm0, %xmm0 1759; AVX512VLVBMI2-NEXT: retq 1760; 1761; XOP-LABEL: constant_funnnel_v8i16: 1762; XOP: # %bb.0: 1763; XOP-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm0 1764; XOP-NEXT: retq 1765; 1766; X86-SSE2-LABEL: constant_funnnel_v8i16: 1767; X86-SSE2: # %bb.0: 1768; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,32768,16384,8192,4096,2048,1024,512] 1769; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 1770; X86-SSE2-NEXT: pmulhuw %xmm1, %xmm2 1771; X86-SSE2-NEXT: pmullw %xmm1, %xmm0 1772; X86-SSE2-NEXT: por %xmm2, %xmm0 1773; X86-SSE2-NEXT: retl 1774 %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %x, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>) 1775 ret <8 x i16> %res 1776} 1777 1778define <16 x i8> 
@constant_funnnel_v16i8(<16 x i8> %x) nounwind { 1779; SSE2-LABEL: constant_funnnel_v16i8: 1780; SSE2: # %bb.0: 1781; SSE2-NEXT: pxor %xmm1, %xmm1 1782; SSE2-NEXT: movdqa %xmm0, %xmm2 1783; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 1784; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm2 1785; SSE2-NEXT: psrlw $8, %xmm2 1786; SSE2-NEXT: movdqa %xmm0, %xmm3 1787; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] 1788; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm3 1789; SSE2-NEXT: psrlw $8, %xmm3 1790; SSE2-NEXT: packuswb %xmm2, %xmm3 1791; SSE2-NEXT: movdqa %xmm0, %xmm1 1792; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1793; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm1 1794; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 1795; SSE2-NEXT: pand %xmm2, %xmm1 1796; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1797; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0 1798; SSE2-NEXT: pand %xmm2, %xmm0 1799; SSE2-NEXT: packuswb %xmm1, %xmm0 1800; SSE2-NEXT: por %xmm3, %xmm0 1801; SSE2-NEXT: retq 1802; 1803; SSE41-LABEL: constant_funnnel_v16i8: 1804; SSE41: # %bb.0: 1805; SSE41-NEXT: movdqa %xmm0, %xmm2 1806; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1807; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm2 1808; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] 1809; SSE41-NEXT: pand %xmm3, %xmm2 1810; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1811; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1,128,64,32,16,8,4,2] 1812; SSE41-NEXT: pmullw %xmm1, %xmm4 1813; SSE41-NEXT: pand %xmm3, %xmm4 1814; SSE41-NEXT: packuswb %xmm2, %xmm4 1815; 
SSE41-NEXT: pxor %xmm2, %xmm2 1816; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 1817; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm0 1818; SSE41-NEXT: psrlw $8, %xmm0 1819; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm1 1820; SSE41-NEXT: psrlw $8, %xmm1 1821; SSE41-NEXT: packuswb %xmm0, %xmm1 1822; SSE41-NEXT: por %xmm4, %xmm1 1823; SSE41-NEXT: movdqa %xmm1, %xmm0 1824; SSE41-NEXT: retq 1825; 1826; AVX1-LABEL: constant_funnnel_v16i8: 1827; AVX1: # %bb.0: 1828; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1829; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 1830; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 1831; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 1832; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1833; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm3, %xmm4 1834; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm2 1835; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 1836; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1837; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 1838; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 1839; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 1840; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm3, %xmm2 1841; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 1842; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0 1843; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 1844; AVX1-NEXT: retq 1845; 1846; AVX2-LABEL: constant_funnnel_v16i8: 1847; AVX2: # %bb.0: 1848; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1849; AVX2-NEXT: 
vpmullw {{.*}}(%rip), %ymm0, %ymm1 1850; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 1851; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 1852; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 1853; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 1854; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 1855; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 1856; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 1857; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 1858; AVX2-NEXT: vzeroupper 1859; AVX2-NEXT: retq 1860; 1861; AVX512F-LABEL: constant_funnnel_v16i8: 1862; AVX512F: # %bb.0: 1863; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 1864; AVX512F-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm1 1865; AVX512F-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0 1866; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0 1867; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 1868; AVX512F-NEXT: vzeroupper 1869; AVX512F-NEXT: retq 1870; 1871; AVX512VL-LABEL: constant_funnnel_v16i8: 1872; AVX512VL: # %bb.0: 1873; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 1874; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm1 1875; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0 1876; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0 1877; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 1878; AVX512VL-NEXT: vzeroupper 1879; AVX512VL-NEXT: retq 1880; 1881; AVX512BW-LABEL: 
constant_funnnel_v16i8: 1882; AVX512BW: # %bb.0: 1883; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7] 1884; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1885; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1 1886; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1] 1887; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0 1888; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 1889; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1890; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1891; AVX512BW-NEXT: vzeroupper 1892; AVX512BW-NEXT: retq 1893; 1894; AVX512VLBW-LABEL: constant_funnnel_v16i8: 1895; AVX512VLBW: # %bb.0: 1896; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1897; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm1 1898; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0 1899; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 1900; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0 1901; AVX512VLBW-NEXT: vzeroupper 1902; AVX512VLBW-NEXT: retq 1903; 1904; AVX512VBMI2-LABEL: constant_funnnel_v16i8: 1905; AVX512VBMI2: # %bb.0: 1906; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7] 1907; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1908; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm1 1909; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1] 1910; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm0, 
%zmm0 1911; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0 1912; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0 1913; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1914; AVX512VBMI2-NEXT: vzeroupper 1915; AVX512VBMI2-NEXT: retq 1916; 1917; AVX512VLVBMI2-LABEL: constant_funnnel_v16i8: 1918; AVX512VLVBMI2: # %bb.0: 1919; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1920; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm1 1921; AVX512VLVBMI2-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0 1922; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0 1923; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0 1924; AVX512VLVBMI2-NEXT: vzeroupper 1925; AVX512VLVBMI2-NEXT: retq 1926; 1927; XOP-LABEL: constant_funnnel_v16i8: 1928; XOP: # %bb.0: 1929; XOP-NEXT: vprotb {{.*}}(%rip), %xmm0, %xmm0 1930; XOP-NEXT: retq 1931; 1932; X86-SSE2-LABEL: constant_funnnel_v16i8: 1933; X86-SSE2: # %bb.0: 1934; X86-SSE2-NEXT: pxor %xmm1, %xmm1 1935; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 1936; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 1937; X86-SSE2-NEXT: pmullw {{\.LCPI.*}}, %xmm2 1938; X86-SSE2-NEXT: psrlw $8, %xmm2 1939; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 1940; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] 1941; X86-SSE2-NEXT: pmullw {{\.LCPI.*}}, %xmm3 1942; X86-SSE2-NEXT: psrlw $8, %xmm3 1943; X86-SSE2-NEXT: packuswb %xmm2, %xmm3 1944; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 1945; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1946; X86-SSE2-NEXT: pmullw {{\.LCPI.*}}, %xmm1 1947; X86-SSE2-NEXT: movdqa 
{{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 1948; X86-SSE2-NEXT: pand %xmm2, %xmm1 1949; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1950; X86-SSE2-NEXT: pmullw {{\.LCPI.*}}, %xmm0 1951; X86-SSE2-NEXT: pand %xmm2, %xmm0 1952; X86-SSE2-NEXT: packuswb %xmm1, %xmm0 1953; X86-SSE2-NEXT: por %xmm3, %xmm0 1954; X86-SSE2-NEXT: retl 1955 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %x, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>) 1956 ret <16 x i8> %res 1957} 1958 1959; 1960; Uniform Constant Shifts 1961; 1962 1963define <2 x i64> @splatconstant_funnnel_v2i64(<2 x i64> %x) nounwind { 1964; SSE-LABEL: splatconstant_funnnel_v2i64: 1965; SSE: # %bb.0: 1966; SSE-NEXT: movdqa %xmm0, %xmm1 1967; SSE-NEXT: psllq $50, %xmm1 1968; SSE-NEXT: psrlq $14, %xmm0 1969; SSE-NEXT: por %xmm1, %xmm0 1970; SSE-NEXT: retq 1971; 1972; AVX-LABEL: splatconstant_funnnel_v2i64: 1973; AVX: # %bb.0: 1974; AVX-NEXT: vpsllq $50, %xmm0, %xmm1 1975; AVX-NEXT: vpsrlq $14, %xmm0, %xmm0 1976; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 1977; AVX-NEXT: retq 1978; 1979; AVX512F-LABEL: splatconstant_funnnel_v2i64: 1980; AVX512F: # %bb.0: 1981; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1982; AVX512F-NEXT: vprorq $14, %zmm0, %zmm0 1983; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1984; AVX512F-NEXT: vzeroupper 1985; AVX512F-NEXT: retq 1986; 1987; AVX512VL-LABEL: splatconstant_funnnel_v2i64: 1988; AVX512VL: # %bb.0: 1989; AVX512VL-NEXT: vprorq $14, %xmm0, %xmm0 1990; AVX512VL-NEXT: retq 1991; 1992; AVX512BW-LABEL: splatconstant_funnnel_v2i64: 1993; AVX512BW: # %bb.0: 1994; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1995; AVX512BW-NEXT: vprorq $14, %zmm0, %zmm0 1996; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1997; AVX512BW-NEXT: vzeroupper 1998; AVX512BW-NEXT: retq 1999; 2000; AVX512VLBW-LABEL: splatconstant_funnnel_v2i64: 2001; AVX512VLBW: # 
%bb.0: 2002; AVX512VLBW-NEXT: vprorq $14, %xmm0, %xmm0 2003; AVX512VLBW-NEXT: retq 2004; 2005; AVX512VBMI2-LABEL: splatconstant_funnnel_v2i64: 2006; AVX512VBMI2: # %bb.0: 2007; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 2008; AVX512VBMI2-NEXT: vprorq $14, %zmm0, %zmm0 2009; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 2010; AVX512VBMI2-NEXT: vzeroupper 2011; AVX512VBMI2-NEXT: retq 2012; 2013; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v2i64: 2014; AVX512VLVBMI2: # %bb.0: 2015; AVX512VLVBMI2-NEXT: vprorq $14, %xmm0, %xmm0 2016; AVX512VLVBMI2-NEXT: retq 2017; 2018; XOP-LABEL: splatconstant_funnnel_v2i64: 2019; XOP: # %bb.0: 2020; XOP-NEXT: vprotq $50, %xmm0, %xmm0 2021; XOP-NEXT: retq 2022; 2023; X86-SSE2-LABEL: splatconstant_funnnel_v2i64: 2024; X86-SSE2: # %bb.0: 2025; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 2026; X86-SSE2-NEXT: psllq $50, %xmm1 2027; X86-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm1[0,1] 2028; X86-SSE2-NEXT: psrlq $14, %xmm0 2029; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm0[0,1] 2030; X86-SSE2-NEXT: orpd %xmm1, %xmm0 2031; X86-SSE2-NEXT: retl 2032 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> <i64 14, i64 14>) 2033 ret <2 x i64> %res 2034} 2035 2036define <4 x i32> @splatconstant_funnnel_v4i32(<4 x i32> %x) nounwind { 2037; SSE-LABEL: splatconstant_funnnel_v4i32: 2038; SSE: # %bb.0: 2039; SSE-NEXT: movdqa %xmm0, %xmm1 2040; SSE-NEXT: psrld $4, %xmm1 2041; SSE-NEXT: pslld $28, %xmm0 2042; SSE-NEXT: por %xmm1, %xmm0 2043; SSE-NEXT: retq 2044; 2045; AVX-LABEL: splatconstant_funnnel_v4i32: 2046; AVX: # %bb.0: 2047; AVX-NEXT: vpsrld $4, %xmm0, %xmm1 2048; AVX-NEXT: vpslld $28, %xmm0, %xmm0 2049; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 2050; AVX-NEXT: retq 2051; 2052; AVX512F-LABEL: splatconstant_funnnel_v4i32: 2053; AVX512F: # %bb.0: 2054; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 2055; AVX512F-NEXT: vprord $4, %zmm0, %zmm0 2056; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 2057; 
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v4i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vprord $4, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v4i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vprord $4, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_funnnel_v4i32:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vprord $4, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_funnnel_v4i32:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT: vprord $4, %zmm0, %zmm0
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v4i32:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vprord $4, %xmm0, %xmm0
; AVX512VLVBMI2-NEXT: retq
;
; XOP-LABEL: splatconstant_funnnel_v4i32:
; XOP: # %bb.0:
; XOP-NEXT: vprotd $28, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X86-SSE2-LABEL: splatconstant_funnnel_v4i32:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: psrld $4, %xmm1
; X86-SSE2-NEXT: pslld $28, %xmm0
; X86-SSE2-NEXT: por %xmm1, %xmm0
; X86-SSE2-NEXT: retl
  %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> <i32 4, i32 4, i32 4, i32 4>)
  ret <4 x i32> %res
}

; Splat-constant rotate of <8 x i16>. Both data operands of llvm.fshr are %x
; and the shift amount is a splat of 7, so the funnel shift right acts as a
; per-lane rotate right by 7.
; Expected code per the checks below:
;  - sse, avx and the avx512f/vl/bw runs expect a right shift by 7, a left
;    shift by 9 and an or (psrlw / psllw / por, or their v-prefixed forms);
;  - the avx512vbmi2 runs expect one vpshrdw $7 (widened to zmm without +avx512vl);
;  - the xop runs expect vprotw $9, where 9 = 16 - 7.
; The check lines are tool-generated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define <8 x i16> @splatconstant_funnnel_v8i16(<8 x i16> %x) nounwind {
; SSE-LABEL: splatconstant_funnnel_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrlw $7, %xmm1
; SSE-NEXT: psllw $9, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_funnnel_v8i16:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlw $7, %xmm0, %xmm1
; AVX-NEXT: vpsllw $9, %xmm0, %xmm0
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: splatconstant_funnnel_v8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrlw $7, %xmm0, %xmm1
; AVX512F-NEXT: vpsllw $9, %xmm0, %xmm0
; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v8i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlw $7, %xmm0, %xmm1
; AVX512VL-NEXT: vpsllw $9, %xmm0, %xmm0
; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlw $7, %xmm0, %xmm1
; AVX512BW-NEXT: vpsllw $9, %xmm0, %xmm0
; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_funnnel_v8i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsrlw $7, %xmm0, %xmm1
; AVX512VLBW-NEXT: vpsllw $9, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_funnnel_v8i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT: vpshrdw $7, %zmm0, %zmm0, %zmm0
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v8i16:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshrdw $7, %xmm0, %xmm0, %xmm0
; AVX512VLVBMI2-NEXT: retq
;
; XOP-LABEL: splatconstant_funnnel_v8i16:
; XOP: # %bb.0:
; XOP-NEXT: vprotw $9, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X86-SSE2-LABEL: splatconstant_funnnel_v8i16:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: psrlw $7, %xmm1
; X86-SSE2-NEXT: psllw $9, %xmm0
; X86-SSE2-NEXT: por %xmm1, %xmm0
; X86-SSE2-NEXT: retl
  %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %x, <8 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
  ret <8 x i16> %res
}

; Splat-constant rotate of <16 x i8>. llvm.fshr is called with %x as both data
; operands and a splat amount of 4, i.e. a per-byte rotate by 4.
; Expected code per the checks below:
;  - sse, avx and the avx512 runs without +avx512vl expect two 16-bit-lane
;    shifts by 4 (psrlw / psllw), each followed by a pand with a constant-pool
;    operand, then an or. The pand presumably clears bits shifted in from the
;    neighbouring byte; the mask values are not visible in this file.
;  - the avx512 runs with +avx512vl expect the two shifts followed by one
;    vpternlogq $216 that takes the constant-pool operand directly;
;  - the xop runs expect a single vprotb $4.
; The check lines are tool-generated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind {
; SSE-LABEL: splatconstant_funnnel_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrlw $4, %xmm1
; SSE-NEXT: pand {{.*}}(%rip), %xmm1
; SSE-NEXT: psllw $4, %xmm0
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_funnnel_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm1
; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: splatconstant_funnnel_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm1
; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX512VL-NEXT: vpternlogq $216, {{.*}}(%rip), %xmm1, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_funnnel_v16i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsrlw $4, %xmm0, %xmm1
; AVX512VLBW-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpternlogq $216, {{.*}}(%rip), %xmm1, %xmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512VBMI2-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpsrlw $4, %xmm0, %xmm1
; AVX512VLVBMI2-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX512VLVBMI2-NEXT: vpternlogq $216, {{.*}}(%rip), %xmm1, %xmm0
; AVX512VLVBMI2-NEXT: retq
;
; XOP-LABEL: splatconstant_funnnel_v16i8:
; XOP: # %bb.0:
; XOP-NEXT: vprotb $4, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X86-SSE2-LABEL: splatconstant_funnnel_v16i8:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: psrlw $4, %xmm1
; X86-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm1
; X86-SSE2-NEXT: psllw $4, %xmm0
; X86-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm0
; X86-SSE2-NEXT: por %xmm1, %xmm0
; X86-SSE2-NEXT: retl
  %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %x, <16 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
  ret <16 x i8> %res
}