1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512 8 9declare {<1 x i32>, <1 x i1>} @llvm.umul.with.overflow.v1i32(<1 x i32>, <1 x i32>) 10declare {<2 x i32>, <2 x i1>} @llvm.umul.with.overflow.v2i32(<2 x i32>, <2 x i32>) 11declare {<3 x i32>, <3 x i1>} @llvm.umul.with.overflow.v3i32(<3 x i32>, <3 x i32>) 12declare {<4 x i32>, <4 x i1>} @llvm.umul.with.overflow.v4i32(<4 x i32>, <4 x i32>) 13declare {<6 x i32>, <6 x i1>} @llvm.umul.with.overflow.v6i32(<6 x i32>, <6 x i32>) 14declare {<8 x i32>, <8 x i1>} @llvm.umul.with.overflow.v8i32(<8 x i32>, <8 x i32>) 15declare {<16 x i32>, <16 x i1>} @llvm.umul.with.overflow.v16i32(<16 x i32>, <16 x i32>) 16 17declare {<16 x i8>, <16 x i1>} @llvm.umul.with.overflow.v16i8(<16 x i8>, <16 x i8>) 18declare {<32 x i8>, <32 x i1>} @llvm.umul.with.overflow.v32i8(<32 x i8>, <32 x i8>) 19declare {<64 x i8>, <64 x i1>} @llvm.umul.with.overflow.v64i8(<64 x i8>, <64 x i8>) 20declare {<8 x i16>, <8 x i1>} @llvm.umul.with.overflow.v8i16(<8 x i16>, <8 x i16>) 21declare {<2 x i64>, <2 x i1>} @llvm.umul.with.overflow.v2i64(<2 x i64>, <2 x i64>) 22 23declare {<4 x i24>, <4 x i1>} @llvm.umul.with.overflow.v4i24(<4 x i24>, <4 x i24>) 24declare {<4 x i1>, <4 x i1>} @llvm.umul.with.overflow.v4i1(<4 x i1>, <4 x i1>) 25declare {<2 x i128>, <2 x i1>} @llvm.umul.with.overflow.v2i128(<2 x i128>, <2 x i128>) 26 27define <1 x i32> @umulo_v1i32(<1 x i32> %a0, <1 x i32> %a1, <1 x i32>* %p2) nounwind { 28; SSE-LABEL: umulo_v1i32: 29; SSE: # %bb.0: 30; SSE-NEXT: movq %rdx, %rcx 31; SSE-NEXT: movl %edi, %eax 32; SSE-NEXT: xorl %edi, %edi 33; SSE-NEXT: mull %esi 34; SSE-NEXT: seto %dil 35; SSE-NEXT: negl %edi 36; SSE-NEXT: movl %eax, (%rcx) 37; SSE-NEXT: movl %edi, %eax 38; SSE-NEXT: retq 39; 40; AVX-LABEL: umulo_v1i32: 41; AVX: # %bb.0: 42; AVX-NEXT: movq %rdx, %rcx 43; AVX-NEXT: movl %edi, %eax 44; AVX-NEXT: xorl %edi, %edi 45; AVX-NEXT: mull %esi 46; AVX-NEXT: seto %dil 47; AVX-NEXT: negl %edi 48; AVX-NEXT: movl %eax, (%rcx) 49; AVX-NEXT: movl %edi, %eax 50; AVX-NEXT: retq 51 %t = call {<1 x i32>, <1 x i1>} @llvm.umul.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1) 52 %val = extractvalue {<1 x i32>, <1 x i1>} %t, 0 53 %obit = extractvalue {<1 x i32>, <1 x i1>} %t, 1 54 %res = sext <1 x i1> %obit to <1 x i32> 55 store <1 x i32> %val, <1 x i32>* %p2 56 ret <1 x i32> %res 57} 58 59define <2 x i32> @umulo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind { 60; SSE2-LABEL: umulo_v2i32: 61; SSE2: # %bb.0: 62; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 63; SSE2-NEXT: pmuludq %xmm1, %xmm0 64; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] 65; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] 66; SSE2-NEXT: pmuludq %xmm2, %xmm4 67; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] 68; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 69; SSE2-NEXT: pxor %xmm2, %xmm2 70; SSE2-NEXT: 
pcmpeqd %xmm3, %xmm2 71; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 72; SSE2-NEXT: pxor %xmm2, %xmm1 73; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 74; SSE2-NEXT: movq %xmm0, (%rdi) 75; SSE2-NEXT: movdqa %xmm1, %xmm0 76; SSE2-NEXT: retq 77; 78; SSSE3-LABEL: umulo_v2i32: 79; SSSE3: # %bb.0: 80; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 81; SSSE3-NEXT: pmuludq %xmm1, %xmm0 82; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] 83; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] 84; SSSE3-NEXT: pmuludq %xmm2, %xmm4 85; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] 86; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 87; SSSE3-NEXT: pxor %xmm2, %xmm2 88; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 89; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 90; SSSE3-NEXT: pxor %xmm2, %xmm1 91; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 92; SSSE3-NEXT: movq %xmm0, (%rdi) 93; SSSE3-NEXT: movdqa %xmm1, %xmm0 94; SSSE3-NEXT: retq 95; 96; SSE41-LABEL: umulo_v2i32: 97; SSE41: # %bb.0: 98; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 99; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 100; SSE41-NEXT: pmuludq %xmm2, %xmm3 101; SSE41-NEXT: movdqa %xmm0, %xmm2 102; SSE41-NEXT: pmuludq %xmm1, %xmm2 103; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 104; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 105; SSE41-NEXT: pxor %xmm3, %xmm3 106; SSE41-NEXT: pcmpeqd %xmm2, %xmm3 107; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 108; SSE41-NEXT: pxor %xmm3, %xmm2 109; SSE41-NEXT: pmulld %xmm1, %xmm0 110; SSE41-NEXT: movq %xmm0, (%rdi) 111; SSE41-NEXT: movdqa %xmm2, %xmm0 112; SSE41-NEXT: retq 113; 114; AVX1-LABEL: umulo_v2i32: 115; AVX1: # %bb.0: 116; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 117; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 118; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 119; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm3 120; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 121; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] 122; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 123; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 124; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 125; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 126; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 127; AVX1-NEXT: vmovq %xmm0, (%rdi) 128; AVX1-NEXT: vmovdqa %xmm2, %xmm0 129; AVX1-NEXT: retq 130; 131; AVX2-LABEL: umulo_v2i32: 132; AVX2: # %bb.0: 133; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 134; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 135; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 136; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm3 137; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 138; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3] 139; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 140; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 141; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 142; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 143; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 144; AVX2-NEXT: vmovq %xmm0, (%rdi) 145; AVX2-NEXT: vmovdqa %xmm2, %xmm0 146; AVX2-NEXT: retq 147; 148; AVX512-LABEL: umulo_v2i32: 149; AVX512: # %bb.0: 150; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 151; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] 152; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] 153; AVX512-NEXT: vpmuludq %xmm3, %xmm4, %xmm3 154; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7] 155; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4 156; AVX512-NEXT: vptestmd %xmm4, %xmm4, %k1 157; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1 158; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 
159; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} 160; AVX512-NEXT: vmovq %xmm1, (%rdi) 161; AVX512-NEXT: retq 162 %t = call {<2 x i32>, <2 x i1>} @llvm.umul.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1) 163 %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0 164 %obit = extractvalue {<2 x i32>, <2 x i1>} %t, 1 165 %res = sext <2 x i1> %obit to <2 x i32> 166 store <2 x i32> %val, <2 x i32>* %p2 167 ret <2 x i32> %res 168} 169 170define <3 x i32> @umulo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) nounwind { 171; SSE2-LABEL: umulo_v3i32: 172; SSE2: # %bb.0: 173; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 174; SSE2-NEXT: pmuludq %xmm1, %xmm0 175; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] 176; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] 177; SSE2-NEXT: pmuludq %xmm2, %xmm4 178; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] 179; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 180; SSE2-NEXT: pxor %xmm2, %xmm2 181; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 182; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 183; SSE2-NEXT: pxor %xmm2, %xmm1 184; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] 185; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 186; SSE2-NEXT: movd %xmm2, 8(%rdi) 187; SSE2-NEXT: movq %xmm0, (%rdi) 188; SSE2-NEXT: movdqa %xmm1, %xmm0 189; SSE2-NEXT: retq 190; 191; SSSE3-LABEL: umulo_v3i32: 192; SSSE3: # %bb.0: 193; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 194; SSSE3-NEXT: pmuludq %xmm1, %xmm0 195; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] 196; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] 197; SSSE3-NEXT: pmuludq %xmm2, %xmm4 198; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] 199; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 200; SSSE3-NEXT: pxor %xmm2, %xmm2 201; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 202; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 203; SSSE3-NEXT: pxor %xmm2, %xmm1 204; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] 205; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 206; SSSE3-NEXT: movd %xmm2, 8(%rdi) 207; SSSE3-NEXT: movq %xmm0, (%rdi) 208; SSSE3-NEXT: movdqa %xmm1, %xmm0 209; SSSE3-NEXT: retq 210; 211; SSE41-LABEL: umulo_v3i32: 212; SSE41: # %bb.0: 213; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 214; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 215; SSE41-NEXT: pmuludq %xmm2, %xmm3 216; SSE41-NEXT: movdqa %xmm0, %xmm2 217; SSE41-NEXT: pmuludq %xmm1, %xmm2 218; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 219; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 220; SSE41-NEXT: pxor %xmm3, %xmm3 221; SSE41-NEXT: pcmpeqd %xmm2, %xmm3 222; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 223; SSE41-NEXT: pxor %xmm3, %xmm2 224; SSE41-NEXT: pmulld %xmm1, %xmm0 225; SSE41-NEXT: pextrd $2, %xmm0, 8(%rdi) 226; SSE41-NEXT: movq %xmm0, (%rdi) 227; SSE41-NEXT: movdqa %xmm2, %xmm0 228; SSE41-NEXT: retq 229; 230; AVX1-LABEL: umulo_v3i32: 231; AVX1: # %bb.0: 232; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 233; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 234; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 235; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm3 236; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 237; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] 238; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 239; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 240; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 241; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 242; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 243; AVX1-NEXT: vpextrd $2, %xmm0, 8(%rdi) 244; 
AVX1-NEXT: vmovq %xmm0, (%rdi) 245; AVX1-NEXT: vmovdqa %xmm2, %xmm0 246; AVX1-NEXT: retq 247; 248; AVX2-LABEL: umulo_v3i32: 249; AVX2: # %bb.0: 250; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 251; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 252; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 253; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm3 254; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 255; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3] 256; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 257; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 258; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 259; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 260; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 261; AVX2-NEXT: vpextrd $2, %xmm0, 8(%rdi) 262; AVX2-NEXT: vmovq %xmm0, (%rdi) 263; AVX2-NEXT: vmovdqa %xmm2, %xmm0 264; AVX2-NEXT: retq 265; 266; AVX512-LABEL: umulo_v3i32: 267; AVX512: # %bb.0: 268; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 269; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] 270; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] 271; AVX512-NEXT: vpmuludq %xmm3, %xmm4, %xmm3 272; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7] 273; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4 274; AVX512-NEXT: vptestmd %xmm4, %xmm4, %k1 275; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1 276; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 277; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} 278; AVX512-NEXT: vpextrd $2, %xmm1, 8(%rdi) 279; AVX512-NEXT: vmovq %xmm1, (%rdi) 280; AVX512-NEXT: retq 281 %t = call {<3 x i32>, <3 x i1>} @llvm.umul.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1) 282 %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0 283 %obit = extractvalue {<3 x i32>, <3 x i1>} %t, 1 284 %res = sext <3 x i1> %obit to <3 x i32> 285 store <3 x i32> %val, <3 x i32>* %p2 286 ret <3 x i32> %res 287} 288 289define <4 x i32> @umulo_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>* %p2) nounwind { 290; SSE2-LABEL: umulo_v4i32: 291; SSE2: # %bb.0: 292; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 293; SSE2-NEXT: pmuludq %xmm1, %xmm0 294; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] 295; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] 296; SSE2-NEXT: pmuludq %xmm2, %xmm4 297; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] 298; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 299; SSE2-NEXT: pxor %xmm2, %xmm2 300; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 301; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 302; SSE2-NEXT: pxor %xmm2, %xmm1 303; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 304; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] 305; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 306; SSE2-NEXT: movdqa %xmm0, (%rdi) 307; SSE2-NEXT: movdqa %xmm1, %xmm0 308; SSE2-NEXT: retq 309; 310; SSSE3-LABEL: umulo_v4i32: 311; SSSE3: # %bb.0: 312; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 313; SSSE3-NEXT: pmuludq %xmm1, %xmm0 314; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] 315; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] 316; SSSE3-NEXT: pmuludq %xmm2, %xmm4 317; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] 318; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 319; SSSE3-NEXT: pxor %xmm2, %xmm2 320; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 321; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 322; SSSE3-NEXT: pxor %xmm2, %xmm1 323; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 324; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] 325; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 326; SSSE3-NEXT: movdqa %xmm0, (%rdi) 327; SSSE3-NEXT: movdqa %xmm1, %xmm0 328; 
SSSE3-NEXT: retq 329; 330; SSE41-LABEL: umulo_v4i32: 331; SSE41: # %bb.0: 332; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 333; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 334; SSE41-NEXT: pmuludq %xmm2, %xmm3 335; SSE41-NEXT: movdqa %xmm0, %xmm2 336; SSE41-NEXT: pmuludq %xmm1, %xmm2 337; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 338; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 339; SSE41-NEXT: pxor %xmm3, %xmm3 340; SSE41-NEXT: pcmpeqd %xmm2, %xmm3 341; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 342; SSE41-NEXT: pxor %xmm3, %xmm2 343; SSE41-NEXT: pmulld %xmm1, %xmm0 344; SSE41-NEXT: movdqa %xmm0, (%rdi) 345; SSE41-NEXT: movdqa %xmm2, %xmm0 346; SSE41-NEXT: retq 347; 348; AVX1-LABEL: umulo_v4i32: 349; AVX1: # %bb.0: 350; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 351; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 352; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 353; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm3 354; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 355; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] 356; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 357; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 358; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 359; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 360; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 361; AVX1-NEXT: vmovdqa %xmm0, (%rdi) 362; AVX1-NEXT: vmovdqa %xmm2, %xmm0 363; AVX1-NEXT: retq 364; 365; AVX2-LABEL: umulo_v4i32: 366; AVX2: # %bb.0: 367; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 368; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 369; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 370; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm3 371; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 372; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3] 373; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 374; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 375; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 376; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 377; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 378; AVX2-NEXT: vmovdqa %xmm0, (%rdi) 379; AVX2-NEXT: vmovdqa %xmm2, %xmm0 380; AVX2-NEXT: retq 381; 382; AVX512-LABEL: umulo_v4i32: 383; AVX512: # %bb.0: 384; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 385; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] 386; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] 387; AVX512-NEXT: vpmuludq %xmm3, %xmm4, %xmm3 388; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7] 389; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4 390; AVX512-NEXT: vptestmd %xmm4, %xmm4, %k1 391; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1 392; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 393; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} 394; AVX512-NEXT: vmovdqa %xmm1, (%rdi) 395; AVX512-NEXT: retq 396 %t = call {<4 x i32>, <4 x i1>} @llvm.umul.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1) 397 %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0 398 %obit = extractvalue {<4 x i32>, <4 x i1>} %t, 1 399 %res = sext <4 x i1> %obit to <4 x i32> 400 store <4 x i32> %val, <4 x i32>* %p2 401 ret <4 x i32> %res 402} 403 404define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) nounwind { 405; SSE2-LABEL: umulo_v6i32: 406; SSE2: # %bb.0: 407; SSE2-NEXT: movq %rdi, %rax 408; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 409; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 410; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 411; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 412; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero 413; SSE2-NEXT: punpckldq 
{{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 414; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] 415; SSE2-NEXT: movd %r8d, %xmm0 416; SSE2-NEXT: movd %ecx, %xmm1 417; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 418; SSE2-NEXT: movd %edx, %xmm0 419; SSE2-NEXT: movd %esi, %xmm3 420; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] 421; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] 422; SSE2-NEXT: movd %r9d, %xmm1 423; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx 424; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 425; SSE2-NEXT: pmuludq %xmm1, %xmm0 426; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] 427; SSE2-NEXT: pmuludq %xmm2, %xmm3 428; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] 429; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 430; SSE2-NEXT: pmuludq %xmm4, %xmm2 431; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3] 432; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] 433; SSE2-NEXT: pxor %xmm4, %xmm4 434; SSE2-NEXT: pcmpeqd %xmm4, %xmm1 435; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 436; SSE2-NEXT: pxor %xmm5, %xmm1 437; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 438; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 439; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 440; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero 441; SSE2-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero 442; SSE2-NEXT: pmuludq %xmm2, %xmm6 443; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3] 444; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,3,2,3] 445; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] 446; SSE2-NEXT: pcmpeqd %xmm4, %xmm7 447; SSE2-NEXT: pxor %xmm5, %xmm7 448; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] 449; SSE2-NEXT: movq %xmm0, 16(%rcx) 450; SSE2-NEXT: movdqa %xmm3, (%rcx) 451; SSE2-NEXT: movq %xmm7, 16(%rdi) 452; SSE2-NEXT: movdqa %xmm1, (%rdi) 453; SSE2-NEXT: retq 454; 455; SSSE3-LABEL: umulo_v6i32: 456; SSSE3: # %bb.0: 457; SSSE3-NEXT: movq %rdi, %rax 458; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 459; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 460; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 461; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 462; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero 463; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 464; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] 465; SSSE3-NEXT: movd %r8d, %xmm0 466; SSSE3-NEXT: movd %ecx, %xmm1 467; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 468; SSSE3-NEXT: movd %edx, %xmm0 469; SSSE3-NEXT: movd %esi, %xmm3 470; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] 471; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] 472; SSSE3-NEXT: movd %r9d, %xmm1 473; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx 474; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 475; SSSE3-NEXT: pmuludq %xmm1, %xmm0 476; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] 477; SSSE3-NEXT: pmuludq %xmm2, %xmm3 478; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] 479; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 480; SSSE3-NEXT: pmuludq %xmm4, %xmm2 481; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3] 482; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] 483; SSSE3-NEXT: pxor %xmm4, %xmm4 484; SSSE3-NEXT: pcmpeqd %xmm4, %xmm1 485; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5 486; SSSE3-NEXT: pxor %xmm5, %xmm1 487; SSSE3-NEXT: pshufd 
{{.*#+}} xmm3 = xmm3[0,2,2,3] 488; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 489; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 490; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero 491; SSSE3-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero 492; SSSE3-NEXT: pmuludq %xmm2, %xmm6 493; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3] 494; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,3,2,3] 495; SSSE3-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] 496; SSSE3-NEXT: pcmpeqd %xmm4, %xmm7 497; SSSE3-NEXT: pxor %xmm5, %xmm7 498; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] 499; SSSE3-NEXT: movq %xmm0, 16(%rcx) 500; SSSE3-NEXT: movdqa %xmm3, (%rcx) 501; SSSE3-NEXT: movq %xmm7, 16(%rdi) 502; SSSE3-NEXT: movdqa %xmm1, (%rdi) 503; SSSE3-NEXT: retq 504; 505; SSE41-LABEL: umulo_v6i32: 506; SSE41: # %bb.0: 507; SSE41-NEXT: movq %rdi, %rax 508; SSE41-NEXT: movd %esi, %xmm2 509; SSE41-NEXT: pinsrd $1, %edx, %xmm2 510; SSE41-NEXT: pinsrd $2, %ecx, %xmm2 511; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 512; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm1 513; SSE41-NEXT: pinsrd $2, {{[0-9]+}}(%rsp), %xmm1 514; SSE41-NEXT: movdqa %xmm1, %xmm0 515; SSE41-NEXT: pmuludq %xmm2, %xmm1 516; SSE41-NEXT: pinsrd $3, %r8d, %xmm2 517; SSE41-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero 518; SSE41-NEXT: movd %r9d, %xmm4 519; SSE41-NEXT: movdqa %xmm4, %xmm5 520; SSE41-NEXT: pmuludq %xmm3, %xmm4 521; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm3 522; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm5 523; SSE41-NEXT: pmulld %xmm3, %xmm5 524; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm0 525; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rcx 526; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] 527; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] 528; SSE41-NEXT: pmuludq %xmm3, %xmm6 529; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 530; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7] 531; SSE41-NEXT: pxor %xmm8, %xmm8 532; SSE41-NEXT: pcmpeqd %xmm8, %xmm1 533; SSE41-NEXT: pcmpeqd %xmm6, %xmm6 534; SSE41-NEXT: pxor %xmm6, %xmm1 535; SSE41-NEXT: movd {{.*#+}} xmm7 = mem[0],zero,zero,zero 536; SSE41-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero 537; SSE41-NEXT: pmuludq %xmm7, %xmm3 538; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 539; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] 540; SSE41-NEXT: pcmpeqd %xmm8, %xmm4 541; SSE41-NEXT: pxor %xmm6, %xmm4 542; SSE41-NEXT: pmulld %xmm2, %xmm0 543; SSE41-NEXT: movq %xmm5, 16(%rcx) 544; SSE41-NEXT: movdqa %xmm0, (%rcx) 545; SSE41-NEXT: movq %xmm4, 16(%rdi) 546; SSE41-NEXT: movdqa %xmm1, (%rdi) 547; SSE41-NEXT: retq 548; 549; AVX1-LABEL: umulo_v6i32: 550; AVX1: # %bb.0: 551; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 552; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] 553; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 554; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] 555; AVX1-NEXT: vpmuludq %xmm2, %xmm5, %xmm2 556; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm5 557; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] 558; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3],xmm5[4,5],xmm2[6,7] 559; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 560; AVX1-NEXT: vpcmpeqd %xmm2, %xmm8, %xmm2 561; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 562; AVX1-NEXT: vpxor %xmm6, %xmm2, %xmm2 563; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] 564; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] 565; AVX1-NEXT: vpmuludq %xmm7, %xmm5, %xmm5 566; AVX1-NEXT: vpmuludq %xmm1, 
%xmm0, %xmm7 567; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] 568; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3],xmm7[4,5],xmm5[6,7] 569; AVX1-NEXT: vpcmpeqd %xmm5, %xmm8, %xmm5 570; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm5 571; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 572; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 573; AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm1 574; AVX1-NEXT: vmovq %xmm1, 16(%rdi) 575; AVX1-NEXT: vmovdqa %xmm0, (%rdi) 576; AVX1-NEXT: vmovaps %ymm2, %ymm0 577; AVX1-NEXT: retq 578; 579; AVX2-LABEL: umulo_v6i32: 580; AVX2: # %bb.0: 581; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] 582; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] 583; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm2 584; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm3 585; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,3,3,5,5,7,7] 586; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] 587; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 588; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 589; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 590; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 591; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 592; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 593; AVX2-NEXT: vmovq %xmm1, 16(%rdi) 594; AVX2-NEXT: vmovdqa %xmm0, (%rdi) 595; AVX2-NEXT: vmovdqa %ymm2, %ymm0 596; AVX2-NEXT: retq 597; 598; AVX512-LABEL: umulo_v6i32: 599; AVX512: # %bb.0: 600; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 601; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7] 602; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7] 603; AVX512-NEXT: vpmuludq %ymm3, %ymm4, %ymm3 604; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15] 605; AVX512-NEXT: vpermi2d %ymm3, %ymm2, %ymm4 606; AVX512-NEXT: vptestmd %ymm4, %ymm4, %k1 607; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm1 608; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 609; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} 610; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 611; AVX512-NEXT: vmovq %xmm2, 16(%rdi) 612; AVX512-NEXT: vmovdqa %xmm1, (%rdi) 613; AVX512-NEXT: retq 614 %t = call {<6 x i32>, <6 x i1>} @llvm.umul.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1) 615 %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0 616 %obit = extractvalue {<6 x i32>, <6 x i1>} %t, 1 617 %res = sext <6 x i1> %obit to <6 x i32> 618 store <6 x i32> %val, <6 x i32>* %p2 619 ret <6 x i32> %res 620} 621 622define <8 x i32> @umulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) nounwind { 623; SSE2-LABEL: umulo_v8i32: 624; SSE2: # %bb.0: 625; SSE2-NEXT: movdqa %xmm0, %xmm4 626; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] 627; SSE2-NEXT: pmuludq %xmm2, %xmm4 628; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,3,2,3] 629; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] 630; SSE2-NEXT: pmuludq %xmm5, %xmm6 631; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3] 632; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 633; SSE2-NEXT: pxor %xmm8, %xmm8 634; SSE2-NEXT: pcmpeqd %xmm8, %xmm0 635; SSE2-NEXT: pcmpeqd %xmm7, %xmm7 636; SSE2-NEXT: pxor %xmm7, %xmm0 637; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] 638; SSE2-NEXT: pmuludq %xmm3, %xmm1 639; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] 640; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 641; SSE2-NEXT: pmuludq %xmm5, %xmm3 642; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3] 643; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] 644; SSE2-NEXT: pcmpeqd %xmm8, %xmm2 645; SSE2-NEXT: pxor %xmm7, %xmm2 646; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 
647; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,2,2,3] 648; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] 649; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 650; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 651; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] 652; SSE2-NEXT: movdqa %xmm1, 16(%rdi) 653; SSE2-NEXT: movdqa %xmm4, (%rdi) 654; SSE2-NEXT: movdqa %xmm2, %xmm1 655; SSE2-NEXT: retq 656; 657; SSSE3-LABEL: umulo_v8i32: 658; SSSE3: # %bb.0: 659; SSSE3-NEXT: movdqa %xmm0, %xmm4 660; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] 661; SSSE3-NEXT: pmuludq %xmm2, %xmm4 662; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,3,2,3] 663; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] 664; SSSE3-NEXT: pmuludq %xmm5, %xmm6 665; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3] 666; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 667; SSSE3-NEXT: pxor %xmm8, %xmm8 668; SSSE3-NEXT: pcmpeqd %xmm8, %xmm0 669; SSSE3-NEXT: pcmpeqd %xmm7, %xmm7 670; SSSE3-NEXT: pxor %xmm7, %xmm0 671; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] 672; SSSE3-NEXT: pmuludq %xmm3, %xmm1 673; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] 674; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 675; SSSE3-NEXT: pmuludq %xmm5, %xmm3 676; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3] 677; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] 678; SSSE3-NEXT: pcmpeqd %xmm8, %xmm2 679; SSSE3-NEXT: pxor %xmm7, %xmm2 680; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 681; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,2,2,3] 682; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] 683; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 684; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 685; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] 686; SSSE3-NEXT: movdqa %xmm1, 16(%rdi) 687; SSSE3-NEXT: movdqa %xmm4, (%rdi) 688; SSSE3-NEXT: movdqa %xmm2, %xmm1 689; SSSE3-NEXT: retq 690; 691; SSE41-LABEL: umulo_v8i32: 692; SSE41: # %bb.0: 693; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] 694; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] 695; SSE41-NEXT: pmuludq %xmm4, %xmm5 696; SSE41-NEXT: movdqa %xmm0, %xmm4 697; SSE41-NEXT: pmuludq %xmm2, %xmm4 698; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 699; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7] 700; SSE41-NEXT: pxor %xmm8, %xmm8 701; SSE41-NEXT: pcmpeqd %xmm8, %xmm4 702; SSE41-NEXT: pcmpeqd %xmm7, %xmm7 703; SSE41-NEXT: pxor %xmm7, %xmm4 704; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] 705; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] 706; SSE41-NEXT: pmuludq %xmm5, %xmm6 707; SSE41-NEXT: movdqa %xmm1, %xmm5 708; SSE41-NEXT: pmuludq %xmm3, %xmm5 709; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] 710; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7] 711; SSE41-NEXT: pcmpeqd %xmm8, %xmm5 712; SSE41-NEXT: pxor %xmm7, %xmm5 713; SSE41-NEXT: pmulld %xmm2, %xmm0 714; SSE41-NEXT: pmulld %xmm3, %xmm1 715; SSE41-NEXT: movdqa %xmm1, 16(%rdi) 716; SSE41-NEXT: movdqa %xmm0, (%rdi) 717; SSE41-NEXT: movdqa %xmm4, %xmm0 718; SSE41-NEXT: movdqa %xmm5, %xmm1 719; SSE41-NEXT: retq 720; 721; AVX1-LABEL: umulo_v8i32: 722; AVX1: # %bb.0: 723; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 724; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] 725; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 726; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] 727; AVX1-NEXT: vpmuludq %xmm2, %xmm5, %xmm2 728; AVX1-NEXT: vpmuludq 
%xmm3, %xmm4, %xmm5 729; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] 730; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3],xmm5[4,5],xmm2[6,7] 731; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 732; AVX1-NEXT: vpcmpeqd %xmm2, %xmm8, %xmm2 733; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 734; AVX1-NEXT: vpxor %xmm6, %xmm2, %xmm2 735; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] 736; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] 737; AVX1-NEXT: vpmuludq %xmm7, %xmm5, %xmm5 738; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm7 739; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] 740; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3],xmm7[4,5],xmm5[6,7] 741; AVX1-NEXT: vpcmpeqd %xmm5, %xmm8, %xmm5 742; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm5 743; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 744; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 745; AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm1 746; AVX1-NEXT: vmovdqa %xmm1, 16(%rdi) 747; AVX1-NEXT: vmovdqa %xmm0, (%rdi) 748; AVX1-NEXT: vmovaps %ymm2, %ymm0 749; AVX1-NEXT: retq 750; 751; AVX2-LABEL: umulo_v8i32: 752; AVX2: # %bb.0: 753; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] 754; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] 755; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm2 756; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm3 757; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,3,3,5,5,7,7] 758; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] 759; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 760; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 761; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 762; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 763; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 764; AVX2-NEXT: vmovdqa %ymm0, (%rdi) 765; AVX2-NEXT: vmovdqa %ymm2, %ymm0 766; AVX2-NEXT: retq 767; 768; AVX512-LABEL: umulo_v8i32: 769; AVX512: # %bb.0: 770; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 771; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7] 772; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7] 773; AVX512-NEXT: vpmuludq %ymm3, %ymm4, %ymm3 774; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15] 775; AVX512-NEXT: vpermi2d %ymm3, %ymm2, %ymm4 776; AVX512-NEXT: vptestmd %ymm4, %ymm4, %k1 777; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm1 778; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 779; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} 780; AVX512-NEXT: vmovdqa %ymm1, (%rdi) 781; AVX512-NEXT: retq 782 %t = call {<8 x i32>, <8 x i1>} @llvm.umul.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1) 783 %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0 784 %obit = extractvalue {<8 x i32>, <8 x i1>} %t, 1 785 %res = sext <8 x i1> %obit to <8 x i32> 786 store <8 x i32> %val, <8 x i32>* %p2 787 ret <8 x i32> %res 788} 789 790define <16 x i32> @umulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) nounwind { 791; SSE2-LABEL: umulo_v16i32: 792; SSE2: # %bb.0: 793; SSE2-NEXT: movdqa %xmm0, %xmm8 794; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3] 795; SSE2-NEXT: pmuludq %xmm4, %xmm8 796; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,3,2,3] 797; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] 798; SSE2-NEXT: pmuludq %xmm10, %xmm9 799; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,3,2,3] 800; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 801; SSE2-NEXT: pxor %xmm10, %xmm10 802; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 803; SSE2-NEXT: pcmpeqd %xmm11, %xmm11 804; SSE2-NEXT: pxor %xmm11, %xmm0 805; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm1[1,1,3,3] 806; SSE2-NEXT: pmuludq %xmm5, %xmm1 807; SSE2-NEXT: pshufd {{.*#+}} 
xmm15 = xmm1[1,3,2,3] 808; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm5[1,1,3,3] 809; SSE2-NEXT: pmuludq %xmm13, %xmm12 810; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm12[1,3,2,3] 811; SSE2-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1] 812; SSE2-NEXT: pcmpeqd %xmm10, %xmm15 813; SSE2-NEXT: pxor %xmm11, %xmm15 814; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm2[1,1,3,3] 815; SSE2-NEXT: pmuludq %xmm6, %xmm2 816; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,3,2,3] 817; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm6[1,1,3,3] 818; SSE2-NEXT: pmuludq %xmm14, %xmm13 819; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm13[1,3,2,3] 820; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] 821; SSE2-NEXT: pcmpeqd %xmm10, %xmm5 822; SSE2-NEXT: pxor %xmm11, %xmm5 823; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm3[1,1,3,3] 824; SSE2-NEXT: pmuludq %xmm7, %xmm3 825; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,3,2,3] 826; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] 827; SSE2-NEXT: pmuludq %xmm14, %xmm7 828; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,3,2,3] 829; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] 830; SSE2-NEXT: pcmpeqd %xmm10, %xmm6 831; SSE2-NEXT: pxor %xmm11, %xmm6 832; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] 833; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,2,2,3] 834; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] 835; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 836; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,2,2,3] 837; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] 838; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 839; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,2,2,3] 840; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] 841; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 842; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,2,2,3] 843; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] 844; SSE2-NEXT: movdqa %xmm3, 48(%rdi) 845; SSE2-NEXT: movdqa %xmm2, 32(%rdi) 846; SSE2-NEXT: movdqa %xmm1, 16(%rdi) 847; SSE2-NEXT: movdqa %xmm8, (%rdi) 848; SSE2-NEXT: movdqa %xmm15, %xmm1 849; SSE2-NEXT: movdqa %xmm5, %xmm2 850; SSE2-NEXT: movdqa %xmm6, %xmm3 851; SSE2-NEXT: retq 852; 853; SSSE3-LABEL: umulo_v16i32: 854; SSSE3: # %bb.0: 855; SSSE3-NEXT: movdqa %xmm0, %xmm8 856; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3] 857; SSSE3-NEXT: pmuludq %xmm4, %xmm8 858; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,3,2,3] 859; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] 860; SSSE3-NEXT: pmuludq %xmm10, %xmm9 861; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,3,2,3] 862; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 863; SSSE3-NEXT: pxor %xmm10, %xmm10 864; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0 865; SSSE3-NEXT: pcmpeqd %xmm11, %xmm11 866; SSSE3-NEXT: pxor %xmm11, %xmm0 867; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm1[1,1,3,3] 868; SSSE3-NEXT: pmuludq %xmm5, %xmm1 869; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm1[1,3,2,3] 870; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm5[1,1,3,3] 871; SSSE3-NEXT: pmuludq %xmm13, %xmm12 872; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm12[1,3,2,3] 873; SSSE3-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1] 874; SSSE3-NEXT: pcmpeqd %xmm10, %xmm15 875; SSSE3-NEXT: pxor %xmm11, %xmm15 876; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm2[1,1,3,3] 877; SSSE3-NEXT: pmuludq %xmm6, %xmm2 878; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,3,2,3] 879; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm6[1,1,3,3] 880; SSSE3-NEXT: pmuludq %xmm14, %xmm13 881; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = 
xmm13[1,3,2,3] 882; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] 883; SSSE3-NEXT: pcmpeqd %xmm10, %xmm5 884; SSSE3-NEXT: pxor %xmm11, %xmm5 885; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm3[1,1,3,3] 886; SSSE3-NEXT: pmuludq %xmm7, %xmm3 887; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,3,2,3] 888; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] 889; SSSE3-NEXT: pmuludq %xmm14, %xmm7 890; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,3,2,3] 891; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] 892; SSSE3-NEXT: pcmpeqd %xmm10, %xmm6 893; SSSE3-NEXT: pxor %xmm11, %xmm6 894; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] 895; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,2,2,3] 896; SSSE3-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] 897; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 898; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,2,2,3] 899; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] 900; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 901; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,2,2,3] 902; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] 903; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 904; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,2,2,3] 905; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] 906; SSSE3-NEXT: movdqa %xmm3, 48(%rdi) 907; SSSE3-NEXT: movdqa %xmm2, 32(%rdi) 908; SSSE3-NEXT: movdqa %xmm1, 16(%rdi) 909; SSSE3-NEXT: movdqa %xmm8, (%rdi) 910; SSSE3-NEXT: movdqa %xmm15, %xmm1 911; SSSE3-NEXT: movdqa %xmm5, %xmm2 912; SSSE3-NEXT: movdqa %xmm6, %xmm3 913; SSSE3-NEXT: retq 914; 915; SSE41-LABEL: umulo_v16i32: 916; SSE41: # %bb.0: 917; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3] 918; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] 919; SSE41-NEXT: pmuludq %xmm8, %xmm9 920; SSE41-NEXT: movdqa %xmm0, %xmm8 921; SSE41-NEXT: pmuludq %xmm4, %xmm8 922; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] 923; SSE41-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5],xmm9[6,7] 924; SSE41-NEXT: pxor %xmm12, %xmm12 925; SSE41-NEXT: pcmpeqd %xmm12, %xmm8 926; SSE41-NEXT: pcmpeqd %xmm13, %xmm13 927; SSE41-NEXT: pxor %xmm13, %xmm8 928; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] 929; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3] 930; SSE41-NEXT: pmuludq %xmm9, %xmm10 931; SSE41-NEXT: movdqa %xmm1, %xmm9 932; SSE41-NEXT: pmuludq %xmm5, %xmm9 933; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] 934; SSE41-NEXT: pblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3],xmm9[4,5],xmm10[6,7] 935; SSE41-NEXT: pcmpeqd %xmm12, %xmm9 936; SSE41-NEXT: pxor %xmm13, %xmm9 937; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm6[1,1,3,3] 938; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm2[1,1,3,3] 939; SSE41-NEXT: pmuludq %xmm10, %xmm11 940; SSE41-NEXT: movdqa %xmm2, %xmm10 941; SSE41-NEXT: pmuludq %xmm6, %xmm10 942; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] 943; SSE41-NEXT: pblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3],xmm10[4,5],xmm11[6,7] 944; SSE41-NEXT: pcmpeqd %xmm12, %xmm10 945; SSE41-NEXT: pxor %xmm13, %xmm10 946; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm7[1,1,3,3] 947; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm3[1,1,3,3] 948; SSE41-NEXT: pmuludq %xmm11, %xmm14 949; SSE41-NEXT: movdqa %xmm3, %xmm11 950; SSE41-NEXT: pmuludq %xmm7, %xmm11 951; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3] 952; SSE41-NEXT: pblendw {{.*#+}} xmm11 = xmm11[0,1],xmm14[2,3],xmm11[4,5],xmm14[6,7] 953; SSE41-NEXT: pcmpeqd %xmm12, %xmm11 954; SSE41-NEXT: pxor %xmm13, %xmm11 955; SSE41-NEXT: pmulld 
%xmm4, %xmm0 956; SSE41-NEXT: pmulld %xmm5, %xmm1 957; SSE41-NEXT: pmulld %xmm6, %xmm2 958; SSE41-NEXT: pmulld %xmm7, %xmm3 959; SSE41-NEXT: movdqa %xmm3, 48(%rdi) 960; SSE41-NEXT: movdqa %xmm2, 32(%rdi) 961; SSE41-NEXT: movdqa %xmm1, 16(%rdi) 962; SSE41-NEXT: movdqa %xmm0, (%rdi) 963; SSE41-NEXT: movdqa %xmm8, %xmm0 964; SSE41-NEXT: movdqa %xmm9, %xmm1 965; SSE41-NEXT: movdqa %xmm10, %xmm2 966; SSE41-NEXT: movdqa %xmm11, %xmm3 967; SSE41-NEXT: retq 968; 969; AVX1-LABEL: umulo_v16i32: 970; AVX1: # %bb.0: 971; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm10 972; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[1,1,3,3] 973; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm12 974; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[1,1,3,3] 975; AVX1-NEXT: vpmuludq %xmm6, %xmm7, %xmm6 976; AVX1-NEXT: vpmuludq %xmm10, %xmm12, %xmm7 977; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] 978; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm6[2,3],xmm7[4,5],xmm6[6,7] 979; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 980; AVX1-NEXT: vpcmpeqd %xmm7, %xmm8, %xmm7 981; AVX1-NEXT: vpcmpeqd %xmm9, %xmm9, %xmm9 982; AVX1-NEXT: vpxor %xmm7, %xmm9, %xmm7 983; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,3,3] 984; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] 985; AVX1-NEXT: vpmuludq %xmm6, %xmm4, %xmm4 986; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm6 987; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] 988; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3],xmm6[4,5],xmm4[6,7] 989; AVX1-NEXT: vpcmpeqd %xmm4, %xmm8, %xmm4 990; AVX1-NEXT: vpxor %xmm4, %xmm9, %xmm4 991; AVX1-NEXT: vpackssdw %xmm7, %xmm4, %xmm11 992; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 993; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] 994; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 995; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] 996; AVX1-NEXT: vpmuludq %xmm7, %xmm5, %xmm5 997; AVX1-NEXT: vpmuludq %xmm6, %xmm4, %xmm7 998; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] 999; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3],xmm7[4,5],xmm5[6,7] 1000; AVX1-NEXT: vpcmpeqd %xmm5, %xmm8, %xmm5 1001; AVX1-NEXT: vpxor %xmm5, %xmm9, %xmm13 1002; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] 1003; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] 1004; AVX1-NEXT: vpmuludq %xmm7, %xmm5, %xmm5 1005; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm7 1006; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] 1007; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3],xmm7[4,5],xmm5[6,7] 1008; AVX1-NEXT: vpcmpeqd %xmm5, %xmm8, %xmm5 1009; AVX1-NEXT: vpxor %xmm5, %xmm9, %xmm5 1010; AVX1-NEXT: vpackssdw %xmm13, %xmm5, %xmm5 1011; AVX1-NEXT: vpacksswb %xmm11, %xmm5, %xmm5 1012; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm2 1013; AVX1-NEXT: vpmulld %xmm6, %xmm4, %xmm4 1014; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm3 1015; AVX1-NEXT: vpmulld %xmm10, %xmm12, %xmm6 1016; AVX1-NEXT: vpmovsxbd %xmm5, %xmm0 1017; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] 1018; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 1019; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1020; AVX1-NEXT: vpacksswb %xmm11, %xmm11, %xmm1 1021; AVX1-NEXT: vpmovsxbd %xmm1, %xmm5 1022; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] 1023; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 1024; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 1025; AVX1-NEXT: vmovdqa %xmm6, 48(%rdi) 1026; AVX1-NEXT: vmovdqa %xmm3, 32(%rdi) 1027; AVX1-NEXT: vmovdqa %xmm4, 16(%rdi) 1028; AVX1-NEXT: vmovdqa %xmm2, (%rdi) 1029; AVX1-NEXT: retq 1030; 1031; AVX2-LABEL: umulo_v16i32: 1032; AVX2: # %bb.0: 1033; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[1,1,3,3,5,5,7,7] 1034; AVX2-NEXT: 
vpshufd {{.*#+}} ymm5 = ymm1[1,1,3,3,5,5,7,7] 1035; AVX2-NEXT: vpmuludq %ymm4, %ymm5, %ymm4 1036; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm5 1037; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,1,3,3,5,5,7,7] 1038; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7] 1039; AVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5 1040; AVX2-NEXT: vpcmpeqd %ymm5, %ymm4, %ymm4 1041; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6 1042; AVX2-NEXT: vpxor %ymm6, %ymm4, %ymm4 1043; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm7 1044; AVX2-NEXT: vpackssdw %xmm7, %xmm4, %xmm4 1045; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[1,1,3,3,5,5,7,7] 1046; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[1,1,3,3,5,5,7,7] 1047; AVX2-NEXT: vpmuludq %ymm7, %ymm8, %ymm7 1048; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm8 1049; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[1,1,3,3,5,5,7,7] 1050; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4],ymm7[5],ymm8[6],ymm7[7] 1051; AVX2-NEXT: vpcmpeqd %ymm5, %ymm7, %ymm5 1052; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5 1053; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 1054; AVX2-NEXT: vpackssdw %xmm6, %xmm5, %xmm5 1055; AVX2-NEXT: vpacksswb %xmm5, %xmm5, %xmm5 1056; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm2 1057; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm3 1058; AVX2-NEXT: vpmovsxbd %xmm5, %ymm0 1059; AVX2-NEXT: vpacksswb %xmm4, %xmm4, %xmm1 1060; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 1061; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi) 1062; AVX2-NEXT: vmovdqa %ymm2, (%rdi) 1063; AVX2-NEXT: retq 1064; 1065; AVX512-LABEL: umulo_v16i32: 1066; AVX512: # %bb.0: 1067; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm2 1068; AVX512-NEXT: vpshufd {{.*#+}} zmm3 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] 1069; AVX512-NEXT: vpshufd {{.*#+}} zmm4 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] 1070; AVX512-NEXT: vpmuludq %zmm3, %zmm4, %zmm3 1071; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] 1072; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 1073; AVX512-NEXT: vptestmd %zmm4, %zmm4, %k1 1074; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm1 1075; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} 1076; AVX512-NEXT: vmovdqa64 %zmm1, (%rdi) 1077; AVX512-NEXT: retq 1078 %t = call {<16 x i32>, <16 x i1>} @llvm.umul.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1) 1079 %val = extractvalue {<16 x i32>, <16 x i1>} %t, 0 1080 %obit = extractvalue {<16 x i32>, <16 x i1>} %t, 1 1081 %res = sext <16 x i1> %obit to <16 x i32> 1082 store <16 x i32> %val, <16 x i32>* %p2 1083 ret <16 x i32> %res 1084} 1085 1086define <16 x i32> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nounwind { 1087; SSE2-LABEL: umulo_v16i8: 1088; SSE2: # %bb.0: 1089; SSE2-NEXT: movdqa %xmm1, %xmm2 1090; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1091; SSE2-NEXT: movdqa %xmm0, %xmm3 1092; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1093; SSE2-NEXT: pmullw %xmm2, %xmm3 1094; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 1095; SSE2-NEXT: pand %xmm2, %xmm3 1096; SSE2-NEXT: movdqa %xmm1, %xmm5 1097; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1098; SSE2-NEXT: movdqa %xmm0, %xmm4 1099; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1100; SSE2-NEXT: pmullw %xmm5, %xmm4 1101; SSE2-NEXT: pand %xmm2, %xmm4 1102; SSE2-NEXT: packuswb %xmm3, %xmm4 1103; SSE2-NEXT: pxor %xmm2, %xmm2 1104; SSE2-NEXT: movdqa %xmm1, %xmm3 1105; SSE2-NEXT: punpckhbw 
{{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 1106; SSE2-NEXT: movdqa %xmm0, %xmm5 1107; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] 1108; SSE2-NEXT: pmullw %xmm3, %xmm5 1109; SSE2-NEXT: psrlw $8, %xmm5 1110; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 1111; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 1112; SSE2-NEXT: pmullw %xmm1, %xmm0 1113; SSE2-NEXT: psrlw $8, %xmm0 1114; SSE2-NEXT: packuswb %xmm5, %xmm0 1115; SSE2-NEXT: pcmpeqb %xmm2, %xmm0 1116; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 1117; SSE2-NEXT: pxor %xmm0, %xmm3 1118; SSE2-NEXT: movdqa %xmm3, %xmm1 1119; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1120; SSE2-NEXT: movdqa %xmm1, %xmm0 1121; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1122; SSE2-NEXT: pslld $31, %xmm0 1123; SSE2-NEXT: psrad $31, %xmm0 1124; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 1125; SSE2-NEXT: pslld $31, %xmm1 1126; SSE2-NEXT: psrad $31, %xmm1 1127; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1128; SSE2-NEXT: movdqa %xmm3, %xmm2 1129; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] 1130; SSE2-NEXT: pslld $31, %xmm2 1131; SSE2-NEXT: psrad $31, %xmm2 1132; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] 1133; SSE2-NEXT: pslld $31, %xmm3 1134; SSE2-NEXT: psrad $31, %xmm3 1135; SSE2-NEXT: movdqa %xmm4, (%rdi) 1136; SSE2-NEXT: retq 1137; 1138; SSSE3-LABEL: umulo_v16i8: 1139; SSSE3: # %bb.0: 1140; SSSE3-NEXT: movdqa %xmm1, %xmm2 1141; SSSE3-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1142; SSSE3-NEXT: movdqa %xmm0, %xmm3 1143; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1144; SSSE3-NEXT: pmullw %xmm2, %xmm3 1145; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 1146; SSSE3-NEXT: pand %xmm2, %xmm3 1147; SSSE3-NEXT: movdqa %xmm1, %xmm5 1148; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1149; SSSE3-NEXT: movdqa %xmm0, %xmm4 1150; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1151; SSSE3-NEXT: pmullw %xmm5, %xmm4 1152; SSSE3-NEXT: pand %xmm2, %xmm4 1153; SSSE3-NEXT: packuswb %xmm3, %xmm4 1154; SSSE3-NEXT: pxor %xmm2, %xmm2 1155; SSSE3-NEXT: movdqa %xmm1, %xmm3 1156; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 1157; SSSE3-NEXT: movdqa %xmm0, %xmm5 1158; SSSE3-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] 1159; SSSE3-NEXT: pmullw %xmm3, %xmm5 1160; SSSE3-NEXT: psrlw $8, %xmm5 1161; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 1162; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 1163; SSSE3-NEXT: pmullw %xmm1, %xmm0 1164; SSSE3-NEXT: psrlw $8, %xmm0 1165; SSSE3-NEXT: packuswb %xmm5, %xmm0 1166; SSSE3-NEXT: pcmpeqb %xmm2, %xmm0 1167; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3 1168; SSSE3-NEXT: pxor %xmm0, %xmm3 1169; SSSE3-NEXT: movdqa %xmm3, %xmm1 1170; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1171; SSSE3-NEXT: movdqa %xmm1, %xmm0 1172; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1173; SSSE3-NEXT: pslld $31, %xmm0 1174; SSSE3-NEXT: psrad $31, %xmm0 1175; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 1176; SSSE3-NEXT: pslld $31, %xmm1 1177; SSSE3-NEXT: psrad $31, %xmm1 1178; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1179; SSSE3-NEXT: movdqa %xmm3, %xmm2 1180; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] 1181; SSSE3-NEXT: pslld $31, %xmm2 1182; SSSE3-NEXT: psrad $31, %xmm2 1183; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] 1184; SSSE3-NEXT: pslld $31, %xmm3 1185; SSSE3-NEXT: psrad $31, %xmm3 1186; SSSE3-NEXT: movdqa %xmm4, (%rdi) 1187; SSSE3-NEXT: retq 1188; 1189; SSE41-LABEL: umulo_v16i8: 1190; SSE41: # %bb.0: 1191; SSE41-NEXT: movdqa %xmm1, %xmm2 1192; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1193; SSE41-NEXT: movdqa %xmm0, %xmm3 1194; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1195; SSE41-NEXT: pmullw %xmm2, %xmm3 1196; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] 1197; SSE41-NEXT: pand %xmm4, %xmm3 1198; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 1199; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1200; SSE41-NEXT: pmullw %xmm2, %xmm5 1201; SSE41-NEXT: pand %xmm5, %xmm4 1202; SSE41-NEXT: packuswb %xmm3, %xmm4 1203; SSE41-NEXT: pxor %xmm2, %xmm2 1204; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] 1205; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 1206; SSE41-NEXT: pmullw %xmm1, %xmm0 1207; SSE41-NEXT: psrlw $8, %xmm0 1208; SSE41-NEXT: psrlw $8, %xmm5 1209; SSE41-NEXT: packuswb %xmm0, %xmm5 1210; SSE41-NEXT: pcmpeqb %xmm2, %xmm5 1211; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 1212; SSE41-NEXT: pxor %xmm5, %xmm3 1213; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero 1214; SSE41-NEXT: pslld $31, %xmm0 1215; SSE41-NEXT: psrad $31, %xmm0 1216; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] 1217; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 1218; SSE41-NEXT: pslld $31, %xmm1 1219; SSE41-NEXT: psrad $31, %xmm1 1220; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] 1221; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero 1222; SSE41-NEXT: pslld $31, %xmm2 1223; SSE41-NEXT: psrad $31, %xmm2 1224; SSE41-NEXT: pshufd {{.*#+}} 
xmm3 = xmm3[3,3,3,3] 1225; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero 1226; SSE41-NEXT: pslld $31, %xmm3 1227; SSE41-NEXT: psrad $31, %xmm3 1228; SSE41-NEXT: movdqa %xmm4, (%rdi) 1229; SSE41-NEXT: retq 1230; 1231; AVX1-LABEL: umulo_v16i8: 1232; AVX1: # %bb.0: 1233; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1234; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1235; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2 1236; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] 1237; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 1238; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 1239; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1240; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4 1241; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3 1242; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2 1243; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 1244; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] 1245; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] 1246; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1247; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 1248; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm1 1249; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 1250; AVX1-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm0 1251; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 1252; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm1 1253; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0 1254; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] 1255; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 1256; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 1257; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] 1258; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 1259; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] 1260; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 1261; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 1262; AVX1-NEXT: vmovdqa %xmm2, (%rdi) 1263; AVX1-NEXT: retq 1264; 1265; AVX2-LABEL: umulo_v16i8: 1266; AVX2: # %bb.0: 1267; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 1268; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1269; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1270; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm1 1271; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 1272; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm2 1273; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 1274; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1275; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1276; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 1277; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 1278; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 1279; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1 1280; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0 1281; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 1282; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 1283; 
AVX2-NEXT: vmovdqa %xmm2, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: umulo_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512-NEXT: vpmullw %ymm1, %ymm0, %ymm1
; AVX512-NEXT: vpsrlw $8, %ymm1, %ymm0
; AVX512-NEXT: vpmovwb %ymm0, %xmm0
; AVX512-NEXT: vptestmb %xmm0, %xmm0, %k1
; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: vpmovwb %ymm1, (%rdi)
; AVX512-NEXT: retq
  %t = call {<16 x i8>, <16 x i1>} @llvm.umul.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
  %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0
  %obit = extractvalue {<16 x i8>, <16 x i1>} %t, 1
  %res = sext <16 x i1> %obit to <16 x i32>
  store <16 x i8> %val, <16 x i8>* %p2
  ret <16 x i32> %res
}

define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nounwind {
; SSE2-LABEL: umulo_v32i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movq %rdi, %rax
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: pmullw %xmm4, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm8, %xmm5
; SSE2-NEXT: movdqa %xmm2, %xmm7
; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: movdqa %xmm0, %xmm9
; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm7, %xmm9
; SSE2-NEXT: pand %xmm8, %xmm9
; SSE2-NEXT: packuswb %xmm5, %xmm9
; SSE2-NEXT: movdqa %xmm3, %xmm5
; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: movdqa %xmm1, %xmm7
; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: pmullw %xmm5, %xmm7
; SSE2-NEXT: pand %xmm8, %xmm7
; SSE2-NEXT: movdqa %xmm3, %xmm6
; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: movdqa %xmm1, %xmm10
; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm6, %xmm10
; SSE2-NEXT: pand %xmm8, %xmm10
; SSE2-NEXT: packuswb %xmm7, %xmm10
; SSE2-NEXT: pxor %xmm6, %xmm6
; SSE2-NEXT: movdqa %xmm3, %xmm7
; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15]
; SSE2-NEXT: pmullw %xmm7, %xmm4
; SSE2-NEXT: psrlw $8, %xmm4
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 =
xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] 1344; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] 1345; SSE2-NEXT: pmullw %xmm3, %xmm1 1346; SSE2-NEXT: psrlw $8, %xmm1 1347; SSE2-NEXT: packuswb %xmm4, %xmm1 1348; SSE2-NEXT: pcmpeqb %xmm6, %xmm1 1349; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 1350; SSE2-NEXT: pxor %xmm3, %xmm1 1351; SSE2-NEXT: movdqa %xmm2, %xmm4 1352; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] 1353; SSE2-NEXT: movdqa %xmm0, %xmm7 1354; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] 1355; SSE2-NEXT: pmullw %xmm4, %xmm7 1356; SSE2-NEXT: psrlw $8, %xmm7 1357; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] 1358; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] 1359; SSE2-NEXT: pmullw %xmm2, %xmm0 1360; SSE2-NEXT: psrlw $8, %xmm0 1361; SSE2-NEXT: packuswb %xmm7, %xmm0 1362; SSE2-NEXT: pcmpeqb %xmm6, %xmm0 1363; SSE2-NEXT: pxor %xmm3, %xmm0 1364; SSE2-NEXT: movdqa %xmm0, %xmm2 1365; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1366; SSE2-NEXT: movdqa %xmm2, %xmm3 1367; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] 1368; SSE2-NEXT: pslld $31, %xmm3 1369; SSE2-NEXT: psrad $31, %xmm3 1370; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] 1371; SSE2-NEXT: pslld $31, %xmm2 1372; SSE2-NEXT: psrad $31, %xmm2 1373; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1374; SSE2-NEXT: movdqa %xmm0, %xmm6 1375; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3] 1376; SSE2-NEXT: pslld $31, %xmm6 1377; SSE2-NEXT: psrad $31, %xmm6 1378; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] 1379; SSE2-NEXT: pslld $31, %xmm0 1380; SSE2-NEXT: psrad $31, %xmm0 1381; SSE2-NEXT: movdqa %xmm1, %xmm4 1382; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1383; SSE2-NEXT: movdqa %xmm4, %xmm7 1384; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3] 1385; SSE2-NEXT: pslld $31, %xmm7 1386; SSE2-NEXT: psrad $31, %xmm7 1387; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] 1388; SSE2-NEXT: pslld $31, %xmm4 1389; SSE2-NEXT: psrad $31, %xmm4 1390; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1391; SSE2-NEXT: movdqa %xmm1, %xmm5 1392; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] 1393; SSE2-NEXT: pslld $31, %xmm5 1394; SSE2-NEXT: psrad $31, %xmm5 1395; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 1396; SSE2-NEXT: pslld $31, %xmm1 1397; SSE2-NEXT: psrad $31, %xmm1 1398; SSE2-NEXT: movdqa %xmm10, 16(%rsi) 1399; SSE2-NEXT: movdqa %xmm9, (%rsi) 1400; SSE2-NEXT: movdqa %xmm1, 112(%rdi) 1401; SSE2-NEXT: movdqa %xmm5, 96(%rdi) 1402; SSE2-NEXT: movdqa %xmm4, 80(%rdi) 1403; SSE2-NEXT: movdqa %xmm7, 64(%rdi) 1404; SSE2-NEXT: movdqa %xmm0, 48(%rdi) 1405; SSE2-NEXT: movdqa %xmm6, 32(%rdi) 1406; SSE2-NEXT: movdqa %xmm2, 16(%rdi) 1407; 
SSE2-NEXT: movdqa %xmm3, (%rdi) 1408; SSE2-NEXT: retq 1409; 1410; SSSE3-LABEL: umulo_v32i8: 1411; SSSE3: # %bb.0: 1412; SSSE3-NEXT: movq %rdi, %rax 1413; SSSE3-NEXT: movdqa %xmm2, %xmm4 1414; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1415; SSSE3-NEXT: movdqa %xmm0, %xmm5 1416; SSSE3-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1417; SSSE3-NEXT: pmullw %xmm4, %xmm5 1418; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] 1419; SSSE3-NEXT: pand %xmm8, %xmm5 1420; SSSE3-NEXT: movdqa %xmm2, %xmm7 1421; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1422; SSSE3-NEXT: movdqa %xmm0, %xmm9 1423; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1424; SSSE3-NEXT: pmullw %xmm7, %xmm9 1425; SSSE3-NEXT: pand %xmm8, %xmm9 1426; SSSE3-NEXT: packuswb %xmm5, %xmm9 1427; SSSE3-NEXT: movdqa %xmm3, %xmm5 1428; SSSE3-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1429; SSSE3-NEXT: movdqa %xmm1, %xmm7 1430; SSSE3-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1431; SSSE3-NEXT: pmullw %xmm5, %xmm7 1432; SSSE3-NEXT: pand %xmm8, %xmm7 1433; SSSE3-NEXT: movdqa %xmm3, %xmm6 1434; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1435; SSSE3-NEXT: movdqa %xmm1, %xmm10 1436; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1437; SSSE3-NEXT: pmullw %xmm6, %xmm10 1438; SSSE3-NEXT: pand %xmm8, %xmm10 1439; SSSE3-NEXT: packuswb %xmm7, %xmm10 1440; SSSE3-NEXT: pxor %xmm6, %xmm6 1441; SSSE3-NEXT: movdqa %xmm3, %xmm7 1442; SSSE3-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] 1443; SSSE3-NEXT: movdqa %xmm1, %xmm4 1444; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] 1445; SSSE3-NEXT: pmullw %xmm7, %xmm4 1446; SSSE3-NEXT: psrlw $8, %xmm4 1447; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] 1448; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] 1449; SSSE3-NEXT: pmullw %xmm3, %xmm1 1450; SSSE3-NEXT: psrlw $8, %xmm1 1451; SSSE3-NEXT: packuswb %xmm4, %xmm1 1452; SSSE3-NEXT: pcmpeqb %xmm6, %xmm1 1453; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3 1454; SSSE3-NEXT: pxor %xmm3, %xmm1 1455; SSSE3-NEXT: movdqa %xmm2, %xmm4 1456; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] 1457; SSSE3-NEXT: movdqa %xmm0, %xmm7 1458; SSSE3-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] 1459; SSSE3-NEXT: pmullw %xmm4, %xmm7 1460; SSSE3-NEXT: psrlw $8, %xmm7 1461; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] 1462; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] 1463; SSSE3-NEXT: pmullw %xmm2, %xmm0 1464; SSSE3-NEXT: psrlw $8, %xmm0 1465; SSSE3-NEXT: packuswb %xmm7, %xmm0 1466; SSSE3-NEXT: pcmpeqb %xmm6, %xmm0 1467; SSSE3-NEXT: pxor %xmm3, %xmm0 1468; SSSE3-NEXT: movdqa %xmm0, %xmm2 1469; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1470; SSSE3-NEXT: movdqa %xmm2, %xmm3 1471; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] 1472; SSSE3-NEXT: pslld $31, %xmm3 1473; SSSE3-NEXT: psrad $31, %xmm3 1474; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] 1475; SSSE3-NEXT: pslld $31, %xmm2 1476; SSSE3-NEXT: psrad $31, %xmm2 1477; SSSE3-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1478; SSSE3-NEXT: movdqa %xmm0, %xmm6 1479; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3] 1480; SSSE3-NEXT: pslld $31, %xmm6 1481; SSSE3-NEXT: psrad $31, %xmm6 1482; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] 1483; SSSE3-NEXT: pslld $31, %xmm0 1484; SSSE3-NEXT: psrad $31, %xmm0 1485; SSSE3-NEXT: movdqa %xmm1, %xmm4 1486; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1487; SSSE3-NEXT: movdqa %xmm4, %xmm7 1488; SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3] 1489; SSSE3-NEXT: pslld $31, %xmm7 1490; SSSE3-NEXT: psrad $31, %xmm7 1491; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] 1492; SSSE3-NEXT: pslld $31, %xmm4 1493; SSSE3-NEXT: psrad $31, %xmm4 1494; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1495; SSSE3-NEXT: movdqa %xmm1, %xmm5 1496; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] 1497; SSSE3-NEXT: pslld $31, %xmm5 1498; SSSE3-NEXT: psrad $31, %xmm5 1499; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 1500; SSSE3-NEXT: pslld $31, %xmm1 1501; SSSE3-NEXT: psrad $31, %xmm1 1502; SSSE3-NEXT: movdqa %xmm10, 16(%rsi) 1503; SSSE3-NEXT: movdqa %xmm9, (%rsi) 1504; SSSE3-NEXT: movdqa %xmm1, 112(%rdi) 1505; SSSE3-NEXT: movdqa %xmm5, 96(%rdi) 1506; SSSE3-NEXT: movdqa %xmm4, 80(%rdi) 1507; SSSE3-NEXT: movdqa %xmm7, 64(%rdi) 1508; SSSE3-NEXT: movdqa %xmm0, 48(%rdi) 1509; SSSE3-NEXT: movdqa %xmm6, 32(%rdi) 1510; SSSE3-NEXT: movdqa %xmm2, 16(%rdi) 1511; SSSE3-NEXT: movdqa %xmm3, (%rdi) 1512; SSSE3-NEXT: retq 1513; 1514; SSE41-LABEL: umulo_v32i8: 1515; SSE41: # %bb.0: 1516; SSE41-NEXT: movq %rdi, %rax 1517; SSE41-NEXT: movdqa %xmm2, %xmm4 1518; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1519; SSE41-NEXT: movdqa %xmm0, %xmm7 1520; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1521; SSE41-NEXT: pmullw %xmm4, %xmm7 1522; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255] 1523; SSE41-NEXT: pand %xmm9, %xmm7 1524; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 1525; SSE41-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1526; SSE41-NEXT: pmullw %xmm5, %xmm6 1527; SSE41-NEXT: movdqa %xmm6, %xmm8 1528; SSE41-NEXT: pand %xmm9, %xmm8 1529; SSE41-NEXT: packuswb %xmm7, %xmm8 1530; SSE41-NEXT: movdqa %xmm3, %xmm7 1531; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1532; SSE41-NEXT: movdqa %xmm1, %xmm5 1533; SSE41-NEXT: 
punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1534; SSE41-NEXT: pmullw %xmm7, %xmm5 1535; SSE41-NEXT: pand %xmm9, %xmm5 1536; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 1537; SSE41-NEXT: pmovzxbw {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 1538; SSE41-NEXT: pmullw %xmm4, %xmm7 1539; SSE41-NEXT: pand %xmm7, %xmm9 1540; SSE41-NEXT: packuswb %xmm5, %xmm9 1541; SSE41-NEXT: pxor %xmm4, %xmm4 1542; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] 1543; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] 1544; SSE41-NEXT: pmullw %xmm3, %xmm1 1545; SSE41-NEXT: psrlw $8, %xmm1 1546; SSE41-NEXT: psrlw $8, %xmm7 1547; SSE41-NEXT: packuswb %xmm1, %xmm7 1548; SSE41-NEXT: pcmpeqb %xmm4, %xmm7 1549; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 1550; SSE41-NEXT: pxor %xmm1, %xmm7 1551; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] 1552; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] 1553; SSE41-NEXT: pmullw %xmm2, %xmm0 1554; SSE41-NEXT: psrlw $8, %xmm0 1555; SSE41-NEXT: psrlw $8, %xmm6 1556; SSE41-NEXT: packuswb %xmm0, %xmm6 1557; SSE41-NEXT: pcmpeqb %xmm4, %xmm6 1558; SSE41-NEXT: pxor %xmm1, %xmm6 1559; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] 1560; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1561; SSE41-NEXT: pslld $31, %xmm0 1562; SSE41-NEXT: psrad $31, %xmm0 1563; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] 1564; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 1565; SSE41-NEXT: pslld $31, %xmm1 1566; SSE41-NEXT: psrad $31, %xmm1 1567; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm6[3,3,3,3] 1568; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero 1569; SSE41-NEXT: pslld $31, %xmm2 1570; SSE41-NEXT: psrad $31, %xmm2 1571; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,1,1] 1572; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero 1573; SSE41-NEXT: pslld $31, %xmm3 1574; SSE41-NEXT: psrad $31, %xmm3 1575; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,3,2,3] 1576; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero 1577; SSE41-NEXT: pslld $31, %xmm4 1578; SSE41-NEXT: psrad $31, %xmm4 1579; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm7[3,3,3,3] 1580; SSE41-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero 1581; SSE41-NEXT: pslld $31, %xmm5 1582; SSE41-NEXT: psrad $31, %xmm5 1583; SSE41-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero 1584; SSE41-NEXT: pslld $31, %xmm6 1585; 
SSE41-NEXT: psrad $31, %xmm6 1586; SSE41-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero 1587; SSE41-NEXT: pslld $31, %xmm7 1588; SSE41-NEXT: psrad $31, %xmm7 1589; SSE41-NEXT: movdqa %xmm9, 16(%rsi) 1590; SSE41-NEXT: movdqa %xmm8, (%rsi) 1591; SSE41-NEXT: movdqa %xmm7, 64(%rdi) 1592; SSE41-NEXT: movdqa %xmm6, (%rdi) 1593; SSE41-NEXT: movdqa %xmm5, 112(%rdi) 1594; SSE41-NEXT: movdqa %xmm4, 96(%rdi) 1595; SSE41-NEXT: movdqa %xmm3, 80(%rdi) 1596; SSE41-NEXT: movdqa %xmm2, 48(%rdi) 1597; SSE41-NEXT: movdqa %xmm1, 32(%rdi) 1598; SSE41-NEXT: movdqa %xmm0, 16(%rdi) 1599; SSE41-NEXT: retq 1600; 1601; AVX1-LABEL: umulo_v32i8: 1602; AVX1: # %bb.0: 1603; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1604; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1605; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2 1606; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] 1607; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 1608; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 1609; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1610; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm9 1611; AVX1-NEXT: vpand %xmm3, %xmm9, %xmm4 1612; AVX1-NEXT: vpackuswb %xmm2, %xmm4, %xmm8 1613; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1614; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1615; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 1616; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1617; AVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4 1618; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4 1619; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 1620; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero 1621; AVX1-NEXT: vpmullw %xmm5, %xmm6, %xmm6 1622; AVX1-NEXT: vpand %xmm3, %xmm6, %xmm3 1623; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm5 1624; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 1625; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] 1626; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] 1627; AVX1-NEXT: vpmullw %xmm2, %xmm4, %xmm2 1628; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 1629; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm4 1630; AVX1-NEXT: vpackuswb %xmm2, %xmm4, %xmm2 1631; AVX1-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2 1632; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 1633; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm6 1634; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] 1635; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] 1636; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1637; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 1638; AVX1-NEXT: vpsrlw $8, %xmm9, %xmm1 1639; AVX1-NEXT: vpackuswb %xmm0, 
%xmm1, %xmm0 1640; AVX1-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm0 1641; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm1 1642; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0 1643; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] 1644; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 1645; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1646; AVX1-NEXT: vpmovsxbd %xmm6, %xmm2 1647; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,1,1] 1648; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 1649; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 1650; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] 1651; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 1652; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] 1653; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 1654; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 1655; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[2,3,2,3] 1656; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 1657; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[3,3,3,3] 1658; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4 1659; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 1660; AVX1-NEXT: vmovdqa %xmm5, 16(%rdi) 1661; AVX1-NEXT: vmovdqa %xmm8, (%rdi) 1662; AVX1-NEXT: retq 1663; 1664; AVX2-LABEL: umulo_v32i8: 1665; AVX2: # %bb.0: 1666; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 1667; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 1668; AVX2-NEXT: vpmullw %ymm2, %ymm3, %ymm2 1669; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 1670; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 1671; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 1672; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 1673; AVX2-NEXT: vpmullw %ymm4, %ymm5, %ymm4 1674; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm3 1675; AVX2-NEXT: vpackuswb %ymm2, %ymm3, %ymm4 1676; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 1677; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31] 1678; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] 1679; AVX2-NEXT: vpmullw %ymm3, %ymm5, %ymm3 1680; AVX2-NEXT: vpsrlw $8, %ymm3, %ymm3 1681; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23] 1682; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] 1683; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1684; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 1685; AVX2-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 1686; 
AVX2-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX2-NEXT: vpmovsxbd %xmm3, %ymm2
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX2-NEXT: vpmovsxbd %xmm3, %ymm3
; AVX2-NEXT: vmovdqa %ymm4, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: umulo_v32i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512-NEXT: vpmullw %zmm1, %zmm0, %zmm2
; AVX512-NEXT: vpsrlw $8, %zmm2, %zmm0
; AVX512-NEXT: vpmovwb %zmm0, %ymm0
; AVX512-NEXT: vptestmb %ymm0, %ymm0, %k1
; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: kshiftrd $16, %k1, %k1
; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512-NEXT: vpmovwb %zmm2, (%rdi)
; AVX512-NEXT: retq
  %t = call {<32 x i8>, <32 x i1>} @llvm.umul.with.overflow.v32i8(<32 x i8> %a0, <32 x i8> %a1)
  %val = extractvalue {<32 x i8>, <32 x i1>} %t, 0
  %obit = extractvalue {<32 x i8>, <32 x i1>} %t, 1
  %res = sext <32 x i1> %obit to <32 x i32>
  store <32 x i8> %val, <32 x i8>* %p2
  ret <32 x i32> %res
}

define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nounwind {
; SSE2-LABEL: umulo_v64i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movq %rdi, %rax
; SSE2-NEXT: movdqa %xmm4, %xmm8
; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: movdqa %xmm0, %xmm9
; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: pmullw %xmm8, %xmm9
; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm12, %xmm9
; SSE2-NEXT: movdqa %xmm4, %xmm10
; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: movdqa %xmm0, %xmm8
; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm10, %xmm8
; SSE2-NEXT: pand %xmm12, %xmm8
; SSE2-NEXT: packuswb %xmm9, %xmm8
; SSE2-NEXT: movdqa %xmm5, %xmm9
; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: movdqa %xmm1, %xmm10
; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: pmullw %xmm9, %xmm10
; SSE2-NEXT: pand %xmm12,
%xmm10 1744; SSE2-NEXT: movdqa %xmm5, %xmm11 1745; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1746; SSE2-NEXT: movdqa %xmm1, %xmm9 1747; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1748; SSE2-NEXT: pmullw %xmm11, %xmm9 1749; SSE2-NEXT: pand %xmm12, %xmm9 1750; SSE2-NEXT: packuswb %xmm10, %xmm9 1751; SSE2-NEXT: movdqa %xmm6, %xmm10 1752; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1753; SSE2-NEXT: movdqa %xmm2, %xmm11 1754; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1755; SSE2-NEXT: pmullw %xmm10, %xmm11 1756; SSE2-NEXT: pand %xmm12, %xmm11 1757; SSE2-NEXT: movdqa %xmm6, %xmm13 1758; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1759; SSE2-NEXT: movdqa %xmm2, %xmm10 1760; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1761; SSE2-NEXT: pmullw %xmm13, %xmm10 1762; SSE2-NEXT: pand %xmm12, %xmm10 1763; SSE2-NEXT: packuswb %xmm11, %xmm10 1764; SSE2-NEXT: movdqa %xmm7, %xmm11 1765; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1766; SSE2-NEXT: movdqa %xmm3, %xmm13 1767; SSE2-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1768; SSE2-NEXT: pmullw %xmm11, %xmm13 1769; SSE2-NEXT: pand %xmm12, %xmm13 1770; SSE2-NEXT: movdqa %xmm7, %xmm14 1771; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1772; SSE2-NEXT: movdqa %xmm3, %xmm11 1773; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1774; SSE2-NEXT: pmullw %xmm14, %xmm11 1775; SSE2-NEXT: pand %xmm12, %xmm11 1776; SSE2-NEXT: packuswb %xmm13, %xmm11 1777; SSE2-NEXT: pxor %xmm12, %xmm12 1778; SSE2-NEXT: movdqa %xmm7, %xmm13 1779; SSE2-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] 1780; SSE2-NEXT: movdqa %xmm3, %xmm14 1781; SSE2-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] 1782; SSE2-NEXT: pmullw %xmm13, %xmm14 1783; SSE2-NEXT: psrlw $8, %xmm14 1784; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3],xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7] 1785; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] 1786; SSE2-NEXT: pmullw %xmm7, %xmm3 1787; SSE2-NEXT: psrlw $8, %xmm3 1788; SSE2-NEXT: packuswb %xmm14, %xmm3 1789; SSE2-NEXT: movdqa %xmm6, %xmm13 1790; SSE2-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] 1791; SSE2-NEXT: movdqa %xmm2, %xmm7 1792; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm12[8],xmm7[9],xmm12[9],xmm7[10],xmm12[10],xmm7[11],xmm12[11],xmm7[12],xmm12[12],xmm7[13],xmm12[13],xmm7[14],xmm12[14],xmm7[15],xmm12[15] 1793; SSE2-NEXT: pmullw %xmm13, %xmm7 1794; SSE2-NEXT: psrlw $8, %xmm7 1795; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] 1796; SSE2-NEXT: 
punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3],xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7] 1797; SSE2-NEXT: pmullw %xmm6, %xmm2 1798; SSE2-NEXT: psrlw $8, %xmm2 1799; SSE2-NEXT: packuswb %xmm7, %xmm2 1800; SSE2-NEXT: movdqa %xmm5, %xmm6 1801; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm12[8],xmm6[9],xmm12[9],xmm6[10],xmm12[10],xmm6[11],xmm12[11],xmm6[12],xmm12[12],xmm6[13],xmm12[13],xmm6[14],xmm12[14],xmm6[15],xmm12[15] 1802; SSE2-NEXT: movdqa %xmm1, %xmm7 1803; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm12[8],xmm7[9],xmm12[9],xmm7[10],xmm12[10],xmm7[11],xmm12[11],xmm7[12],xmm12[12],xmm7[13],xmm12[13],xmm7[14],xmm12[14],xmm7[15],xmm12[15] 1804; SSE2-NEXT: pmullw %xmm6, %xmm7 1805; SSE2-NEXT: psrlw $8, %xmm7 1806; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7] 1807; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3],xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] 1808; SSE2-NEXT: pmullw %xmm5, %xmm1 1809; SSE2-NEXT: psrlw $8, %xmm1 1810; SSE2-NEXT: packuswb %xmm7, %xmm1 1811; SSE2-NEXT: movdqa %xmm4, %xmm5 1812; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm12[8],xmm5[9],xmm12[9],xmm5[10],xmm12[10],xmm5[11],xmm12[11],xmm5[12],xmm12[12],xmm5[13],xmm12[13],xmm5[14],xmm12[14],xmm5[15],xmm12[15] 1813; SSE2-NEXT: movdqa %xmm0, %xmm6 1814; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm12[8],xmm6[9],xmm12[9],xmm6[10],xmm12[10],xmm6[11],xmm12[11],xmm6[12],xmm12[12],xmm6[13],xmm12[13],xmm6[14],xmm12[14],xmm6[15],xmm12[15] 1815; SSE2-NEXT: pmullw %xmm5, %xmm6 1816; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] 1817; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] 1818; SSE2-NEXT: pmullw %xmm4, %xmm0 1819; SSE2-NEXT: psrlw $8, %xmm6 1820; SSE2-NEXT: psrlw $8, %xmm0 1821; SSE2-NEXT: packuswb %xmm6, %xmm0 1822; SSE2-NEXT: pcmpeqb %xmm12, %xmm3 1823; SSE2-NEXT: pcmpeqb %xmm12, %xmm2 1824; SSE2-NEXT: pcmpeqb %xmm12, %xmm1 1825; SSE2-NEXT: pcmpeqb %xmm12, %xmm0 1826; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 1827; SSE2-NEXT: pxor %xmm4, %xmm3 1828; SSE2-NEXT: pxor %xmm4, %xmm2 1829; SSE2-NEXT: pxor %xmm4, %xmm1 1830; SSE2-NEXT: pxor %xmm4, %xmm0 1831; SSE2-NEXT: movdqa %xmm11, 48(%rsi) 1832; SSE2-NEXT: movdqa %xmm10, 32(%rsi) 1833; SSE2-NEXT: movdqa %xmm9, 16(%rsi) 1834; SSE2-NEXT: movdqa %xmm3, %xmm4 1835; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1836; SSE2-NEXT: movdqa %xmm8, (%rsi) 1837; SSE2-NEXT: movdqa %xmm3, %xmm5 1838; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] 1839; SSE2-NEXT: pslld $31, %xmm3 1840; SSE2-NEXT: psrad $31, %xmm3 1841; SSE2-NEXT: movdqa %xmm3, 224(%rdi) 1842; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1843; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] 1844; SSE2-NEXT: pslld $31, %xmm5 1845; SSE2-NEXT: psrad $31, %xmm5 1846; SSE2-NEXT: movdqa %xmm5, 240(%rdi) 1847; SSE2-NEXT: movdqa %xmm4, %xmm3 1848; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] 1849; SSE2-NEXT: pslld $31, %xmm4 1850; SSE2-NEXT: psrad $31, %xmm4 1851; SSE2-NEXT: movdqa %xmm4, 
192(%rdi) 1852; SSE2-NEXT: movdqa %xmm2, %xmm4 1853; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1854; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] 1855; SSE2-NEXT: pslld $31, %xmm3 1856; SSE2-NEXT: psrad $31, %xmm3 1857; SSE2-NEXT: movdqa %xmm3, 208(%rdi) 1858; SSE2-NEXT: movdqa %xmm2, %xmm3 1859; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] 1860; SSE2-NEXT: pslld $31, %xmm2 1861; SSE2-NEXT: psrad $31, %xmm2 1862; SSE2-NEXT: movdqa %xmm2, 160(%rdi) 1863; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1864; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] 1865; SSE2-NEXT: pslld $31, %xmm3 1866; SSE2-NEXT: psrad $31, %xmm3 1867; SSE2-NEXT: movdqa %xmm3, 176(%rdi) 1868; SSE2-NEXT: movdqa %xmm4, %xmm2 1869; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] 1870; SSE2-NEXT: pslld $31, %xmm4 1871; SSE2-NEXT: psrad $31, %xmm4 1872; SSE2-NEXT: movdqa %xmm4, 128(%rdi) 1873; SSE2-NEXT: movdqa %xmm1, %xmm3 1874; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1875; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] 1876; SSE2-NEXT: pslld $31, %xmm2 1877; SSE2-NEXT: psrad $31, %xmm2 1878; SSE2-NEXT: movdqa %xmm2, 144(%rdi) 1879; SSE2-NEXT: movdqa %xmm1, %xmm2 1880; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] 1881; SSE2-NEXT: pslld $31, %xmm1 1882; SSE2-NEXT: psrad $31, %xmm1 1883; SSE2-NEXT: movdqa %xmm1, 96(%rdi) 1884; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1885; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] 1886; SSE2-NEXT: pslld $31, %xmm2 1887; SSE2-NEXT: psrad $31, %xmm2 1888; SSE2-NEXT: movdqa %xmm2, 112(%rdi) 1889; SSE2-NEXT: movdqa %xmm3, %xmm1 1890; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] 1891; SSE2-NEXT: pslld $31, %xmm3 1892; SSE2-NEXT: psrad $31, %xmm3 1893; SSE2-NEXT: movdqa %xmm3, 64(%rdi) 1894; SSE2-NEXT: movdqa %xmm0, %xmm2 1895; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1896; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 1897; SSE2-NEXT: pslld $31, %xmm1 1898; SSE2-NEXT: psrad $31, %xmm1 1899; SSE2-NEXT: movdqa %xmm1, 80(%rdi) 1900; SSE2-NEXT: movdqa %xmm0, %xmm1 1901; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1902; SSE2-NEXT: pslld $31, %xmm0 1903; SSE2-NEXT: psrad $31, %xmm0 1904; SSE2-NEXT: movdqa %xmm0, 32(%rdi) 1905; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1906; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 1907; SSE2-NEXT: pslld $31, %xmm1 1908; SSE2-NEXT: psrad $31, %xmm1 1909; SSE2-NEXT: movdqa %xmm1, 48(%rdi) 1910; SSE2-NEXT: movdqa %xmm2, %xmm0 1911; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] 1912; SSE2-NEXT: pslld $31, %xmm2 1913; SSE2-NEXT: psrad $31, %xmm2 1914; SSE2-NEXT: movdqa %xmm2, (%rdi) 1915; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] 1916; SSE2-NEXT: pslld $31, %xmm0 1917; SSE2-NEXT: psrad $31, %xmm0 1918; SSE2-NEXT: movdqa %xmm0, 16(%rdi) 1919; SSE2-NEXT: retq 1920; 1921; SSSE3-LABEL: umulo_v64i8: 1922; SSSE3: # %bb.0: 1923; SSSE3-NEXT: movq %rdi, %rax 1924; SSSE3-NEXT: movdqa %xmm4, %xmm8 1925; SSSE3-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1926; SSSE3-NEXT: movdqa %xmm0, %xmm9 1927; SSSE3-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1928; SSSE3-NEXT: pmullw %xmm8, %xmm9 1929; SSSE3-NEXT: 
movdqa {{.*#+}} xmm12 = [255,255,255,255,255,255,255,255] 1930; SSSE3-NEXT: pand %xmm12, %xmm9 1931; SSSE3-NEXT: movdqa %xmm4, %xmm10 1932; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1933; SSSE3-NEXT: movdqa %xmm0, %xmm8 1934; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1935; SSSE3-NEXT: pmullw %xmm10, %xmm8 1936; SSSE3-NEXT: pand %xmm12, %xmm8 1937; SSSE3-NEXT: packuswb %xmm9, %xmm8 1938; SSSE3-NEXT: movdqa %xmm5, %xmm9 1939; SSSE3-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1940; SSSE3-NEXT: movdqa %xmm1, %xmm10 1941; SSSE3-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1942; SSSE3-NEXT: pmullw %xmm9, %xmm10 1943; SSSE3-NEXT: pand %xmm12, %xmm10 1944; SSSE3-NEXT: movdqa %xmm5, %xmm11 1945; SSSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1946; SSSE3-NEXT: movdqa %xmm1, %xmm9 1947; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1948; SSSE3-NEXT: pmullw %xmm11, %xmm9 1949; SSSE3-NEXT: pand %xmm12, %xmm9 1950; SSSE3-NEXT: packuswb %xmm10, %xmm9 1951; SSSE3-NEXT: movdqa %xmm6, %xmm10 1952; SSSE3-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1953; SSSE3-NEXT: movdqa %xmm2, %xmm11 1954; SSSE3-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1955; SSSE3-NEXT: pmullw %xmm10, %xmm11 1956; SSSE3-NEXT: pand %xmm12, %xmm11 1957; SSSE3-NEXT: movdqa %xmm6, %xmm13 1958; SSSE3-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1959; SSSE3-NEXT: movdqa %xmm2, %xmm10 1960; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1961; SSSE3-NEXT: pmullw %xmm13, %xmm10 1962; SSSE3-NEXT: pand %xmm12, %xmm10 1963; SSSE3-NEXT: packuswb %xmm11, %xmm10 1964; SSSE3-NEXT: movdqa %xmm7, %xmm11 1965; SSSE3-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1966; SSSE3-NEXT: movdqa %xmm3, %xmm13 1967; SSSE3-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1968; SSSE3-NEXT: pmullw %xmm11, %xmm13 1969; SSSE3-NEXT: pand %xmm12, %xmm13 1970; SSSE3-NEXT: movdqa %xmm7, %xmm14 1971; SSSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1972; SSSE3-NEXT: movdqa %xmm3, %xmm11 1973; SSSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1974; SSSE3-NEXT: pmullw %xmm14, %xmm11 1975; SSSE3-NEXT: pand %xmm12, %xmm11 1976; SSSE3-NEXT: packuswb %xmm13, %xmm11 1977; SSSE3-NEXT: pxor %xmm12, %xmm12 1978; SSSE3-NEXT: movdqa %xmm7, %xmm13 1979; SSSE3-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] 1980; SSSE3-NEXT: movdqa %xmm3, %xmm14 1981; SSSE3-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] 1982; SSSE3-NEXT: pmullw %xmm13, %xmm14 1983; SSSE3-NEXT: psrlw $8, %xmm14 1984; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3],xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7] 1985; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] 1986; 
SSSE3-NEXT: pmullw %xmm7, %xmm3 1987; SSSE3-NEXT: psrlw $8, %xmm3 1988; SSSE3-NEXT: packuswb %xmm14, %xmm3 1989; SSSE3-NEXT: movdqa %xmm6, %xmm13 1990; SSSE3-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] 1991; SSSE3-NEXT: movdqa %xmm2, %xmm7 1992; SSSE3-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm12[8],xmm7[9],xmm12[9],xmm7[10],xmm12[10],xmm7[11],xmm12[11],xmm7[12],xmm12[12],xmm7[13],xmm12[13],xmm7[14],xmm12[14],xmm7[15],xmm12[15] 1993; SSSE3-NEXT: pmullw %xmm13, %xmm7 1994; SSSE3-NEXT: psrlw $8, %xmm7 1995; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] 1996; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3],xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7] 1997; SSSE3-NEXT: pmullw %xmm6, %xmm2 1998; SSSE3-NEXT: psrlw $8, %xmm2 1999; SSSE3-NEXT: packuswb %xmm7, %xmm2 2000; SSSE3-NEXT: movdqa %xmm5, %xmm6 2001; SSSE3-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm12[8],xmm6[9],xmm12[9],xmm6[10],xmm12[10],xmm6[11],xmm12[11],xmm6[12],xmm12[12],xmm6[13],xmm12[13],xmm6[14],xmm12[14],xmm6[15],xmm12[15] 2002; SSSE3-NEXT: movdqa %xmm1, %xmm7 2003; SSSE3-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm12[8],xmm7[9],xmm12[9],xmm7[10],xmm12[10],xmm7[11],xmm12[11],xmm7[12],xmm12[12],xmm7[13],xmm12[13],xmm7[14],xmm12[14],xmm7[15],xmm12[15] 2004; SSSE3-NEXT: pmullw %xmm6, %xmm7 2005; SSSE3-NEXT: psrlw $8, %xmm7 2006; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7] 2007; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3],xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] 2008; SSSE3-NEXT: pmullw %xmm5, %xmm1 2009; SSSE3-NEXT: psrlw $8, %xmm1 2010; SSSE3-NEXT: packuswb %xmm7, %xmm1 2011; SSSE3-NEXT: movdqa %xmm4, %xmm5 2012; SSSE3-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm12[8],xmm5[9],xmm12[9],xmm5[10],xmm12[10],xmm5[11],xmm12[11],xmm5[12],xmm12[12],xmm5[13],xmm12[13],xmm5[14],xmm12[14],xmm5[15],xmm12[15] 2013; SSSE3-NEXT: movdqa %xmm0, %xmm6 2014; SSSE3-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm12[8],xmm6[9],xmm12[9],xmm6[10],xmm12[10],xmm6[11],xmm12[11],xmm6[12],xmm12[12],xmm6[13],xmm12[13],xmm6[14],xmm12[14],xmm6[15],xmm12[15] 2015; SSSE3-NEXT: pmullw %xmm5, %xmm6 2016; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] 2017; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] 2018; SSSE3-NEXT: pmullw %xmm4, %xmm0 2019; SSSE3-NEXT: psrlw $8, %xmm6 2020; SSSE3-NEXT: psrlw $8, %xmm0 2021; SSSE3-NEXT: packuswb %xmm6, %xmm0 2022; SSSE3-NEXT: pcmpeqb %xmm12, %xmm3 2023; SSSE3-NEXT: pcmpeqb %xmm12, %xmm2 2024; SSSE3-NEXT: pcmpeqb %xmm12, %xmm1 2025; SSSE3-NEXT: pcmpeqb %xmm12, %xmm0 2026; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4 2027; SSSE3-NEXT: pxor %xmm4, %xmm3 2028; SSSE3-NEXT: pxor %xmm4, %xmm2 2029; SSSE3-NEXT: pxor %xmm4, %xmm1 2030; SSSE3-NEXT: pxor %xmm4, %xmm0 2031; SSSE3-NEXT: movdqa %xmm11, 48(%rsi) 2032; SSSE3-NEXT: movdqa %xmm10, 
32(%rsi) 2033; SSSE3-NEXT: movdqa %xmm9, 16(%rsi) 2034; SSSE3-NEXT: movdqa %xmm3, %xmm4 2035; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2036; SSSE3-NEXT: movdqa %xmm8, (%rsi) 2037; SSSE3-NEXT: movdqa %xmm3, %xmm5 2038; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] 2039; SSSE3-NEXT: pslld $31, %xmm3 2040; SSSE3-NEXT: psrad $31, %xmm3 2041; SSSE3-NEXT: movdqa %xmm3, 224(%rdi) 2042; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2043; SSSE3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] 2044; SSSE3-NEXT: pslld $31, %xmm5 2045; SSSE3-NEXT: psrad $31, %xmm5 2046; SSSE3-NEXT: movdqa %xmm5, 240(%rdi) 2047; SSSE3-NEXT: movdqa %xmm4, %xmm3 2048; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] 2049; SSSE3-NEXT: pslld $31, %xmm4 2050; SSSE3-NEXT: psrad $31, %xmm4 2051; SSSE3-NEXT: movdqa %xmm4, 192(%rdi) 2052; SSSE3-NEXT: movdqa %xmm2, %xmm4 2053; SSSE3-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2054; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] 2055; SSSE3-NEXT: pslld $31, %xmm3 2056; SSSE3-NEXT: psrad $31, %xmm3 2057; SSSE3-NEXT: movdqa %xmm3, 208(%rdi) 2058; SSSE3-NEXT: movdqa %xmm2, %xmm3 2059; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] 2060; SSSE3-NEXT: pslld $31, %xmm2 2061; SSSE3-NEXT: psrad $31, %xmm2 2062; SSSE3-NEXT: movdqa %xmm2, 160(%rdi) 2063; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2064; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] 2065; SSSE3-NEXT: pslld $31, %xmm3 2066; SSSE3-NEXT: psrad $31, %xmm3 2067; SSSE3-NEXT: movdqa %xmm3, 176(%rdi) 2068; SSSE3-NEXT: movdqa %xmm4, %xmm2 2069; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] 2070; SSSE3-NEXT: pslld $31, %xmm4 2071; SSSE3-NEXT: psrad $31, %xmm4 2072; SSSE3-NEXT: movdqa %xmm4, 128(%rdi) 2073; SSSE3-NEXT: movdqa %xmm1, %xmm3 2074; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2075; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] 2076; SSSE3-NEXT: pslld $31, %xmm2 2077; SSSE3-NEXT: psrad $31, %xmm2 2078; SSSE3-NEXT: movdqa %xmm2, 144(%rdi) 2079; SSSE3-NEXT: movdqa %xmm1, %xmm2 2080; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] 2081; SSSE3-NEXT: pslld $31, %xmm1 2082; SSSE3-NEXT: psrad $31, %xmm1 2083; SSSE3-NEXT: movdqa %xmm1, 96(%rdi) 2084; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2085; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] 2086; SSSE3-NEXT: pslld $31, %xmm2 2087; SSSE3-NEXT: psrad $31, %xmm2 2088; SSSE3-NEXT: movdqa %xmm2, 112(%rdi) 2089; SSSE3-NEXT: movdqa %xmm3, %xmm1 2090; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] 2091; SSSE3-NEXT: pslld $31, %xmm3 2092; SSSE3-NEXT: psrad $31, %xmm3 2093; SSSE3-NEXT: movdqa %xmm3, 64(%rdi) 2094; SSSE3-NEXT: movdqa %xmm0, %xmm2 2095; SSSE3-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2096; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 2097; SSSE3-NEXT: pslld $31, %xmm1 2098; SSSE3-NEXT: psrad $31, %xmm1 2099; SSSE3-NEXT: movdqa %xmm1, 80(%rdi) 2100; SSSE3-NEXT: movdqa %xmm0, %xmm1 2101; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 2102; SSSE3-NEXT: pslld $31, %xmm0 2103; SSSE3-NEXT: psrad $31, %xmm0 2104; SSSE3-NEXT: movdqa %xmm0, 32(%rdi) 2105; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2106; SSSE3-NEXT: punpckhwd 
{{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 2107; SSSE3-NEXT: pslld $31, %xmm1 2108; SSSE3-NEXT: psrad $31, %xmm1 2109; SSSE3-NEXT: movdqa %xmm1, 48(%rdi) 2110; SSSE3-NEXT: movdqa %xmm2, %xmm0 2111; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] 2112; SSSE3-NEXT: pslld $31, %xmm2 2113; SSSE3-NEXT: psrad $31, %xmm2 2114; SSSE3-NEXT: movdqa %xmm2, (%rdi) 2115; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] 2116; SSSE3-NEXT: pslld $31, %xmm0 2117; SSSE3-NEXT: psrad $31, %xmm0 2118; SSSE3-NEXT: movdqa %xmm0, 16(%rdi) 2119; SSSE3-NEXT: retq 2120; 2121; SSE41-LABEL: umulo_v64i8: 2122; SSE41: # %bb.0: 2123; SSE41-NEXT: movdqa %xmm5, %xmm11 2124; SSE41-NEXT: movdqa %xmm4, %xmm13 2125; SSE41-NEXT: movdqa %xmm3, %xmm4 2126; SSE41-NEXT: movdqa %xmm0, %xmm5 2127; SSE41-NEXT: movq %rdi, %rax 2128; SSE41-NEXT: movdqa %xmm13, %xmm8 2129; SSE41-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2130; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2131; SSE41-NEXT: pmullw %xmm8, %xmm0 2132; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255] 2133; SSE41-NEXT: pand %xmm10, %xmm0 2134; SSE41-NEXT: pmovzxbw {{.*#+}} xmm9 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero,xmm13[4],zero,xmm13[5],zero,xmm13[6],zero,xmm13[7],zero 2135; SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero 2136; SSE41-NEXT: pmullw %xmm9, %xmm8 2137; SSE41-NEXT: movdqa %xmm8, %xmm3 2138; SSE41-NEXT: pand %xmm10, %xmm3 2139; SSE41-NEXT: packuswb %xmm0, %xmm3 2140; SSE41-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2141; SSE41-NEXT: movdqa %xmm11, %xmm9 2142; SSE41-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2143; SSE41-NEXT: movdqa %xmm1, %xmm0 2144; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2145; SSE41-NEXT: pmullw %xmm9, %xmm0 2146; SSE41-NEXT: pand %xmm10, %xmm0 2147; SSE41-NEXT: pmovzxbw {{.*#+}} xmm12 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero,xmm11[4],zero,xmm11[5],zero,xmm11[6],zero,xmm11[7],zero 2148; SSE41-NEXT: pmovzxbw {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 2149; SSE41-NEXT: pmullw %xmm12, %xmm9 2150; SSE41-NEXT: movdqa %xmm9, %xmm3 2151; SSE41-NEXT: pand %xmm10, %xmm3 2152; SSE41-NEXT: packuswb %xmm0, %xmm3 2153; SSE41-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2154; SSE41-NEXT: movdqa %xmm6, %xmm12 2155; SSE41-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2156; SSE41-NEXT: movdqa %xmm2, %xmm0 2157; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2158; SSE41-NEXT: pmullw %xmm12, %xmm0 2159; SSE41-NEXT: pand %xmm10, %xmm0 2160; SSE41-NEXT: pmovzxbw {{.*#+}} xmm14 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero 2161; SSE41-NEXT: pmovzxbw {{.*#+}} xmm12 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 2162; SSE41-NEXT: pmullw %xmm14, %xmm12 2163; SSE41-NEXT: movdqa %xmm12, %xmm15 2164; SSE41-NEXT: pand %xmm10, %xmm15 2165; SSE41-NEXT: packuswb %xmm0, %xmm15 2166; SSE41-NEXT: movdqa %xmm7, %xmm0 2167; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2168; SSE41-NEXT: movdqa %xmm4, %xmm3 
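; Note: the SSE4.1 lowering of umulo_v64i8 above and below widens each byte vector to 16-bit
; lanes (pmovzxbw for the low half, punpckhbw for the high half), multiplies with pmullw, and
; packs the low product bytes with packuswb for the stored result; a lane overflows when the
; high byte of its 16-bit product (psrlw $8) is non-zero, which pcmpeqb against zero followed
; by pxor with all-ones turns into the sign-extended i32 mask written to the return slots.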
2169; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2170; SSE41-NEXT: pmullw %xmm0, %xmm3 2171; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero 2172; SSE41-NEXT: pmovzxbw {{.*#+}} xmm14 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero 2173; SSE41-NEXT: pmullw %xmm0, %xmm14 2174; SSE41-NEXT: pand %xmm10, %xmm3 2175; SSE41-NEXT: pand %xmm14, %xmm10 2176; SSE41-NEXT: packuswb %xmm3, %xmm10 2177; SSE41-NEXT: pxor %xmm0, %xmm0 2178; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] 2179; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] 2180; SSE41-NEXT: pmullw %xmm7, %xmm4 2181; SSE41-NEXT: psrlw $8, %xmm4 2182; SSE41-NEXT: psrlw $8, %xmm14 2183; SSE41-NEXT: packuswb %xmm4, %xmm14 2184; SSE41-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] 2185; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 2186; SSE41-NEXT: pmullw %xmm6, %xmm2 2187; SSE41-NEXT: psrlw $8, %xmm2 2188; SSE41-NEXT: psrlw $8, %xmm12 2189; SSE41-NEXT: packuswb %xmm2, %xmm12 2190; SSE41-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm0[8],xmm11[9],xmm0[9],xmm11[10],xmm0[10],xmm11[11],xmm0[11],xmm11[12],xmm0[12],xmm11[13],xmm0[13],xmm11[14],xmm0[14],xmm11[15],xmm0[15] 2191; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 2192; SSE41-NEXT: pmullw %xmm11, %xmm1 2193; SSE41-NEXT: psrlw $8, %xmm1 2194; SSE41-NEXT: psrlw $8, %xmm9 2195; SSE41-NEXT: packuswb %xmm1, %xmm9 2196; SSE41-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm0[8],xmm13[9],xmm0[9],xmm13[10],xmm0[10],xmm13[11],xmm0[11],xmm13[12],xmm0[12],xmm13[13],xmm0[13],xmm13[14],xmm0[14],xmm13[15],xmm0[15] 2197; SSE41-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] 2198; SSE41-NEXT: pmullw %xmm13, %xmm5 2199; SSE41-NEXT: psrlw $8, %xmm5 2200; SSE41-NEXT: psrlw $8, %xmm8 2201; SSE41-NEXT: packuswb %xmm5, %xmm8 2202; SSE41-NEXT: pcmpeqb %xmm0, %xmm14 2203; SSE41-NEXT: pcmpeqb %xmm0, %xmm12 2204; SSE41-NEXT: pcmpeqb %xmm0, %xmm9 2205; SSE41-NEXT: pcmpeqb %xmm0, %xmm8 2206; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 2207; SSE41-NEXT: pxor %xmm0, %xmm14 2208; SSE41-NEXT: pxor %xmm0, %xmm12 2209; SSE41-NEXT: pxor %xmm0, %xmm9 2210; SSE41-NEXT: pxor %xmm0, %xmm8 2211; SSE41-NEXT: movdqa %xmm10, 48(%rsi) 2212; SSE41-NEXT: movdqa %xmm15, 32(%rsi) 2213; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2214; SSE41-NEXT: movaps %xmm0, 16(%rsi) 2215; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2216; SSE41-NEXT: movaps %xmm0, (%rsi) 2217; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero,xmm14[2],zero,zero,zero,xmm14[3],zero,zero,zero 2218; SSE41-NEXT: pslld $31, %xmm0 2219; SSE41-NEXT: psrad 
$31, %xmm0 2220; SSE41-NEXT: movdqa %xmm0, 192(%rdi) 2221; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero 2222; SSE41-NEXT: pslld $31, %xmm0 2223; SSE41-NEXT: psrad $31, %xmm0 2224; SSE41-NEXT: movdqa %xmm0, 128(%rdi) 2225; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero 2226; SSE41-NEXT: pslld $31, %xmm0 2227; SSE41-NEXT: psrad $31, %xmm0 2228; SSE41-NEXT: movdqa %xmm0, 64(%rdi) 2229; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero 2230; SSE41-NEXT: pslld $31, %xmm0 2231; SSE41-NEXT: psrad $31, %xmm0 2232; SSE41-NEXT: movdqa %xmm0, (%rdi) 2233; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,3,2,3] 2234; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2235; SSE41-NEXT: pslld $31, %xmm0 2236; SSE41-NEXT: psrad $31, %xmm0 2237; SSE41-NEXT: movdqa %xmm0, 224(%rdi) 2238; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,3,3,3] 2239; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2240; SSE41-NEXT: pslld $31, %xmm0 2241; SSE41-NEXT: psrad $31, %xmm0 2242; SSE41-NEXT: movdqa %xmm0, 240(%rdi) 2243; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] 2244; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2245; SSE41-NEXT: pslld $31, %xmm0 2246; SSE41-NEXT: psrad $31, %xmm0 2247; SSE41-NEXT: movdqa %xmm0, 208(%rdi) 2248; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3] 2249; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2250; SSE41-NEXT: pslld $31, %xmm0 2251; SSE41-NEXT: psrad $31, %xmm0 2252; SSE41-NEXT: movdqa %xmm0, 160(%rdi) 2253; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3] 2254; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2255; SSE41-NEXT: pslld $31, %xmm0 2256; SSE41-NEXT: psrad $31, %xmm0 2257; SSE41-NEXT: movdqa %xmm0, 176(%rdi) 2258; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] 2259; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2260; SSE41-NEXT: pslld $31, %xmm0 2261; SSE41-NEXT: psrad $31, %xmm0 2262; SSE41-NEXT: movdqa %xmm0, 144(%rdi) 2263; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] 2264; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2265; SSE41-NEXT: pslld $31, %xmm0 2266; SSE41-NEXT: psrad $31, %xmm0 2267; SSE41-NEXT: movdqa %xmm0, 96(%rdi) 2268; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] 2269; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2270; SSE41-NEXT: pslld $31, %xmm0 2271; SSE41-NEXT: psrad $31, %xmm0 2272; SSE41-NEXT: movdqa %xmm0, 112(%rdi) 2273; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] 2274; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2275; SSE41-NEXT: pslld $31, %xmm0 2276; SSE41-NEXT: psrad $31, %xmm0 2277; SSE41-NEXT: movdqa %xmm0, 80(%rdi) 2278; SSE41-NEXT: pshufd {{.*#+}} xmm0 
= xmm8[2,3,2,3] 2279; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2280; SSE41-NEXT: pslld $31, %xmm0 2281; SSE41-NEXT: psrad $31, %xmm0 2282; SSE41-NEXT: movdqa %xmm0, 32(%rdi) 2283; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] 2284; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2285; SSE41-NEXT: pslld $31, %xmm0 2286; SSE41-NEXT: psrad $31, %xmm0 2287; SSE41-NEXT: movdqa %xmm0, 48(%rdi) 2288; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] 2289; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2290; SSE41-NEXT: pslld $31, %xmm0 2291; SSE41-NEXT: psrad $31, %xmm0 2292; SSE41-NEXT: movdqa %xmm0, 16(%rdi) 2293; SSE41-NEXT: retq 2294; 2295; AVX1-LABEL: umulo_v64i8: 2296; AVX1: # %bb.0: 2297; AVX1-NEXT: vmovdqa %ymm2, %ymm10 2298; AVX1-NEXT: vmovdqa %ymm0, %ymm2 2299; AVX1-NEXT: movq %rdi, %rax 2300; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2301; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2302; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4 2303; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] 2304; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm4 2305; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero,xmm10[4],zero,xmm10[5],zero,xmm10[6],zero,xmm10[7],zero 2306; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 2307; AVX1-NEXT: vpmullw %xmm5, %xmm6, %xmm12 2308; AVX1-NEXT: vpand %xmm7, %xmm12, %xmm5 2309; AVX1-NEXT: vpackuswb %xmm4, %xmm5, %xmm0 2310; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2311; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm13 2312; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm13[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2313; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm14 2314; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm14[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2315; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4 2316; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5 2317; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero,xmm13[4],zero,xmm13[5],zero,xmm13[6],zero,xmm13[7],zero 2318; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero,xmm14[4],zero,xmm14[5],zero,xmm14[6],zero,xmm14[7],zero 2319; AVX1-NEXT: vpmullw %xmm4, %xmm6, %xmm15 2320; AVX1-NEXT: vpand %xmm7, %xmm15, %xmm6 2321; AVX1-NEXT: vpackuswb %xmm5, %xmm6, %xmm0 2322; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2323; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2324; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2325; AVX1-NEXT: vpmullw %xmm5, %xmm6, %xmm5 2326; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 2327; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 2328; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 2329; AVX1-NEXT: vpmullw %xmm6, %xmm4, %xmm0 2330; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm6 2331; AVX1-NEXT: vpackuswb %xmm5, %xmm6, %xmm4 2332; AVX1-NEXT: vmovdqa %xmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2333; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 2334; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2335; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm11 2336; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2337; AVX1-NEXT: vpmullw %xmm6, %xmm8, %xmm8 2338; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero 2339; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm9 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero,xmm11[4],zero,xmm11[5],zero,xmm11[6],zero,xmm11[7],zero 2340; AVX1-NEXT: vpmullw %xmm6, %xmm9, %xmm6 2341; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8 2342; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm7 2343; AVX1-NEXT: vpackuswb %xmm8, %xmm7, %xmm8 2344; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7 2345; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] 2346; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm11[8],xmm7[8],xmm11[9],xmm7[9],xmm11[10],xmm7[10],xmm11[11],xmm7[11],xmm11[12],xmm7[12],xmm11[13],xmm7[13],xmm11[14],xmm7[14],xmm11[15],xmm7[15] 2347; AVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4 2348; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 2349; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm5 2350; AVX1-NEXT: vpackuswb %xmm4, %xmm5, %xmm4 2351; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] 2352; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] 2353; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1 2354; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 2355; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 2356; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2357; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm7[8],xmm13[9],xmm7[9],xmm13[10],xmm7[10],xmm13[11],xmm7[11],xmm13[12],xmm7[12],xmm13[13],xmm7[13],xmm13[14],xmm7[14],xmm13[15],xmm7[15] 2358; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm14[8],xmm7[8],xmm14[9],xmm7[9],xmm14[10],xmm7[10],xmm14[11],xmm7[11],xmm14[12],xmm7[12],xmm14[13],xmm7[13],xmm14[14],xmm7[14],xmm14[15],xmm7[15] 2359; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm1 2360; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 2361; AVX1-NEXT: vpsrlw $8, %xmm15, %xmm3 2362; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1 2363; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm10[8],xmm7[8],xmm10[9],xmm7[9],xmm10[10],xmm7[10],xmm10[11],xmm7[11],xmm10[12],xmm7[12],xmm10[13],xmm7[13],xmm10[14],xmm7[14],xmm10[15],xmm7[15] 2364; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] 2365; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 2366; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 2367; AVX1-NEXT: vpsrlw $8, %xmm12, %xmm3 2368; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2 2369; AVX1-NEXT: vpcmpeqb %xmm7, %xmm4, %xmm3 2370; AVX1-NEXT: vpcmpeqb %xmm7, %xmm0, %xmm0 2371; AVX1-NEXT: vpcmpeqb %xmm7, %xmm1, %xmm1 2372; AVX1-NEXT: vpcmpeqb %xmm7, %xmm2, %xmm4 2373; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 2374; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3 2375; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm2 2376; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1 2377; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm0 2378; AVX1-NEXT: vmovdqa %xmm8, 48(%rsi) 2379; AVX1-NEXT: 
vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2380; AVX1-NEXT: vmovaps %xmm4, 32(%rsi) 2381; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2382; AVX1-NEXT: vmovaps %xmm4, 16(%rsi) 2383; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2384; AVX1-NEXT: vmovaps %xmm4, (%rsi) 2385; AVX1-NEXT: vpmovsxbd %xmm3, %xmm4 2386; AVX1-NEXT: vmovdqa %xmm4, 192(%rdi) 2387; AVX1-NEXT: vpmovsxbd %xmm2, %xmm4 2388; AVX1-NEXT: vmovdqa %xmm4, 128(%rdi) 2389; AVX1-NEXT: vpmovsxbd %xmm1, %xmm4 2390; AVX1-NEXT: vmovdqa %xmm4, 64(%rdi) 2391; AVX1-NEXT: vpmovsxbd %xmm0, %xmm4 2392; AVX1-NEXT: vmovdqa %xmm4, (%rdi) 2393; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] 2394; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4 2395; AVX1-NEXT: vmovdqa %xmm4, 224(%rdi) 2396; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[3,3,3,3] 2397; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4 2398; AVX1-NEXT: vmovdqa %xmm4, 240(%rdi) 2399; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] 2400; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 2401; AVX1-NEXT: vmovdqa %xmm3, 208(%rdi) 2402; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] 2403; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 2404; AVX1-NEXT: vmovdqa %xmm3, 160(%rdi) 2405; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[3,3,3,3] 2406; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 2407; AVX1-NEXT: vmovdqa %xmm3, 176(%rdi) 2408; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] 2409; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 2410; AVX1-NEXT: vmovdqa %xmm2, 144(%rdi) 2411; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] 2412; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 2413; AVX1-NEXT: vmovdqa %xmm2, 96(%rdi) 2414; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] 2415; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 2416; AVX1-NEXT: vmovdqa %xmm2, 112(%rdi) 2417; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] 2418; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 2419; AVX1-NEXT: vmovdqa %xmm1, 80(%rdi) 2420; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 2421; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 2422; AVX1-NEXT: vmovdqa %xmm1, 32(%rdi) 2423; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] 2424; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 2425; AVX1-NEXT: vmovdqa %xmm1, 48(%rdi) 2426; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 2427; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 2428; AVX1-NEXT: vmovdqa %xmm0, 16(%rdi) 2429; AVX1-NEXT: vzeroupper 2430; AVX1-NEXT: retq 2431; 2432; AVX2-LABEL: umulo_v64i8: 2433; AVX2: # %bb.0: 2434; AVX2-NEXT: movq %rdi, %rax 2435; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 2436; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 2437; AVX2-NEXT: vpmullw %ymm4, %ymm5, %ymm4 2438; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 2439; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 2440; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 2441; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 2442; AVX2-NEXT: vpmullw %ymm6, %ymm7, %ymm6 2443; AVX2-NEXT: vpand %ymm5, %ymm6, %ymm6 2444; AVX2-NEXT: vpackuswb %ymm4, %ymm6, %ymm9 2445; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 2446; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm7 = 
ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 2447; AVX2-NEXT: vpmullw %ymm6, %ymm7, %ymm6 2448; AVX2-NEXT: vpand %ymm5, %ymm6, %ymm6 2449; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 2450; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm8 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 2451; AVX2-NEXT: vpmullw %ymm7, %ymm8, %ymm7 2452; AVX2-NEXT: vpand %ymm5, %ymm7, %ymm5 2453; AVX2-NEXT: vpackuswb %ymm6, %ymm5, %ymm10 2454; AVX2-NEXT: vpxor %xmm6, %xmm6, %xmm6 2455; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm3[8],ymm6[8],ymm3[9],ymm6[9],ymm3[10],ymm6[10],ymm3[11],ymm6[11],ymm3[12],ymm6[12],ymm3[13],ymm6[13],ymm3[14],ymm6[14],ymm3[15],ymm6[15],ymm3[24],ymm6[24],ymm3[25],ymm6[25],ymm3[26],ymm6[26],ymm3[27],ymm6[27],ymm3[28],ymm6[28],ymm3[29],ymm6[29],ymm3[30],ymm6[30],ymm3[31],ymm6[31] 2456; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm1[8],ymm6[8],ymm1[9],ymm6[9],ymm1[10],ymm6[10],ymm1[11],ymm6[11],ymm1[12],ymm6[12],ymm1[13],ymm6[13],ymm1[14],ymm6[14],ymm1[15],ymm6[15],ymm1[24],ymm6[24],ymm1[25],ymm6[25],ymm1[26],ymm6[26],ymm1[27],ymm6[27],ymm1[28],ymm6[28],ymm1[29],ymm6[29],ymm1[30],ymm6[30],ymm1[31],ymm6[31] 2457; AVX2-NEXT: vpmullw %ymm7, %ymm8, %ymm7 2458; AVX2-NEXT: vpsrlw $8, %ymm7, %ymm7 2459; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm6[0],ymm3[1],ymm6[1],ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[4],ymm6[4],ymm3[5],ymm6[5],ymm3[6],ymm6[6],ymm3[7],ymm6[7],ymm3[16],ymm6[16],ymm3[17],ymm6[17],ymm3[18],ymm6[18],ymm3[19],ymm6[19],ymm3[20],ymm6[20],ymm3[21],ymm6[21],ymm3[22],ymm6[22],ymm3[23],ymm6[23] 2460; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[1],ymm6[1],ymm1[2],ymm6[2],ymm1[3],ymm6[3],ymm1[4],ymm6[4],ymm1[5],ymm6[5],ymm1[6],ymm6[6],ymm1[7],ymm6[7],ymm1[16],ymm6[16],ymm1[17],ymm6[17],ymm1[18],ymm6[18],ymm1[19],ymm6[19],ymm1[20],ymm6[20],ymm1[21],ymm6[21],ymm1[22],ymm6[22],ymm1[23],ymm6[23] 2461; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1 2462; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 2463; AVX2-NEXT: vpackuswb %ymm7, %ymm1, %ymm1 2464; AVX2-NEXT: vpcmpeqb %ymm6, %ymm1, %ymm1 2465; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 2466; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 2467; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm2[8],ymm6[8],ymm2[9],ymm6[9],ymm2[10],ymm6[10],ymm2[11],ymm6[11],ymm2[12],ymm6[12],ymm2[13],ymm6[13],ymm2[14],ymm6[14],ymm2[15],ymm6[15],ymm2[24],ymm6[24],ymm2[25],ymm6[25],ymm2[26],ymm6[26],ymm2[27],ymm6[27],ymm2[28],ymm6[28],ymm2[29],ymm6[29],ymm2[30],ymm6[30],ymm2[31],ymm6[31] 2468; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm0[8],ymm6[8],ymm0[9],ymm6[9],ymm0[10],ymm6[10],ymm0[11],ymm6[11],ymm0[12],ymm6[12],ymm0[13],ymm6[13],ymm0[14],ymm6[14],ymm0[15],ymm6[15],ymm0[24],ymm6[24],ymm0[25],ymm6[25],ymm0[26],ymm6[26],ymm0[27],ymm6[27],ymm0[28],ymm6[28],ymm0[29],ymm6[29],ymm0[30],ymm6[30],ymm0[31],ymm6[31] 2469; AVX2-NEXT: vpmullw %ymm7, %ymm8, %ymm7 2470; AVX2-NEXT: vpsrlw $8, %ymm7, %ymm7 2471; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[4],ymm6[4],ymm2[5],ymm6[5],ymm2[6],ymm6[6],ymm2[7],ymm6[7],ymm2[16],ymm6[16],ymm2[17],ymm6[17],ymm2[18],ymm6[18],ymm2[19],ymm6[19],ymm2[20],ymm6[20],ymm2[21],ymm6[21],ymm2[22],ymm6[22],ymm2[23],ymm6[23] 2472; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = 
ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[4],ymm6[4],ymm0[5],ymm6[5],ymm0[6],ymm6[6],ymm0[7],ymm6[7],ymm0[16],ymm6[16],ymm0[17],ymm6[17],ymm0[18],ymm6[18],ymm0[19],ymm6[19],ymm0[20],ymm6[20],ymm0[21],ymm6[21],ymm0[22],ymm6[22],ymm0[23],ymm6[23] 2473; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0 2474; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 2475; AVX2-NEXT: vpackuswb %ymm7, %ymm0, %ymm0 2476; AVX2-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm0 2477; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm0 2478; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] 2479; AVX2-NEXT: vpmovsxbd %xmm2, %ymm2 2480; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 2481; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] 2482; AVX2-NEXT: vpmovsxbd %xmm6, %ymm6 2483; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,3,2,3] 2484; AVX2-NEXT: vpmovsxbd %xmm7, %ymm7 2485; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 2486; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] 2487; AVX2-NEXT: vpmovsxbd %xmm5, %ymm5 2488; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 2489; AVX2-NEXT: vpmovsxbd %xmm3, %ymm3 2490; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 2491; AVX2-NEXT: vpmovsxbd %xmm4, %ymm4 2492; AVX2-NEXT: vmovdqa %ymm10, 32(%rsi) 2493; AVX2-NEXT: vmovdqa %ymm9, (%rsi) 2494; AVX2-NEXT: vmovdqa %ymm4, 192(%rdi) 2495; AVX2-NEXT: vmovdqa %ymm1, 128(%rdi) 2496; AVX2-NEXT: vmovdqa %ymm3, 64(%rdi) 2497; AVX2-NEXT: vmovdqa %ymm0, (%rdi) 2498; AVX2-NEXT: vmovdqa %ymm5, 224(%rdi) 2499; AVX2-NEXT: vmovdqa %ymm7, 160(%rdi) 2500; AVX2-NEXT: vmovdqa %ymm6, 96(%rdi) 2501; AVX2-NEXT: vmovdqa %ymm2, 32(%rdi) 2502; AVX2-NEXT: vzeroupper 2503; AVX2-NEXT: retq 2504; 2505; AVX512-LABEL: umulo_v64i8: 2506; AVX512: # %bb.0: 2507; AVX512-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] 2508; AVX512-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] 2509; AVX512-NEXT: vpmullw %zmm2, %zmm3, %zmm2 2510; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 2511; AVX512-NEXT: vpandq %zmm3, %zmm2, %zmm2 2512; AVX512-NEXT: vpunpcklbw {{.*#+}} zmm4 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] 2513; AVX512-NEXT: vpunpcklbw {{.*#+}} zmm5 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] 2514; AVX512-NEXT: vpmullw %zmm4, %zmm5, %zmm4 2515; AVX512-NEXT: vpandq %zmm3, %zmm4, %zmm3 2516; AVX512-NEXT: vpackuswb %zmm2, %zmm3, %zmm4 2517; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 2518; AVX512-NEXT: vpunpckhbw {{.*#+}} zmm3 = 
zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63] 2519; AVX512-NEXT: vpunpckhbw {{.*#+}} zmm5 = zmm0[8],zmm2[8],zmm0[9],zmm2[9],zmm0[10],zmm2[10],zmm0[11],zmm2[11],zmm0[12],zmm2[12],zmm0[13],zmm2[13],zmm0[14],zmm2[14],zmm0[15],zmm2[15],zmm0[24],zmm2[24],zmm0[25],zmm2[25],zmm0[26],zmm2[26],zmm0[27],zmm2[27],zmm0[28],zmm2[28],zmm0[29],zmm2[29],zmm0[30],zmm2[30],zmm0[31],zmm2[31],zmm0[40],zmm2[40],zmm0[41],zmm2[41],zmm0[42],zmm2[42],zmm0[43],zmm2[43],zmm0[44],zmm2[44],zmm0[45],zmm2[45],zmm0[46],zmm2[46],zmm0[47],zmm2[47],zmm0[56],zmm2[56],zmm0[57],zmm2[57],zmm0[58],zmm2[58],zmm0[59],zmm2[59],zmm0[60],zmm2[60],zmm0[61],zmm2[61],zmm0[62],zmm2[62],zmm0[63],zmm2[63] 2520; AVX512-NEXT: vpmullw %zmm3, %zmm5, %zmm3 2521; AVX512-NEXT: vpsrlw $8, %zmm3, %zmm3 2522; AVX512-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55] 2523; AVX512-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm2[0],zmm0[1],zmm2[1],zmm0[2],zmm2[2],zmm0[3],zmm2[3],zmm0[4],zmm2[4],zmm0[5],zmm2[5],zmm0[6],zmm2[6],zmm0[7],zmm2[7],zmm0[16],zmm2[16],zmm0[17],zmm2[17],zmm0[18],zmm2[18],zmm0[19],zmm2[19],zmm0[20],zmm2[20],zmm0[21],zmm2[21],zmm0[22],zmm2[22],zmm0[23],zmm2[23],zmm0[32],zmm2[32],zmm0[33],zmm2[33],zmm0[34],zmm2[34],zmm0[35],zmm2[35],zmm0[36],zmm2[36],zmm0[37],zmm2[37],zmm0[38],zmm2[38],zmm0[39],zmm2[39],zmm0[48],zmm2[48],zmm0[49],zmm2[49],zmm0[50],zmm2[50],zmm0[51],zmm2[51],zmm0[52],zmm2[52],zmm0[53],zmm2[53],zmm0[54],zmm2[54],zmm0[55],zmm2[55] 2524; AVX512-NEXT: vpmullw %zmm1, %zmm0, %zmm0 2525; AVX512-NEXT: vpsrlw $8, %zmm0, %zmm0 2526; AVX512-NEXT: vpackuswb %zmm3, %zmm0, %zmm0 2527; AVX512-NEXT: vptestmb %zmm0, %zmm0, %k1 2528; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} 2529; AVX512-NEXT: kshiftrd $16, %k1, %k2 2530; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} 2531; AVX512-NEXT: kshiftrq $32, %k1, %k1 2532; AVX512-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} 2533; AVX512-NEXT: kshiftrd $16, %k1, %k1 2534; AVX512-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} 2535; AVX512-NEXT: vmovdqa64 %zmm4, (%rdi) 2536; AVX512-NEXT: retq 2537 %t = call {<64 x i8>, <64 x i1>} @llvm.umul.with.overflow.v64i8(<64 x i8> %a0, <64 x i8> %a1) 2538 %val = extractvalue {<64 x i8>, <64 x i1>} %t, 0 2539 %obit = extractvalue {<64 x i8>, <64 x i1>} %t, 1 2540 %res = sext <64 x i1> %obit to <64 x i32> 2541 store <64 x i8> %val, <64 x i8>* %p2 2542 ret <64 x i32> %res 2543} 2544 2545define <8 x i32> @umulo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) 
nounwind { 2546; SSE2-LABEL: umulo_v8i16: 2547; SSE2: # %bb.0: 2548; SSE2-NEXT: movdqa %xmm0, %xmm2 2549; SSE2-NEXT: pmullw %xmm1, %xmm2 2550; SSE2-NEXT: pmulhuw %xmm1, %xmm0 2551; SSE2-NEXT: pxor %xmm3, %xmm3 2552; SSE2-NEXT: pcmpeqw %xmm0, %xmm3 2553; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 2554; SSE2-NEXT: pxor %xmm3, %xmm1 2555; SSE2-NEXT: movdqa %xmm1, %xmm0 2556; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 2557; SSE2-NEXT: pslld $31, %xmm0 2558; SSE2-NEXT: psrad $31, %xmm0 2559; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 2560; SSE2-NEXT: pslld $31, %xmm1 2561; SSE2-NEXT: psrad $31, %xmm1 2562; SSE2-NEXT: movdqa %xmm2, (%rdi) 2563; SSE2-NEXT: retq 2564; 2565; SSSE3-LABEL: umulo_v8i16: 2566; SSSE3: # %bb.0: 2567; SSSE3-NEXT: movdqa %xmm0, %xmm2 2568; SSSE3-NEXT: pmullw %xmm1, %xmm2 2569; SSSE3-NEXT: pmulhuw %xmm1, %xmm0 2570; SSSE3-NEXT: pxor %xmm3, %xmm3 2571; SSSE3-NEXT: pcmpeqw %xmm0, %xmm3 2572; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 2573; SSSE3-NEXT: pxor %xmm3, %xmm1 2574; SSSE3-NEXT: movdqa %xmm1, %xmm0 2575; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 2576; SSSE3-NEXT: pslld $31, %xmm0 2577; SSSE3-NEXT: psrad $31, %xmm0 2578; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 2579; SSSE3-NEXT: pslld $31, %xmm1 2580; SSSE3-NEXT: psrad $31, %xmm1 2581; SSSE3-NEXT: movdqa %xmm2, (%rdi) 2582; SSSE3-NEXT: retq 2583; 2584; SSE41-LABEL: umulo_v8i16: 2585; SSE41: # %bb.0: 2586; SSE41-NEXT: movdqa %xmm0, %xmm2 2587; SSE41-NEXT: pmullw %xmm1, %xmm2 2588; SSE41-NEXT: pmulhuw %xmm1, %xmm0 2589; SSE41-NEXT: pxor %xmm3, %xmm3 2590; SSE41-NEXT: pcmpeqw %xmm0, %xmm3 2591; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 2592; SSE41-NEXT: pxor %xmm3, %xmm1 2593; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 2594; SSE41-NEXT: pslld $31, %xmm0 2595; SSE41-NEXT: psrad $31, %xmm0 2596; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 2597; SSE41-NEXT: pslld $31, %xmm1 2598; SSE41-NEXT: psrad $31, %xmm1 2599; SSE41-NEXT: movdqa %xmm2, (%rdi) 2600; SSE41-NEXT: retq 2601; 2602; AVX1-LABEL: umulo_v8i16: 2603; AVX1: # %bb.0: 2604; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm2 2605; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 2606; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 2607; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 2608; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 2609; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 2610; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 2611; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 2612; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 2613; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 2614; AVX1-NEXT: vmovdqa %xmm2, (%rdi) 2615; AVX1-NEXT: retq 2616; 2617; AVX2-LABEL: umulo_v8i16: 2618; AVX2: # %bb.0: 2619; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm2 2620; AVX2-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 2621; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 2622; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 2623; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 2624; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 2625; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 2626; AVX2-NEXT: vmovdqa %xmm2, (%rdi) 2627; AVX2-NEXT: retq 2628; 2629; AVX512-LABEL: umulo_v8i16: 2630; AVX512: # %bb.0: 2631; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm2 2632; AVX512-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 2633; AVX512-NEXT: vptestmw %xmm0, %xmm0, %k1 2634; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 2635; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} 2636; AVX512-NEXT: vmovdqa %xmm2, (%rdi) 2637; AVX512-NEXT: retq 2638 %t = call {<8 x i16>, <8 x i1>} @llvm.umul.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1) 2639 %val = 
extractvalue {<8 x i16>, <8 x i1>} %t, 0 2640 %obit = extractvalue {<8 x i16>, <8 x i1>} %t, 1 2641 %res = sext <8 x i1> %obit to <8 x i32> 2642 store <8 x i16> %val, <8 x i16>* %p2 2643 ret <8 x i32> %res 2644} 2645 2646define <2 x i32> @umulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind { 2647; SSE2-LABEL: umulo_v2i64: 2648; SSE2: # %bb.0: 2649; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] 2650; SSE2-NEXT: movq %xmm2, %r8 2651; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] 2652; SSE2-NEXT: movq %xmm2, %r10 2653; SSE2-NEXT: movq %xmm0, %rax 2654; SSE2-NEXT: movq %xmm1, %rdx 2655; SSE2-NEXT: xorl %ecx, %ecx 2656; SSE2-NEXT: mulq %rdx 2657; SSE2-NEXT: movq $-1, %r9 2658; SSE2-NEXT: movl $0, %esi 2659; SSE2-NEXT: cmovoq %r9, %rsi 2660; SSE2-NEXT: movq %rax, %xmm1 2661; SSE2-NEXT: movq %r8, %rax 2662; SSE2-NEXT: mulq %r10 2663; SSE2-NEXT: movq %rax, %xmm0 2664; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] 2665; SSE2-NEXT: movq %rsi, %xmm0 2666; SSE2-NEXT: cmovoq %r9, %rcx 2667; SSE2-NEXT: movq %rcx, %xmm2 2668; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 2669; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2670; SSE2-NEXT: movdqa %xmm1, (%rdi) 2671; SSE2-NEXT: retq 2672; 2673; SSSE3-LABEL: umulo_v2i64: 2674; SSSE3: # %bb.0: 2675; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] 2676; SSSE3-NEXT: movq %xmm2, %r8 2677; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] 2678; SSSE3-NEXT: movq %xmm2, %r10 2679; SSSE3-NEXT: movq %xmm0, %rax 2680; SSSE3-NEXT: movq %xmm1, %rdx 2681; SSSE3-NEXT: xorl %ecx, %ecx 2682; SSSE3-NEXT: mulq %rdx 2683; SSSE3-NEXT: movq $-1, %r9 2684; SSSE3-NEXT: movl $0, %esi 2685; SSSE3-NEXT: cmovoq %r9, %rsi 2686; SSSE3-NEXT: movq %rax, %xmm1 2687; SSSE3-NEXT: movq %r8, %rax 2688; SSSE3-NEXT: mulq %r10 2689; SSSE3-NEXT: movq %rax, %xmm0 2690; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] 2691; SSSE3-NEXT: movq %rsi, %xmm0 2692; SSSE3-NEXT: cmovoq %r9, %rcx 2693; SSSE3-NEXT: movq %rcx, %xmm2 2694; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 2695; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2696; SSSE3-NEXT: movdqa %xmm1, (%rdi) 2697; SSSE3-NEXT: retq 2698; 2699; SSE41-LABEL: umulo_v2i64: 2700; SSE41: # %bb.0: 2701; SSE41-NEXT: movq %xmm0, %r10 2702; SSE41-NEXT: movq %xmm1, %r8 2703; SSE41-NEXT: pextrq $1, %xmm0, %rax 2704; SSE41-NEXT: pextrq $1, %xmm1, %rdx 2705; SSE41-NEXT: xorl %esi, %esi 2706; SSE41-NEXT: mulq %rdx 2707; SSE41-NEXT: movq $-1, %r9 2708; SSE41-NEXT: movl $0, %ecx 2709; SSE41-NEXT: cmovoq %r9, %rcx 2710; SSE41-NEXT: movq %rax, %xmm0 2711; SSE41-NEXT: movq %r10, %rax 2712; SSE41-NEXT: mulq %r8 2713; SSE41-NEXT: movq %rax, %xmm1 2714; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] 2715; SSE41-NEXT: movq %rcx, %xmm0 2716; SSE41-NEXT: cmovoq %r9, %rsi 2717; SSE41-NEXT: movq %rsi, %xmm2 2718; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] 2719; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 2720; SSE41-NEXT: movdqa %xmm1, (%rdi) 2721; SSE41-NEXT: retq 2722; 2723; AVX1-LABEL: umulo_v2i64: 2724; AVX1: # %bb.0: 2725; AVX1-NEXT: vmovq %xmm0, %r10 2726; AVX1-NEXT: vmovq %xmm1, %r8 2727; AVX1-NEXT: vpextrq $1, %xmm0, %rax 2728; AVX1-NEXT: vpextrq $1, %xmm1, %rdx 2729; AVX1-NEXT: xorl %esi, %esi 2730; AVX1-NEXT: mulq %rdx 2731; AVX1-NEXT: movq $-1, %r9 2732; AVX1-NEXT: movl $0, %ecx 2733; AVX1-NEXT: cmovoq %r9, %rcx 2734; AVX1-NEXT: vmovq %rax, %xmm0 2735; AVX1-NEXT: movq %r10, %rax 2736; AVX1-NEXT: mulq %r8 2737; AVX1-NEXT: vmovq %rax, %xmm1 2738; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = 
xmm1[0],xmm0[0] 2739; AVX1-NEXT: vmovq %rcx, %xmm0 2740; AVX1-NEXT: cmovoq %r9, %rsi 2741; AVX1-NEXT: vmovq %rsi, %xmm2 2742; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] 2743; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2744; AVX1-NEXT: vmovdqa %xmm1, (%rdi) 2745; AVX1-NEXT: retq 2746; 2747; AVX2-LABEL: umulo_v2i64: 2748; AVX2: # %bb.0: 2749; AVX2-NEXT: vmovq %xmm0, %r10 2750; AVX2-NEXT: vmovq %xmm1, %r8 2751; AVX2-NEXT: vpextrq $1, %xmm0, %rax 2752; AVX2-NEXT: vpextrq $1, %xmm1, %rdx 2753; AVX2-NEXT: xorl %esi, %esi 2754; AVX2-NEXT: mulq %rdx 2755; AVX2-NEXT: movq $-1, %r9 2756; AVX2-NEXT: movl $0, %ecx 2757; AVX2-NEXT: cmovoq %r9, %rcx 2758; AVX2-NEXT: vmovq %rax, %xmm0 2759; AVX2-NEXT: movq %r10, %rax 2760; AVX2-NEXT: mulq %r8 2761; AVX2-NEXT: vmovq %rax, %xmm1 2762; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] 2763; AVX2-NEXT: vmovq %rcx, %xmm0 2764; AVX2-NEXT: cmovoq %r9, %rsi 2765; AVX2-NEXT: vmovq %rsi, %xmm2 2766; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] 2767; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2768; AVX2-NEXT: vmovdqa %xmm1, (%rdi) 2769; AVX2-NEXT: retq 2770; 2771; AVX512-LABEL: umulo_v2i64: 2772; AVX512: # %bb.0: 2773; AVX512-NEXT: vmovq %xmm0, %rcx 2774; AVX512-NEXT: vmovq %xmm1, %rsi 2775; AVX512-NEXT: vpextrq $1, %xmm0, %rax 2776; AVX512-NEXT: vpextrq $1, %xmm1, %rdx 2777; AVX512-NEXT: mulq %rdx 2778; AVX512-NEXT: seto %r8b 2779; AVX512-NEXT: vmovq %rax, %xmm0 2780; AVX512-NEXT: movq %rcx, %rax 2781; AVX512-NEXT: mulq %rsi 2782; AVX512-NEXT: vmovq %rax, %xmm1 2783; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] 2784; AVX512-NEXT: seto %al 2785; AVX512-NEXT: movw $-3, %cx 2786; AVX512-NEXT: kmovd %ecx, %k0 2787; AVX512-NEXT: kmovd %eax, %k1 2788; AVX512-NEXT: kandw %k0, %k1, %k0 2789; AVX512-NEXT: kmovd %r8d, %k1 2790; AVX512-NEXT: kshiftlw $15, %k1, %k1 2791; AVX512-NEXT: kshiftrw $14, %k1, %k1 2792; AVX512-NEXT: korw %k1, %k0, %k1 2793; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 2794; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} 2795; AVX512-NEXT: vmovdqa %xmm1, (%rdi) 2796; AVX512-NEXT: retq 2797 %t = call {<2 x i64>, <2 x i1>} @llvm.umul.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1) 2798 %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0 2799 %obit = extractvalue {<2 x i64>, <2 x i1>} %t, 1 2800 %res = sext <2 x i1> %obit to <2 x i32> 2801 store <2 x i64> %val, <2 x i64>* %p2 2802 ret <2 x i32> %res 2803} 2804 2805define <4 x i32> @umulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) nounwind { 2806; SSE2-LABEL: umulo_v4i24: 2807; SSE2: # %bb.0: 2808; SSE2-NEXT: movdqa %xmm0, %xmm2 2809; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] 2810; SSE2-NEXT: pand %xmm0, %xmm1 2811; SSE2-NEXT: pand %xmm0, %xmm2 2812; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] 2813; SSE2-NEXT: pmuludq %xmm1, %xmm2 2814; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] 2815; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 2816; SSE2-NEXT: pmuludq %xmm0, %xmm1 2817; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] 2818; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] 2819; SSE2-NEXT: pxor %xmm4, %xmm4 2820; SSE2-NEXT: pcmpeqd %xmm4, %xmm3 2821; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 2822; SSE2-NEXT: pxor %xmm5, %xmm3 2823; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 2824; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,2,2,3] 2825; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] 2826; SSE2-NEXT: psrld $24, %xmm0 2827; SSE2-NEXT: pcmpeqd %xmm4, %xmm0 2828; 
SSE2-NEXT: pxor %xmm5, %xmm0 2829; SSE2-NEXT: por %xmm3, %xmm0 2830; SSE2-NEXT: movd %xmm2, %eax 2831; SSE2-NEXT: movw %ax, (%rdi) 2832; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] 2833; SSE2-NEXT: movd %xmm2, %ecx 2834; SSE2-NEXT: movw %cx, 6(%rdi) 2835; SSE2-NEXT: movd %xmm1, %edx 2836; SSE2-NEXT: movw %dx, 3(%rdi) 2837; SSE2-NEXT: shrl $16, %eax 2838; SSE2-NEXT: movb %al, 2(%rdi) 2839; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] 2840; SSE2-NEXT: movd %xmm1, %eax 2841; SSE2-NEXT: movw %ax, 9(%rdi) 2842; SSE2-NEXT: shrl $16, %ecx 2843; SSE2-NEXT: movb %cl, 8(%rdi) 2844; SSE2-NEXT: shrl $16, %edx 2845; SSE2-NEXT: movb %dl, 5(%rdi) 2846; SSE2-NEXT: shrl $16, %eax 2847; SSE2-NEXT: movb %al, 11(%rdi) 2848; SSE2-NEXT: retq 2849; 2850; SSSE3-LABEL: umulo_v4i24: 2851; SSSE3: # %bb.0: 2852; SSSE3-NEXT: movdqa %xmm0, %xmm2 2853; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] 2854; SSSE3-NEXT: pand %xmm0, %xmm1 2855; SSSE3-NEXT: pand %xmm0, %xmm2 2856; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] 2857; SSSE3-NEXT: pmuludq %xmm1, %xmm2 2858; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] 2859; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 2860; SSSE3-NEXT: pmuludq %xmm0, %xmm1 2861; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] 2862; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] 2863; SSSE3-NEXT: pxor %xmm4, %xmm4 2864; SSSE3-NEXT: pcmpeqd %xmm4, %xmm3 2865; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5 2866; SSSE3-NEXT: pxor %xmm5, %xmm3 2867; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 2868; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,2,2,3] 2869; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] 2870; SSSE3-NEXT: psrld $24, %xmm0 2871; SSSE3-NEXT: pcmpeqd %xmm4, %xmm0 2872; SSSE3-NEXT: pxor %xmm5, %xmm0 2873; SSSE3-NEXT: por %xmm3, %xmm0 2874; SSSE3-NEXT: movd %xmm2, %eax 2875; SSSE3-NEXT: movw %ax, (%rdi) 2876; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] 2877; SSSE3-NEXT: movd %xmm2, %ecx 2878; SSSE3-NEXT: movw %cx, 6(%rdi) 2879; SSSE3-NEXT: movd %xmm1, %edx 2880; SSSE3-NEXT: movw %dx, 3(%rdi) 2881; SSSE3-NEXT: shrl $16, %eax 2882; SSSE3-NEXT: movb %al, 2(%rdi) 2883; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] 2884; SSSE3-NEXT: movd %xmm1, %eax 2885; SSSE3-NEXT: movw %ax, 9(%rdi) 2886; SSSE3-NEXT: shrl $16, %ecx 2887; SSSE3-NEXT: movb %cl, 8(%rdi) 2888; SSSE3-NEXT: shrl $16, %edx 2889; SSSE3-NEXT: movb %dl, 5(%rdi) 2890; SSSE3-NEXT: shrl $16, %eax 2891; SSSE3-NEXT: movb %al, 11(%rdi) 2892; SSSE3-NEXT: retq 2893; 2894; SSE41-LABEL: umulo_v4i24: 2895; SSE41: # %bb.0: 2896; SSE41-NEXT: movdqa %xmm0, %xmm2 2897; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] 2898; SSE41-NEXT: pand %xmm0, %xmm2 2899; SSE41-NEXT: pand %xmm0, %xmm1 2900; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] 2901; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] 2902; SSE41-NEXT: pmuludq %xmm0, %xmm3 2903; SSE41-NEXT: movdqa %xmm2, %xmm0 2904; SSE41-NEXT: pmuludq %xmm1, %xmm0 2905; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] 2906; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] 2907; SSE41-NEXT: pxor %xmm0, %xmm0 2908; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 2909; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 2910; SSE41-NEXT: pxor %xmm3, %xmm4 2911; SSE41-NEXT: pmulld %xmm2, %xmm1 2912; SSE41-NEXT: pextrd $3, %xmm1, %eax 2913; SSE41-NEXT: pextrd $2, %xmm1, %ecx 2914; SSE41-NEXT: pextrd $1, %xmm1, %edx 2915; SSE41-NEXT: movd %xmm1, %esi 2916; SSE41-NEXT: 
psrld $24, %xmm1 2917; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 2918; SSE41-NEXT: pxor %xmm3, %xmm0 2919; SSE41-NEXT: por %xmm4, %xmm0 2920; SSE41-NEXT: movw %ax, 9(%rdi) 2921; SSE41-NEXT: movw %cx, 6(%rdi) 2922; SSE41-NEXT: movw %dx, 3(%rdi) 2923; SSE41-NEXT: movw %si, (%rdi) 2924; SSE41-NEXT: shrl $16, %eax 2925; SSE41-NEXT: movb %al, 11(%rdi) 2926; SSE41-NEXT: shrl $16, %ecx 2927; SSE41-NEXT: movb %cl, 8(%rdi) 2928; SSE41-NEXT: shrl $16, %edx 2929; SSE41-NEXT: movb %dl, 5(%rdi) 2930; SSE41-NEXT: shrl $16, %esi 2931; SSE41-NEXT: movb %sil, 2(%rdi) 2932; SSE41-NEXT: retq 2933; 2934; AVX1-LABEL: umulo_v4i24: 2935; AVX1: # %bb.0: 2936; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215] 2937; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0 2938; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm1 2939; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[1,1,3,3] 2940; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[1,1,3,3] 2941; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 2942; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm3 2943; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 2944; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] 2945; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 2946; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 2947; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 2948; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 2949; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm1 2950; AVX1-NEXT: vpsrld $24, %xmm1, %xmm0 2951; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 2952; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 2953; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 2954; AVX1-NEXT: vpextrd $3, %xmm1, %eax 2955; AVX1-NEXT: movw %ax, 9(%rdi) 2956; AVX1-NEXT: vpextrd $2, %xmm1, %ecx 2957; AVX1-NEXT: movw %cx, 6(%rdi) 2958; AVX1-NEXT: vpextrd $1, %xmm1, %edx 2959; AVX1-NEXT: movw %dx, 3(%rdi) 2960; AVX1-NEXT: vmovd %xmm1, %esi 2961; AVX1-NEXT: movw %si, (%rdi) 2962; AVX1-NEXT: shrl $16, %eax 2963; AVX1-NEXT: movb %al, 11(%rdi) 2964; AVX1-NEXT: shrl $16, %ecx 2965; AVX1-NEXT: movb %cl, 8(%rdi) 2966; AVX1-NEXT: shrl $16, %edx 2967; AVX1-NEXT: movb %dl, 5(%rdi) 2968; AVX1-NEXT: shrl $16, %esi 2969; AVX1-NEXT: movb %sil, 2(%rdi) 2970; AVX1-NEXT: retq 2971; 2972; AVX2-LABEL: umulo_v4i24: 2973; AVX2: # %bb.0: 2974; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215] 2975; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 2976; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 2977; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 2978; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 2979; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 2980; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm3 2981; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 2982; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3] 2983; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 2984; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 2985; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 2986; AVX2-NEXT: vpxor %xmm4, %xmm2, %xmm2 2987; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm1 2988; AVX2-NEXT: vpsrld $24, %xmm1, %xmm0 2989; AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 2990; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0 2991; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0 2992; AVX2-NEXT: vpextrd $3, %xmm1, %eax 2993; AVX2-NEXT: movw %ax, 9(%rdi) 2994; AVX2-NEXT: vpextrd $2, %xmm1, %ecx 2995; AVX2-NEXT: movw %cx, 6(%rdi) 2996; AVX2-NEXT: vpextrd $1, %xmm1, %edx 2997; AVX2-NEXT: movw %dx, 3(%rdi) 2998; AVX2-NEXT: vmovd %xmm1, %esi 2999; AVX2-NEXT: movw %si, (%rdi) 3000; AVX2-NEXT: shrl $16, %eax 3001; AVX2-NEXT: movb %al, 11(%rdi) 3002; AVX2-NEXT: shrl $16, %ecx 3003; AVX2-NEXT: movb %cl, 8(%rdi) 3004; AVX2-NEXT: shrl $16, %edx 3005; 
AVX2-NEXT: movb %dl, 5(%rdi) 3006; AVX2-NEXT: shrl $16, %esi 3007; AVX2-NEXT: movb %sil, 2(%rdi) 3008; AVX2-NEXT: retq 3009; 3010; AVX512-LABEL: umulo_v4i24: 3011; AVX512: # %bb.0: 3012; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215] 3013; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1 3014; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0 3015; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 3016; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] 3017; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] 3018; AVX512-NEXT: vpmuludq %xmm3, %xmm4, %xmm3 3019; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7] 3020; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4 3021; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1 3022; AVX512-NEXT: vpsrld $24, %xmm1, %xmm0 3023; AVX512-NEXT: vpor %xmm4, %xmm0, %xmm0 3024; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1 3025; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 3026; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} 3027; AVX512-NEXT: vpextrd $3, %xmm1, %eax 3028; AVX512-NEXT: movw %ax, 9(%rdi) 3029; AVX512-NEXT: vpextrd $2, %xmm1, %ecx 3030; AVX512-NEXT: movw %cx, 6(%rdi) 3031; AVX512-NEXT: vpextrd $1, %xmm1, %edx 3032; AVX512-NEXT: movw %dx, 3(%rdi) 3033; AVX512-NEXT: vmovd %xmm1, %esi 3034; AVX512-NEXT: movw %si, (%rdi) 3035; AVX512-NEXT: shrl $16, %eax 3036; AVX512-NEXT: movb %al, 11(%rdi) 3037; AVX512-NEXT: shrl $16, %ecx 3038; AVX512-NEXT: movb %cl, 8(%rdi) 3039; AVX512-NEXT: shrl $16, %edx 3040; AVX512-NEXT: movb %dl, 5(%rdi) 3041; AVX512-NEXT: shrl $16, %esi 3042; AVX512-NEXT: movb %sil, 2(%rdi) 3043; AVX512-NEXT: retq 3044 %t = call {<4 x i24>, <4 x i1>} @llvm.umul.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1) 3045 %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0 3046 %obit = extractvalue {<4 x i24>, <4 x i1>} %t, 1 3047 %res = sext <4 x i1> %obit to <4 x i32> 3048 store <4 x i24> %val, <4 x i24>* %p2 3049 ret <4 x i32> %res 3050} 3051 3052define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind { 3053; SSE2-LABEL: umulo_v4i1: 3054; SSE2: # %bb.0: 3055; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1] 3056; SSE2-NEXT: pand %xmm2, %xmm1 3057; SSE2-NEXT: pand %xmm2, %xmm0 3058; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 3059; SSE2-NEXT: pmuludq %xmm1, %xmm0 3060; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] 3061; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 3062; SSE2-NEXT: pmuludq %xmm2, %xmm1 3063; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] 3064; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 3065; SSE2-NEXT: pxor %xmm2, %xmm2 3066; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 3067; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 3068; SSE2-NEXT: pxor %xmm4, %xmm3 3069; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,2,2,3] 3070; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] 3071; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] 3072; SSE2-NEXT: movdqa %xmm5, %xmm0 3073; SSE2-NEXT: psrld $1, %xmm0 3074; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 3075; SSE2-NEXT: pxor %xmm4, %xmm0 3076; SSE2-NEXT: por %xmm3, %xmm0 3077; SSE2-NEXT: pslld $31, %xmm5 3078; SSE2-NEXT: movmskps %xmm5, %eax 3079; SSE2-NEXT: movb %al, (%rdi) 3080; SSE2-NEXT: retq 3081; 3082; SSSE3-LABEL: umulo_v4i1: 3083; SSSE3: # %bb.0: 3084; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1] 3085; SSSE3-NEXT: pand %xmm2, %xmm1 3086; SSSE3-NEXT: pand %xmm2, %xmm0 3087; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 3088; SSSE3-NEXT: pmuludq %xmm1, %xmm0 3089; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] 3090; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 3091; 
SSSE3-NEXT: pmuludq %xmm2, %xmm1 3092; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] 3093; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 3094; SSSE3-NEXT: pxor %xmm2, %xmm2 3095; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3 3096; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4 3097; SSSE3-NEXT: pxor %xmm4, %xmm3 3098; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,2,2,3] 3099; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] 3100; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] 3101; SSSE3-NEXT: movdqa %xmm5, %xmm0 3102; SSSE3-NEXT: psrld $1, %xmm0 3103; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 3104; SSSE3-NEXT: pxor %xmm4, %xmm0 3105; SSSE3-NEXT: por %xmm3, %xmm0 3106; SSSE3-NEXT: pslld $31, %xmm5 3107; SSSE3-NEXT: movmskps %xmm5, %eax 3108; SSSE3-NEXT: movb %al, (%rdi) 3109; SSSE3-NEXT: retq 3110; 3111; SSE41-LABEL: umulo_v4i1: 3112; SSE41: # %bb.0: 3113; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1] 3114; SSE41-NEXT: pand %xmm2, %xmm0 3115; SSE41-NEXT: pand %xmm2, %xmm1 3116; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 3117; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 3118; SSE41-NEXT: pmuludq %xmm2, %xmm3 3119; SSE41-NEXT: movdqa %xmm0, %xmm2 3120; SSE41-NEXT: pmuludq %xmm1, %xmm2 3121; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] 3122; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] 3123; SSE41-NEXT: pxor %xmm2, %xmm2 3124; SSE41-NEXT: pcmpeqd %xmm2, %xmm4 3125; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 3126; SSE41-NEXT: pxor %xmm3, %xmm4 3127; SSE41-NEXT: pmaddwd %xmm0, %xmm1 3128; SSE41-NEXT: movdqa %xmm1, %xmm0 3129; SSE41-NEXT: psrld $1, %xmm0 3130; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 3131; SSE41-NEXT: pxor %xmm3, %xmm2 3132; SSE41-NEXT: por %xmm4, %xmm2 3133; SSE41-NEXT: pslld $31, %xmm1 3134; SSE41-NEXT: movmskps %xmm1, %eax 3135; SSE41-NEXT: movb %al, (%rdi) 3136; SSE41-NEXT: movdqa %xmm2, %xmm0 3137; SSE41-NEXT: retq 3138; 3139; AVX1-LABEL: umulo_v4i1: 3140; AVX1: # %bb.0: 3141; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1] 3142; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 3143; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 3144; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 3145; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 3146; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 3147; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm3 3148; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 3149; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] 3150; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 3151; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 3152; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 3153; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 3154; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm1 3155; AVX1-NEXT: vpsrld $1, %xmm1, %xmm0 3156; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 3157; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 3158; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 3159; AVX1-NEXT: vpslld $31, %xmm1, %xmm1 3160; AVX1-NEXT: vmovmskps %xmm1, %eax 3161; AVX1-NEXT: movb %al, (%rdi) 3162; AVX1-NEXT: retq 3163; 3164; AVX2-LABEL: umulo_v4i1: 3165; AVX2: # %bb.0: 3166; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1] 3167; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 3168; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 3169; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 3170; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 3171; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 3172; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm3 3173; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 3174; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3] 3175; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 3176; 
AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 3177; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 3178; AVX2-NEXT: vpxor %xmm4, %xmm2, %xmm2 3179; AVX2-NEXT: vpmaddwd %xmm1, %xmm0, %xmm1 3180; AVX2-NEXT: vpsrld $1, %xmm1, %xmm0 3181; AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 3182; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0 3183; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0 3184; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 3185; AVX2-NEXT: vmovmskps %xmm1, %eax 3186; AVX2-NEXT: movb %al, (%rdi) 3187; AVX2-NEXT: retq 3188; 3189; AVX512-LABEL: umulo_v4i1: 3190; AVX512: # %bb.0: 3191; AVX512-NEXT: pushq %rbp 3192; AVX512-NEXT: pushq %rbx 3193; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 3194; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0 3195; AVX512-NEXT: kshiftrw $3, %k0, %k1 3196; AVX512-NEXT: kmovd %k1, %r8d 3197; AVX512-NEXT: andb $1, %r8b 3198; AVX512-NEXT: vpslld $31, %xmm1, %xmm0 3199; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1 3200; AVX512-NEXT: kshiftrw $3, %k1, %k2 3201; AVX512-NEXT: kmovd %k2, %r9d 3202; AVX512-NEXT: andb $1, %r9b 3203; AVX512-NEXT: kshiftrw $2, %k0, %k2 3204; AVX512-NEXT: kmovd %k2, %r10d 3205; AVX512-NEXT: andb $1, %r10b 3206; AVX512-NEXT: kshiftrw $2, %k1, %k2 3207; AVX512-NEXT: kmovd %k2, %r11d 3208; AVX512-NEXT: andb $1, %r11b 3209; AVX512-NEXT: kshiftrw $1, %k0, %k2 3210; AVX512-NEXT: kmovd %k2, %ecx 3211; AVX512-NEXT: andb $1, %cl 3212; AVX512-NEXT: kshiftrw $1, %k1, %k2 3213; AVX512-NEXT: kmovd %k2, %esi 3214; AVX512-NEXT: andb $1, %sil 3215; AVX512-NEXT: kmovd %k0, %eax 3216; AVX512-NEXT: andb $1, %al 3217; AVX512-NEXT: kmovd %k1, %edx 3218; AVX512-NEXT: andb $1, %dl 3219; AVX512-NEXT: # kill: def $al killed $al killed $eax 3220; AVX512-NEXT: mulb %dl 3221; AVX512-NEXT: movl %eax, %edx 3222; AVX512-NEXT: seto %al 3223; AVX512-NEXT: testb $-2, %dl 3224; AVX512-NEXT: setne %bl 3225; AVX512-NEXT: orb %al, %bl 3226; AVX512-NEXT: setne %al 3227; AVX512-NEXT: kmovd %eax, %k1 3228; AVX512-NEXT: movw $-3, %ax 3229; AVX512-NEXT: kmovd %eax, %k0 3230; AVX512-NEXT: kandw %k0, %k1, %k1 3231; AVX512-NEXT: movl %ecx, %eax 3232; AVX512-NEXT: mulb %sil 3233; AVX512-NEXT: movl %eax, %ebp 3234; AVX512-NEXT: seto %al 3235; AVX512-NEXT: testb $-2, %bpl 3236; AVX512-NEXT: setne %bl 3237; AVX512-NEXT: orb %al, %bl 3238; AVX512-NEXT: setne %al 3239; AVX512-NEXT: kmovd %eax, %k2 3240; AVX512-NEXT: kshiftlw $15, %k2, %k2 3241; AVX512-NEXT: kshiftrw $14, %k2, %k2 3242; AVX512-NEXT: korw %k2, %k1, %k2 3243; AVX512-NEXT: movw $-5, %ax 3244; AVX512-NEXT: kmovd %eax, %k1 3245; AVX512-NEXT: kandw %k1, %k2, %k2 3246; AVX512-NEXT: movl %r10d, %eax 3247; AVX512-NEXT: mulb %r11b 3248; AVX512-NEXT: movl %eax, %esi 3249; AVX512-NEXT: seto %al 3250; AVX512-NEXT: testb $-2, %sil 3251; AVX512-NEXT: setne %bl 3252; AVX512-NEXT: orb %al, %bl 3253; AVX512-NEXT: setne %al 3254; AVX512-NEXT: kmovd %eax, %k3 3255; AVX512-NEXT: kshiftlw $2, %k3, %k3 3256; AVX512-NEXT: korw %k3, %k2, %k2 3257; AVX512-NEXT: kshiftlw $13, %k2, %k2 3258; AVX512-NEXT: kshiftrw $13, %k2, %k2 3259; AVX512-NEXT: movl %r8d, %eax 3260; AVX512-NEXT: mulb %r9b 3261; AVX512-NEXT: # kill: def $al killed $al def $eax 3262; AVX512-NEXT: seto %bl 3263; AVX512-NEXT: testb $-2, %al 3264; AVX512-NEXT: setne %cl 3265; AVX512-NEXT: orb %bl, %cl 3266; AVX512-NEXT: setne %cl 3267; AVX512-NEXT: kmovd %ecx, %k3 3268; AVX512-NEXT: kshiftlw $3, %k3, %k3 3269; AVX512-NEXT: korw %k3, %k2, %k2 3270; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 3271; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k2} {z} 3272; AVX512-NEXT: andl $1, %edx 3273; AVX512-NEXT: kmovw %edx, %k2 3274; AVX512-NEXT: kandw %k0, %k2, %k0 
3275; AVX512-NEXT: kmovd %ebp, %k2 3276; AVX512-NEXT: kshiftlw $15, %k2, %k2 3277; AVX512-NEXT: kshiftrw $14, %k2, %k2 3278; AVX512-NEXT: korw %k2, %k0, %k0 3279; AVX512-NEXT: kandw %k1, %k0, %k0 3280; AVX512-NEXT: kmovd %esi, %k1 3281; AVX512-NEXT: kshiftlw $15, %k1, %k1 3282; AVX512-NEXT: kshiftrw $13, %k1, %k1 3283; AVX512-NEXT: korw %k1, %k0, %k0 3284; AVX512-NEXT: movw $-9, %cx 3285; AVX512-NEXT: kmovd %ecx, %k1 3286; AVX512-NEXT: kandw %k1, %k0, %k0 3287; AVX512-NEXT: kmovd %eax, %k1 3288; AVX512-NEXT: kshiftlw $15, %k1, %k1 3289; AVX512-NEXT: kshiftrw $12, %k1, %k1 3290; AVX512-NEXT: korw %k1, %k0, %k0 3291; AVX512-NEXT: kmovd %k0, %eax 3292; AVX512-NEXT: movb %al, (%rdi) 3293; AVX512-NEXT: popq %rbx 3294; AVX512-NEXT: popq %rbp 3295; AVX512-NEXT: retq 3296 %t = call {<4 x i1>, <4 x i1>} @llvm.umul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1) 3297 %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0 3298 %obit = extractvalue {<4 x i1>, <4 x i1>} %t, 1 3299 %res = sext <4 x i1> %obit to <4 x i32> 3300 store <4 x i1> %val, <4 x i1>* %p2 3301 ret <4 x i32> %res 3302} 3303 3304define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) nounwind { 3305; SSE2-LABEL: umulo_v2i128: 3306; SSE2: # %bb.0: 3307; SSE2-NEXT: pushq %rbp 3308; SSE2-NEXT: pushq %r15 3309; SSE2-NEXT: pushq %r14 3310; SSE2-NEXT: pushq %r13 3311; SSE2-NEXT: pushq %r12 3312; SSE2-NEXT: pushq %rbx 3313; SSE2-NEXT: movq %r9, %r10 3314; SSE2-NEXT: movq %rcx, %r12 3315; SSE2-NEXT: movq %rdx, %r11 3316; SSE2-NEXT: movq %rsi, %rax 3317; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r14 3318; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r15 3319; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r9 3320; SSE2-NEXT: testq %r10, %r10 3321; SSE2-NEXT: setne %cl 3322; SSE2-NEXT: testq %rsi, %rsi 3323; SSE2-NEXT: setne %r13b 3324; SSE2-NEXT: andb %cl, %r13b 3325; SSE2-NEXT: mulq %r8 3326; SSE2-NEXT: movq %rax, %rsi 3327; SSE2-NEXT: seto %bpl 3328; SSE2-NEXT: movq %r10, %rax 3329; SSE2-NEXT: mulq %rdi 3330; SSE2-NEXT: movq %rax, %rcx 3331; SSE2-NEXT: seto %bl 3332; SSE2-NEXT: orb %bpl, %bl 3333; SSE2-NEXT: addq %rsi, %rcx 3334; SSE2-NEXT: movq %rdi, %rax 3335; SSE2-NEXT: mulq %r8 3336; SSE2-NEXT: movq %rax, %rdi 3337; SSE2-NEXT: movq %rdx, %rsi 3338; SSE2-NEXT: addq %rcx, %rsi 3339; SSE2-NEXT: setb %cl 3340; SSE2-NEXT: orb %bl, %cl 3341; SSE2-NEXT: orb %r13b, %cl 3342; SSE2-NEXT: testq %r9, %r9 3343; SSE2-NEXT: setne %al 3344; SSE2-NEXT: testq %r12, %r12 3345; SSE2-NEXT: setne %r8b 3346; SSE2-NEXT: andb %al, %r8b 3347; SSE2-NEXT: movq %r12, %rax 3348; SSE2-NEXT: mulq %r15 3349; SSE2-NEXT: movq %rax, %rbp 3350; SSE2-NEXT: seto %r10b 3351; SSE2-NEXT: movq %r9, %rax 3352; SSE2-NEXT: mulq %r11 3353; SSE2-NEXT: movq %rax, %rbx 3354; SSE2-NEXT: seto %r9b 3355; SSE2-NEXT: orb %r10b, %r9b 3356; SSE2-NEXT: addq %rbp, %rbx 3357; SSE2-NEXT: movq %r11, %rax 3358; SSE2-NEXT: mulq %r15 3359; SSE2-NEXT: addq %rbx, %rdx 3360; SSE2-NEXT: setb %bl 3361; SSE2-NEXT: orb %r9b, %bl 3362; SSE2-NEXT: orb %r8b, %bl 3363; SSE2-NEXT: movzbl %bl, %ebp 3364; SSE2-NEXT: negl %ebp 3365; SSE2-NEXT: movd %ebp, %xmm1 3366; SSE2-NEXT: movzbl %cl, %ecx 3367; SSE2-NEXT: negl %ecx 3368; SSE2-NEXT: movd %ecx, %xmm0 3369; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3370; SSE2-NEXT: movq %rax, 16(%r14) 3371; SSE2-NEXT: movq %rdi, (%r14) 3372; SSE2-NEXT: movq %rdx, 24(%r14) 3373; SSE2-NEXT: movq %rsi, 8(%r14) 3374; SSE2-NEXT: popq %rbx 3375; SSE2-NEXT: popq %r12 3376; SSE2-NEXT: popq %r13 3377; SSE2-NEXT: popq %r14 3378; SSE2-NEXT: popq %r15 3379; SSE2-NEXT: popq %rbp 
3380; SSE2-NEXT: retq 3381; 3382; SSSE3-LABEL: umulo_v2i128: 3383; SSSE3: # %bb.0: 3384; SSSE3-NEXT: pushq %rbp 3385; SSSE3-NEXT: pushq %r15 3386; SSSE3-NEXT: pushq %r14 3387; SSSE3-NEXT: pushq %r13 3388; SSSE3-NEXT: pushq %r12 3389; SSSE3-NEXT: pushq %rbx 3390; SSSE3-NEXT: movq %r9, %r10 3391; SSSE3-NEXT: movq %rcx, %r12 3392; SSSE3-NEXT: movq %rdx, %r11 3393; SSSE3-NEXT: movq %rsi, %rax 3394; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r14 3395; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r15 3396; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r9 3397; SSSE3-NEXT: testq %r10, %r10 3398; SSSE3-NEXT: setne %cl 3399; SSSE3-NEXT: testq %rsi, %rsi 3400; SSSE3-NEXT: setne %r13b 3401; SSSE3-NEXT: andb %cl, %r13b 3402; SSSE3-NEXT: mulq %r8 3403; SSSE3-NEXT: movq %rax, %rsi 3404; SSSE3-NEXT: seto %bpl 3405; SSSE3-NEXT: movq %r10, %rax 3406; SSSE3-NEXT: mulq %rdi 3407; SSSE3-NEXT: movq %rax, %rcx 3408; SSSE3-NEXT: seto %bl 3409; SSSE3-NEXT: orb %bpl, %bl 3410; SSSE3-NEXT: addq %rsi, %rcx 3411; SSSE3-NEXT: movq %rdi, %rax 3412; SSSE3-NEXT: mulq %r8 3413; SSSE3-NEXT: movq %rax, %rdi 3414; SSSE3-NEXT: movq %rdx, %rsi 3415; SSSE3-NEXT: addq %rcx, %rsi 3416; SSSE3-NEXT: setb %cl 3417; SSSE3-NEXT: orb %bl, %cl 3418; SSSE3-NEXT: orb %r13b, %cl 3419; SSSE3-NEXT: testq %r9, %r9 3420; SSSE3-NEXT: setne %al 3421; SSSE3-NEXT: testq %r12, %r12 3422; SSSE3-NEXT: setne %r8b 3423; SSSE3-NEXT: andb %al, %r8b 3424; SSSE3-NEXT: movq %r12, %rax 3425; SSSE3-NEXT: mulq %r15 3426; SSSE3-NEXT: movq %rax, %rbp 3427; SSSE3-NEXT: seto %r10b 3428; SSSE3-NEXT: movq %r9, %rax 3429; SSSE3-NEXT: mulq %r11 3430; SSSE3-NEXT: movq %rax, %rbx 3431; SSSE3-NEXT: seto %r9b 3432; SSSE3-NEXT: orb %r10b, %r9b 3433; SSSE3-NEXT: addq %rbp, %rbx 3434; SSSE3-NEXT: movq %r11, %rax 3435; SSSE3-NEXT: mulq %r15 3436; SSSE3-NEXT: addq %rbx, %rdx 3437; SSSE3-NEXT: setb %bl 3438; SSSE3-NEXT: orb %r9b, %bl 3439; SSSE3-NEXT: orb %r8b, %bl 3440; SSSE3-NEXT: movzbl %bl, %ebp 3441; SSSE3-NEXT: negl %ebp 3442; SSSE3-NEXT: movd %ebp, %xmm1 3443; SSSE3-NEXT: movzbl %cl, %ecx 3444; SSSE3-NEXT: negl %ecx 3445; SSSE3-NEXT: movd %ecx, %xmm0 3446; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3447; SSSE3-NEXT: movq %rax, 16(%r14) 3448; SSSE3-NEXT: movq %rdi, (%r14) 3449; SSSE3-NEXT: movq %rdx, 24(%r14) 3450; SSSE3-NEXT: movq %rsi, 8(%r14) 3451; SSSE3-NEXT: popq %rbx 3452; SSSE3-NEXT: popq %r12 3453; SSSE3-NEXT: popq %r13 3454; SSSE3-NEXT: popq %r14 3455; SSSE3-NEXT: popq %r15 3456; SSSE3-NEXT: popq %rbp 3457; SSSE3-NEXT: retq 3458; 3459; SSE41-LABEL: umulo_v2i128: 3460; SSE41: # %bb.0: 3461; SSE41-NEXT: pushq %rbp 3462; SSE41-NEXT: pushq %r15 3463; SSE41-NEXT: pushq %r14 3464; SSE41-NEXT: pushq %r13 3465; SSE41-NEXT: pushq %r12 3466; SSE41-NEXT: pushq %rbx 3467; SSE41-NEXT: movq %r9, %r10 3468; SSE41-NEXT: movq %rcx, %r12 3469; SSE41-NEXT: movq %rdx, %r11 3470; SSE41-NEXT: movq %rsi, %rax 3471; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r14 3472; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r15 3473; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r9 3474; SSE41-NEXT: testq %r10, %r10 3475; SSE41-NEXT: setne %cl 3476; SSE41-NEXT: testq %rsi, %rsi 3477; SSE41-NEXT: setne %r13b 3478; SSE41-NEXT: andb %cl, %r13b 3479; SSE41-NEXT: mulq %r8 3480; SSE41-NEXT: movq %rax, %rsi 3481; SSE41-NEXT: seto %bpl 3482; SSE41-NEXT: movq %r10, %rax 3483; SSE41-NEXT: mulq %rdi 3484; SSE41-NEXT: movq %rax, %rcx 3485; SSE41-NEXT: seto %bl 3486; SSE41-NEXT: orb %bpl, %bl 3487; SSE41-NEXT: addq %rsi, %rcx 3488; SSE41-NEXT: movq %rdi, %rax 3489; SSE41-NEXT: mulq %r8 3490; SSE41-NEXT: movq %rax, %rdi 3491; 
; SSE41-NEXT: movq %rdx, %rsi
; SSE41-NEXT: addq %rcx, %rsi
; SSE41-NEXT: setb %cl
; SSE41-NEXT: orb %bl, %cl
; SSE41-NEXT: orb %r13b, %cl
; SSE41-NEXT: testq %r9, %r9
; SSE41-NEXT: setne %al
; SSE41-NEXT: testq %r12, %r12
; SSE41-NEXT: setne %r8b
; SSE41-NEXT: andb %al, %r8b
; SSE41-NEXT: movq %r12, %rax
; SSE41-NEXT: mulq %r15
; SSE41-NEXT: movq %rax, %rbp
; SSE41-NEXT: seto %r10b
; SSE41-NEXT: movq %r9, %rax
; SSE41-NEXT: mulq %r11
; SSE41-NEXT: movq %rax, %rbx
; SSE41-NEXT: seto %r9b
; SSE41-NEXT: orb %r10b, %r9b
; SSE41-NEXT: addq %rbp, %rbx
; SSE41-NEXT: movq %r11, %rax
; SSE41-NEXT: mulq %r15
; SSE41-NEXT: addq %rbx, %rdx
; SSE41-NEXT: setb %bl
; SSE41-NEXT: orb %r9b, %bl
; SSE41-NEXT: orb %r8b, %bl
; SSE41-NEXT: movzbl %bl, %ebp
; SSE41-NEXT: negl %ebp
; SSE41-NEXT: movzbl %cl, %ecx
; SSE41-NEXT: negl %ecx
; SSE41-NEXT: movd %ecx, %xmm0
; SSE41-NEXT: pinsrd $1, %ebp, %xmm0
; SSE41-NEXT: movq %rax, 16(%r14)
; SSE41-NEXT: movq %rdi, (%r14)
; SSE41-NEXT: movq %rdx, 24(%r14)
; SSE41-NEXT: movq %rsi, 8(%r14)
; SSE41-NEXT: popq %rbx
; SSE41-NEXT: popq %r12
; SSE41-NEXT: popq %r13
; SSE41-NEXT: popq %r14
; SSE41-NEXT: popq %r15
; SSE41-NEXT: popq %rbp
; SSE41-NEXT: retq
;
; AVX1-LABEL: umulo_v2i128:
; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbp
; AVX1-NEXT: pushq %r15
; AVX1-NEXT: pushq %r14
; AVX1-NEXT: pushq %r13
; AVX1-NEXT: pushq %r12
; AVX1-NEXT: pushq %rbx
; AVX1-NEXT: movq %r9, %r10
; AVX1-NEXT: movq %rcx, %r12
; AVX1-NEXT: movq %rdx, %r11
; AVX1-NEXT: movq %rsi, %rax
; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r14
; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r15
; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r9
; AVX1-NEXT: testq %r10, %r10
; AVX1-NEXT: setne %cl
; AVX1-NEXT: testq %rsi, %rsi
; AVX1-NEXT: setne %r13b
; AVX1-NEXT: andb %cl, %r13b
; AVX1-NEXT: mulq %r8
; AVX1-NEXT: movq %rax, %rsi
; AVX1-NEXT: seto %bpl
; AVX1-NEXT: movq %r10, %rax
; AVX1-NEXT: mulq %rdi
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: seto %bl
; AVX1-NEXT: orb %bpl, %bl
; AVX1-NEXT: addq %rsi, %rcx
; AVX1-NEXT: movq %rdi, %rax
; AVX1-NEXT: mulq %r8
; AVX1-NEXT: movq %rax, %rdi
; AVX1-NEXT: movq %rdx, %rsi
; AVX1-NEXT: addq %rcx, %rsi
; AVX1-NEXT: setb %cl
; AVX1-NEXT: orb %bl, %cl
; AVX1-NEXT: orb %r13b, %cl
; AVX1-NEXT: testq %r9, %r9
; AVX1-NEXT: setne %al
; AVX1-NEXT: testq %r12, %r12
; AVX1-NEXT: setne %r8b
; AVX1-NEXT: andb %al, %r8b
; AVX1-NEXT: movq %r12, %rax
; AVX1-NEXT: mulq %r15
; AVX1-NEXT: movq %rax, %rbp
; AVX1-NEXT: seto %r10b
; AVX1-NEXT: movq %r9, %rax
; AVX1-NEXT: mulq %r11
; AVX1-NEXT: movq %rax, %rbx
; AVX1-NEXT: seto %r9b
; AVX1-NEXT: orb %r10b, %r9b
; AVX1-NEXT: addq %rbp, %rbx
; AVX1-NEXT: movq %r11, %rax
; AVX1-NEXT: mulq %r15
; AVX1-NEXT: addq %rbx, %rdx
; AVX1-NEXT: setb %bl
; AVX1-NEXT: orb %r9b, %bl
; AVX1-NEXT: orb %r8b, %bl
; AVX1-NEXT: movzbl %bl, %ebp
; AVX1-NEXT: negl %ebp
; AVX1-NEXT: movzbl %cl, %ecx
; AVX1-NEXT: negl %ecx
; AVX1-NEXT: vmovd %ecx, %xmm0
; AVX1-NEXT: vpinsrd $1, %ebp, %xmm0, %xmm0
; AVX1-NEXT: movq %rax, 16(%r14)
; AVX1-NEXT: movq %rdi, (%r14)
; AVX1-NEXT: movq %rdx, 24(%r14)
; AVX1-NEXT: movq %rsi, 8(%r14)
; AVX1-NEXT: popq %rbx
; AVX1-NEXT: popq %r12
; AVX1-NEXT: popq %r13
; AVX1-NEXT: popq %r14
; AVX1-NEXT: popq %r15
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
; AVX2-LABEL: umulo_v2i128:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: pushq %r15
; AVX2-NEXT: pushq %r14
; AVX2-NEXT: pushq %r13
; AVX2-NEXT: pushq %r12
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: movq %r9, %r10
; AVX2-NEXT: movq %rcx, %r12
; AVX2-NEXT: movq %rdx, %r11
; AVX2-NEXT: movq %rsi, %rax
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r14
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r15
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r9
; AVX2-NEXT: testq %r10, %r10
; AVX2-NEXT: setne %cl
; AVX2-NEXT: testq %rsi, %rsi
; AVX2-NEXT: setne %r13b
; AVX2-NEXT: andb %cl, %r13b
; AVX2-NEXT: mulq %r8
; AVX2-NEXT: movq %rax, %rsi
; AVX2-NEXT: seto %bpl
; AVX2-NEXT: movq %r10, %rax
; AVX2-NEXT: mulq %rdi
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: seto %bl
; AVX2-NEXT: orb %bpl, %bl
; AVX2-NEXT: addq %rsi, %rcx
; AVX2-NEXT: movq %rdi, %rax
; AVX2-NEXT: mulq %r8
; AVX2-NEXT: movq %rax, %rdi
; AVX2-NEXT: movq %rdx, %rsi
; AVX2-NEXT: addq %rcx, %rsi
; AVX2-NEXT: setb %cl
; AVX2-NEXT: orb %bl, %cl
; AVX2-NEXT: orb %r13b, %cl
; AVX2-NEXT: testq %r9, %r9
; AVX2-NEXT: setne %al
; AVX2-NEXT: testq %r12, %r12
; AVX2-NEXT: setne %r8b
; AVX2-NEXT: andb %al, %r8b
; AVX2-NEXT: movq %r12, %rax
; AVX2-NEXT: mulq %r15
; AVX2-NEXT: movq %rax, %rbp
; AVX2-NEXT: seto %r10b
; AVX2-NEXT: movq %r9, %rax
; AVX2-NEXT: mulq %r11
; AVX2-NEXT: movq %rax, %rbx
; AVX2-NEXT: seto %r9b
; AVX2-NEXT: orb %r10b, %r9b
; AVX2-NEXT: addq %rbp, %rbx
; AVX2-NEXT: movq %r11, %rax
; AVX2-NEXT: mulq %r15
; AVX2-NEXT: addq %rbx, %rdx
; AVX2-NEXT: setb %bl
; AVX2-NEXT: orb %r9b, %bl
; AVX2-NEXT: orb %r8b, %bl
; AVX2-NEXT: movzbl %bl, %ebp
; AVX2-NEXT: negl %ebp
; AVX2-NEXT: movzbl %cl, %ecx
; AVX2-NEXT: negl %ecx
; AVX2-NEXT: vmovd %ecx, %xmm0
; AVX2-NEXT: vpinsrd $1, %ebp, %xmm0, %xmm0
; AVX2-NEXT: movq %rax, 16(%r14)
; AVX2-NEXT: movq %rdi, (%r14)
; AVX2-NEXT: movq %rdx, 24(%r14)
; AVX2-NEXT: movq %rsi, 8(%r14)
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r12
; AVX2-NEXT: popq %r13
; AVX2-NEXT: popq %r14
; AVX2-NEXT: popq %r15
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
;
; AVX512-LABEL: umulo_v2i128:
; AVX512: # %bb.0:
; AVX512-NEXT: pushq %rbp
; AVX512-NEXT: pushq %r15
; AVX512-NEXT: pushq %r14
; AVX512-NEXT: pushq %r13
; AVX512-NEXT: pushq %r12
; AVX512-NEXT: pushq %rbx
; AVX512-NEXT: movq %rcx, %rax
; AVX512-NEXT: movq %rdx, %r12
; AVX512-NEXT: movq %rdi, %r11
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r14
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r15
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT: testq %r10, %r10
; AVX512-NEXT: setne %dl
; AVX512-NEXT: testq %rcx, %rcx
; AVX512-NEXT: setne %r13b
; AVX512-NEXT: andb %dl, %r13b
; AVX512-NEXT: mulq %r15
; AVX512-NEXT: movq %rax, %rdi
; AVX512-NEXT: seto %bpl
; AVX512-NEXT: movq %r10, %rax
; AVX512-NEXT: mulq %r12
; AVX512-NEXT: movq %rax, %rbx
; AVX512-NEXT: seto %cl
; AVX512-NEXT: orb %bpl, %cl
; AVX512-NEXT: addq %rdi, %rbx
; AVX512-NEXT: movq %r12, %rax
; AVX512-NEXT: mulq %r15
; AVX512-NEXT: movq %rax, %r10
; AVX512-NEXT: movq %rdx, %r15
; AVX512-NEXT: addq %rbx, %r15
; AVX512-NEXT: setb %al
; AVX512-NEXT: orb %cl, %al
; AVX512-NEXT: orb %r13b, %al
; AVX512-NEXT: kmovd %eax, %k0
; AVX512-NEXT: testq %r9, %r9
; AVX512-NEXT: setne %al
; AVX512-NEXT: testq %rsi, %rsi
; AVX512-NEXT: setne %cl
; AVX512-NEXT: andb %al, %cl
; AVX512-NEXT: movq %rsi, %rax
; AVX512-NEXT: mulq %r8
; AVX512-NEXT: movq %rax, %rsi
; AVX512-NEXT: seto %bpl
; AVX512-NEXT: movq %r9, %rax
; AVX512-NEXT: mulq %r11
; AVX512-NEXT: movq %rax, %rdi
; AVX512-NEXT: seto %bl
; AVX512-NEXT: orb %bpl, %bl
; AVX512-NEXT: addq %rsi, %rdi
; AVX512-NEXT: movq %r11, %rax
; AVX512-NEXT: mulq %r8
; AVX512-NEXT: addq %rdi, %rdx
; AVX512-NEXT: setb %sil
; AVX512-NEXT: orb %bl, %sil
; AVX512-NEXT: orb %cl, %sil
; AVX512-NEXT: andl $1, %esi
; AVX512-NEXT: kmovw %esi, %k1
; AVX512-NEXT: kshiftlw $1, %k0, %k0
; AVX512-NEXT: korw %k0, %k1, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: movq %r10, 16(%r14)
; AVX512-NEXT: movq %rax, (%r14)
; AVX512-NEXT: movq %r15, 24(%r14)
; AVX512-NEXT: movq %rdx, 8(%r14)
; AVX512-NEXT: popq %rbx
; AVX512-NEXT: popq %r12
; AVX512-NEXT: popq %r13
; AVX512-NEXT: popq %r14
; AVX512-NEXT: popq %r15
; AVX512-NEXT: popq %rbp
; AVX512-NEXT: retq
  %t = call {<2 x i128>, <2 x i1>} @llvm.umul.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
  %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0
  %obit = extractvalue {<2 x i128>, <2 x i1>} %t, 1
  %res = sext <2 x i1> %obit to <2 x i32>
  store <2 x i128> %val, <2 x i128>* %p2
  ret <2 x i32> %res
}
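
; The <2 x i128> case has no vector lowering: each lane is scalarized into the
; three-MULQ expansion checked above, where TEST/SETNE/ANDB flags both high
; halves being nonzero, SETO flags overflow of the two 64-bit cross products,
; and SETB flags the carry when their sum is added to the high half of the
; lo*lo product; the ORed bits form the lane's overflow flag, which is then
; sign-extended into the <2 x i32> result (assembled via mask registers on
; AVX512, and MOVD/PINSRD or VMOVD/VPINSRD elsewhere).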