; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512

declare {<1 x i32>, <1 x i1>} @llvm.ssub.with.overflow.v1i32(<1 x i32>, <1 x i32>)
declare {<2 x i32>, <2 x i1>} @llvm.ssub.with.overflow.v2i32(<2 x i32>, <2 x i32>)
declare {<3 x i32>, <3 x i1>} @llvm.ssub.with.overflow.v3i32(<3 x i32>, <3 x i32>)
declare {<4 x i32>, <4 x i1>} @llvm.ssub.with.overflow.v4i32(<4 x i32>, <4 x i32>)
declare {<6 x i32>, <6 x i1>} @llvm.ssub.with.overflow.v6i32(<6 x i32>, <6 x i32>)
declare {<8 x i32>, <8 x i1>} @llvm.ssub.with.overflow.v8i32(<8 x i32>, <8 x i32>)
declare {<16 x i32>, <16 x i1>} @llvm.ssub.with.overflow.v16i32(<16 x i32>, <16 x i32>)

declare {<16 x i8>, <16 x i1>} @llvm.ssub.with.overflow.v16i8(<16 x i8>, <16 x i8>)
declare {<8 x i16>, <8 x i1>} @llvm.ssub.with.overflow.v8i16(<8 x i16>, <8 x i16>)
declare {<2 x i64>, <2 x i1>} @llvm.ssub.with.overflow.v2i64(<2 x i64>, <2 x i64>)

declare {<4 x i24>, <4 x i1>} @llvm.ssub.with.overflow.v4i24(<4 x i24>, <4 x i24>)
declare {<4 x i1>, <4 x i1>} @llvm.ssub.with.overflow.v4i1(<4 x i1>, <4 x i1>)
declare {<2 x i128>, <2 x i1>} @llvm.ssub.with.overflow.v2i128(<2 x i128>, <2 x i128>)

define <1 x i32> @ssubo_v1i32(<1 x i32> %a0, <1 x i32> %a1, <1 x i32>* %p2) nounwind {
; SSE-LABEL: ssubo_v1i32:
; SSE: # %bb.0:
; SSE-NEXT: xorl %eax, %eax
; SSE-NEXT: subl %esi, %edi
; SSE-NEXT: seto %al
; SSE-NEXT: negl %eax
; SSE-NEXT: movl %edi, (%rdx)
; SSE-NEXT: retq
;
; AVX-LABEL: ssubo_v1i32:
; AVX: # %bb.0:
; AVX-NEXT: xorl %eax, %eax
; AVX-NEXT: subl %esi, %edi
; AVX-NEXT: seto %al
; AVX-NEXT: negl %eax
; AVX-NEXT: movl %edi, (%rdx)
; AVX-NEXT: retq
  %t = call {<1 x i32>, <1 x i1>} @llvm.ssub.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1)
  %val = extractvalue {<1 x i32>, <1 x i1>} %t, 0
  %obit = extractvalue {<1 x i32>, <1 x i1>} %t, 1
  %res = sext <1 x i1> %obit to <1 x i32>
  store <1 x i32> %val, <1 x i32>* %p2
  ret <1 x i32> %res
}

define <2 x i32> @ssubo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind {
; SSE-LABEL: ssubo_v2i32:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: psubd %xmm1, %xmm3
; SSE-NEXT: pcmpgtd %xmm2, %xmm1
; SSE-NEXT: pcmpgtd %xmm3, %xmm0
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: movq %xmm3, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: ssubo_v2i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vmovq %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: ssubo_v2i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2
; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vmovq %xmm1, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: ssubo_v2i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vpcmpgtd %xmm2, %xmm1, %k0
; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
; AVX512-NEXT: kxorw %k1, %k0, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: vmovq %xmm1, (%rdi)
; AVX512-NEXT: retq
  %t = call {<2 x i32>, <2 x i1>} @llvm.ssub.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
  %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
  %obit = extractvalue {<2 x i32>, <2 x i1>} %t, 1
  %res = sext <2 x i1> %obit to <2 x i32>
  store <2 x i32> %val, <2 x i32>* %p2
  ret <2 x i32> %res
}

define <3 x i32> @ssubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) nounwind {
; SSE2-LABEL: ssubo_v3i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psubd %xmm1, %xmm3
; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
; SSE2-NEXT: pcmpgtd %xmm3, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: movq %xmm3, (%rdi)
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; SSE2-NEXT: movd %xmm1, 8(%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: ssubo_v3i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: movdqa %xmm0, %xmm3
; SSSE3-NEXT: psubd %xmm1, %xmm3
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm0
; SSSE3-NEXT: pxor %xmm1, %xmm0
; SSSE3-NEXT: movq %xmm3, (%rdi)
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; SSSE3-NEXT: movd %xmm1, 8(%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: ssubo_v3i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: psubd %xmm1, %xmm3
; SSE41-NEXT: pcmpgtd %xmm2, %xmm1
; SSE41-NEXT: pcmpgtd %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm0
; SSE41-NEXT: pextrd $2, %xmm3, 8(%rdi)
; SSE41-NEXT: movq %xmm3, (%rdi)
; SSE41-NEXT: retq
;
; AVX1-LABEL: ssubo_v3i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpextrd $2, %xmm1, 8(%rdi)
; AVX1-NEXT: vmovq %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: ssubo_v3i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2
; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vpextrd $2, %xmm1, 8(%rdi)
; AVX2-NEXT: vmovq %xmm1, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: ssubo_v3i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vpcmpgtd %xmm2, %xmm1, %k0
; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
; AVX512-NEXT: kxorw %k1, %k0, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: vpextrd $2, %xmm1, 8(%rdi)
; AVX512-NEXT: vmovq %xmm1, (%rdi)
; AVX512-NEXT: retq
  %t = call {<3 x i32>, <3 x i1>} @llvm.ssub.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1)
  %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0
  %obit = extractvalue {<3 x i32>, <3 x i1>} %t, 1
  %res = sext <3 x i1> %obit to <3 x i32>
  store <3 x i32> %val, <3 x i32>* %p2
  ret <3 x i32> %res
}

define <4 x i32> @ssubo_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>* %p2) nounwind {
; SSE-LABEL: ssubo_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: psubd %xmm1, %xmm3
; SSE-NEXT: pcmpgtd %xmm2, %xmm1
; SSE-NEXT: pcmpgtd %xmm3, %xmm0
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm3, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: ssubo_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: ssubo_v4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2
; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: ssubo_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vpcmpgtd %xmm2, %xmm1, %k0
; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
; AVX512-NEXT: kxorw %k1, %k0, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
; AVX512-NEXT: retq
  %t = call {<4 x i32>, <4 x i1>} @llvm.ssub.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1)
  %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0
  %obit = extractvalue {<4 x i32>, <4 x i1>} %t, 1
  %res = sext <4 x i1> %obit to <4 x i32>
  store <4 x i32> %val, <4 x i32>* %p2
  ret <4 x i32> %res
}

define <6 x i32> @ssubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) nounwind {
; SSE2-LABEL: ssubo_v6i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movq %rdi, %rax
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: movd %r8d, %xmm1
; SSE2-NEXT: movd %ecx, %xmm2
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT: movd %edx, %xmm1
; SSE2-NEXT: movd %esi, %xmm3
; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT: movd %r9d, %xmm1
; SSE2-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: psubd %xmm0, %xmm4
; SSE2-NEXT: pcmpgtd %xmm4, %xmm3
; SSE2-NEXT: pxor %xmm5, %xmm5
; SSE2-NEXT: pcmpgtd %xmm5, %xmm0
; SSE2-NEXT: pxor %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: psubd %xmm2, %xmm3
; SSE2-NEXT: pcmpgtd %xmm3, %xmm1
; SSE2-NEXT: pcmpgtd %xmm5, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: movq %xmm3, 16(%rcx)
; SSE2-NEXT: movdqa %xmm4, (%rcx)
; SSE2-NEXT: movq %xmm2, 16(%rdi)
; SSE2-NEXT: movdqa %xmm0, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: ssubo_v6i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movq %rdi, %rax
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: movd %r8d, %xmm1
; SSSE3-NEXT: movd %ecx, %xmm2
; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSSE3-NEXT: movd %edx, %xmm1
; SSSE3-NEXT: movd %esi, %xmm3
; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSSE3-NEXT: movd %r9d, %xmm1
; SSSE3-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; SSSE3-NEXT: movdqa %xmm3, %xmm4
; SSSE3-NEXT: psubd %xmm0, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3
; SSSE3-NEXT: pxor %xmm5, %xmm5
; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0
; SSSE3-NEXT: pxor %xmm3, %xmm0
; SSSE3-NEXT: movdqa %xmm1, %xmm3
; SSSE3-NEXT: psubd %xmm2, %xmm3
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1
; SSSE3-NEXT: pcmpgtd %xmm5, %xmm2
; SSSE3-NEXT: pxor %xmm1, %xmm2
; SSSE3-NEXT: movq %xmm3, 16(%rcx)
; SSSE3-NEXT: movdqa %xmm4, (%rcx)
; SSSE3-NEXT: movq %xmm2, 16(%rdi)
; SSSE3-NEXT: movdqa %xmm0, (%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: ssubo_v6i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movq %rdi, %rax
; SSE41-NEXT: movd %esi, %xmm1
; SSE41-NEXT: pinsrd $1, %edx, %xmm1
; SSE41-NEXT: pinsrd $2, %ecx, %xmm1
; SSE41-NEXT: pinsrd $3, %r8d, %xmm1
; SSE41-NEXT: movd %r9d, %xmm0
; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm0
; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm2
; SSE41-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT: pinsrd $2, {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41-NEXT: psubd %xmm3, %xmm4
; SSE41-NEXT: pcmpgtd %xmm4, %xmm1
; SSE41-NEXT: pxor %xmm5, %xmm5
; SSE41-NEXT: pcmpgtd %xmm5, %xmm3
; SSE41-NEXT: pxor %xmm1, %xmm3
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psubd %xmm2, %xmm1
; SSE41-NEXT: pcmpgtd %xmm5, %xmm2
; SSE41-NEXT: pcmpgtd %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm0
; SSE41-NEXT: movq %xmm1, 16(%rcx)
; SSE41-NEXT: movdqa %xmm4, (%rcx)
; SSE41-NEXT: movq %xmm0, 16(%rdi)
; SSE41-NEXT: movdqa %xmm3, (%rdi)
; SSE41-NEXT: retq
;
; AVX1-LABEL: ssubo_v6i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm4, %xmm4
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT: vxorps %ymm0, %ymm3, %ymm0
; AVX1-NEXT: vmovq %xmm2, 16(%rdi)
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: ssubo_v6i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpgtd %ymm2, %ymm1, %ymm2
; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vmovq %xmm2, 16(%rdi)
; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: ssubo_v6i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vpcmpgtd %ymm2, %ymm1, %k0
; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm1
; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %k1
; AVX512-NEXT: kxorw %k1, %k0, %k1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512-NEXT: vmovq %xmm2, 16(%rdi)
; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
; AVX512-NEXT: retq
  %t = call {<6 x i32>, <6 x i1>} @llvm.ssub.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1)
  %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0
  %obit = extractvalue {<6 x i32>, <6 x i1>} %t, 1
  %res = sext <6 x i1> %obit to <6 x i32>
  store <6 x i32> %val, <6 x i32>* %p2
  ret <6 x i32> %res
}

define <8 x i32> @ssubo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) nounwind {
; SSE-LABEL: ssubo_v8i32:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm4, %xmm4
; SSE-NEXT: movdqa %xmm0, %xmm5
; SSE-NEXT: psubd %xmm2, %xmm5
; SSE-NEXT: pcmpgtd %xmm4, %xmm2
; SSE-NEXT: pcmpgtd %xmm5, %xmm0
; SSE-NEXT: pxor %xmm2, %xmm0
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: psubd %xmm3, %xmm2
; SSE-NEXT: pcmpgtd %xmm4, %xmm3
; SSE-NEXT: pcmpgtd %xmm2, %xmm1
; SSE-NEXT: pxor %xmm3, %xmm1
; SSE-NEXT: movdqa %xmm2, 16(%rdi)
; SSE-NEXT: movdqa %xmm5, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: ssubo_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm4, %xmm4
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT: vxorps %ymm0, %ymm3, %ymm0
; AVX1-NEXT: vmovdqa %xmm2, 16(%rdi)
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: ssubo_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpgtd %ymm2, %ymm1, %ymm2
; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vmovdqa %ymm1, (%rdi)
; AVX2-NEXT: retq
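; NOTE (reader annotation, not from the update script): with no saturating
; subtract for i32 elements, the overflow mask above is built from the identity
;   overflow = (y > 0) XOR (x > x - y)
; since, absent wrapping, x > x - y holds exactly when y > 0: one pcmpgtd
; against zero, one against the difference, then pxor. The AVX512 block below
; does the same with mask registers (vpcmpgtd into %k0/%k1, kxorw, and a
; zero-masked vmovdqa32 of all-ones to materialize the sign-extended result).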
;
; AVX512-LABEL: ssubo_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vpcmpgtd %ymm2, %ymm1, %k0
; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm1
; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %k1
; AVX512-NEXT: kxorw %k1, %k0, %k1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT: vmovdqa %ymm1, (%rdi)
; AVX512-NEXT: retq
  %t = call {<8 x i32>, <8 x i1>} @llvm.ssub.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1)
  %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0
  %obit = extractvalue {<8 x i32>, <8 x i1>} %t, 1
  %res = sext <8 x i1> %obit to <8 x i32>
  store <8 x i32> %val, <8 x i32>* %p2
  ret <8 x i32> %res
}

define <16 x i32> @ssubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) nounwind {
; SSE-LABEL: ssubo_v16i32:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm9, %xmm9
; SSE-NEXT: movdqa %xmm0, %xmm8
; SSE-NEXT: psubd %xmm4, %xmm8
; SSE-NEXT: pcmpgtd %xmm9, %xmm4
; SSE-NEXT: pcmpgtd %xmm8, %xmm0
; SSE-NEXT: pxor %xmm4, %xmm0
; SSE-NEXT: movdqa %xmm1, %xmm4
; SSE-NEXT: psubd %xmm5, %xmm4
; SSE-NEXT: pcmpgtd %xmm9, %xmm5
; SSE-NEXT: pcmpgtd %xmm4, %xmm1
; SSE-NEXT: pxor %xmm5, %xmm1
; SSE-NEXT: movdqa %xmm2, %xmm5
; SSE-NEXT: psubd %xmm6, %xmm5
; SSE-NEXT: pcmpgtd %xmm9, %xmm6
; SSE-NEXT: pcmpgtd %xmm5, %xmm2
; SSE-NEXT: pxor %xmm6, %xmm2
; SSE-NEXT: movdqa %xmm3, %xmm6
; SSE-NEXT: psubd %xmm7, %xmm6
; SSE-NEXT: pcmpgtd %xmm9, %xmm7
; SSE-NEXT: pcmpgtd %xmm6, %xmm3
; SSE-NEXT: pxor %xmm7, %xmm3
; SSE-NEXT: movdqa %xmm6, 48(%rdi)
; SSE-NEXT: movdqa %xmm5, 32(%rdi)
; SSE-NEXT: movdqa %xmm4, 16(%rdi)
; SSE-NEXT: movdqa %xmm8, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: ssubo_v16i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpcmpgtd %xmm5, %xmm4, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
; AVX1-NEXT: vpsubd %xmm4, %xmm7, %xmm8
; AVX1-NEXT: vpcmpgtd %xmm8, %xmm7, %xmm7
; AVX1-NEXT: vpxor %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpcmpgtd %xmm5, %xmm3, %xmm7
; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm3
; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm7, %xmm1
; AVX1-NEXT: vpackssdw %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
; AVX1-NEXT: vpcmpgtd %xmm5, %xmm6, %xmm7
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpsubd %xmm6, %xmm4, %xmm6
; AVX1-NEXT: vpcmpgtd %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm4
; AVX1-NEXT: vpcmpgtd %xmm5, %xmm2, %xmm5
; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
; AVX1-NEXT: vpacksswb %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1
; AVX1-NEXT: vmovdqa %xmm8, 48(%rdi)
; AVX1-NEXT: vmovdqa %xmm3, 32(%rdi)
; AVX1-NEXT: vmovdqa %xmm6, 16(%rdi)
; AVX1-NEXT: vmovdqa %xmm2, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: ssubo_v16i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX2-NEXT: vpcmpgtd %ymm4, %ymm3, %ymm5
; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm3
; AVX2-NEXT: vpcmpgtd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpxor %ymm1, %ymm5, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
; AVX2-NEXT: vpackssdw %xmm5, %xmm1, %xmm1
; AVX2-NEXT: vpcmpgtd %ymm4, %ymm2, %ymm4
; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT: vpacksswb %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi)
; AVX2-NEXT: vmovdqa %ymm2, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: ssubo_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vpcmpgtd %zmm2, %zmm1, %k0
; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm1
; AVX512-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
; AVX512-NEXT: kxorw %k1, %k0, %k1
; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: vmovdqa64 %zmm1, (%rdi)
; AVX512-NEXT: retq
  %t = call {<16 x i32>, <16 x i1>} @llvm.ssub.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1)
  %val = extractvalue {<16 x i32>, <16 x i1>} %t, 0
  %obit = extractvalue {<16 x i32>, <16 x i1>} %t, 1
  %res = sext <16 x i1> %obit to <16 x i32>
  store <16 x i32> %val, <16 x i32>* %p2
  ret <16 x i32> %res
}

define <16 x i32> @ssubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nounwind {
; SSE2-LABEL: ssubo_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psubsb %xmm1, %xmm2
; SSE2-NEXT: psubb %xmm1, %xmm0
; SSE2-NEXT: pcmpeqb %xmm0, %xmm2
; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pslld $31, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pslld $31, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pslld $31, %xmm2
; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pslld $31, %xmm3
; SSE2-NEXT: psrad $31, %xmm3
; SSE2-NEXT: movdqa %xmm0, (%rdi)
; SSE2-NEXT: movdqa %xmm4, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: ssubo_v16i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: psubsb %xmm1, %xmm2
; SSSE3-NEXT: psubb %xmm1, %xmm0
; SSSE3-NEXT: pcmpeqb %xmm0, %xmm2
; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3
; SSSE3-NEXT: pxor %xmm2, %xmm3
; SSSE3-NEXT: movdqa %xmm3, %xmm1
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: movdqa %xmm1, %xmm4
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: pslld $31, %xmm4
; SSSE3-NEXT: psrad $31, %xmm4
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSSE3-NEXT: pslld $31, %xmm1
; SSSE3-NEXT: psrad $31, %xmm1
; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSSE3-NEXT: movdqa %xmm3, %xmm2
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: pslld $31, %xmm2
; SSSE3-NEXT: psrad $31, %xmm2
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; SSSE3-NEXT: pslld $31, %xmm3
; SSSE3-NEXT: psrad $31, %xmm3
; SSSE3-NEXT: movdqa %xmm0, (%rdi)
; SSSE3-NEXT: movdqa %xmm4, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: ssubo_v16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psubsb %xmm1, %xmm2
; SSE41-NEXT: psubb %xmm1, %xmm0
; SSE41-NEXT: pcmpeqb %xmm0, %xmm2
; SSE41-NEXT: pcmpeqd %xmm3, %xmm3
; SSE41-NEXT: pxor %xmm2, %xmm3
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm4
; SSE41-NEXT: psrad $31, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm1
; SSE41-NEXT: psrad $31, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm2
; SSE41-NEXT: psrad $31, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm3
; SSE41-NEXT: psrad $31, %xmm3
; SSE41-NEXT: movdqa %xmm0, (%rdi)
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: ssubo_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsubsb %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm3
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm3, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: vmovdqa %xmm3, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: ssubo_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsubsb %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm3
; AVX2-NEXT: vpcmpeqb %xmm2, %xmm3, %xmm0
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT: vmovdqa %xmm3, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: ssubo_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsubsb %xmm1, %xmm0, %xmm2
; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpcmpneqb %xmm2, %xmm1, %k1
; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
; AVX512-NEXT: retq
  %t = call {<16 x i8>, <16 x i1>} @llvm.ssub.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
  %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0
  %obit = extractvalue {<16 x i8>, <16 x i1>} %t, 1
  %res = sext <16 x i1> %obit to <16 x i32>
  store <16 x i8> %val, <16 x i8>* %p2
  ret <16 x i32> %res
}

define <8 x i32> @ssubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) nounwind {
; SSE2-LABEL: ssubo_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psubsw %xmm1, %xmm2
; SSE2-NEXT: psubw %xmm1, %xmm0
; SSE2-NEXT: pcmpeqw %xmm0, %xmm2
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pslld $31, %xmm2
; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pslld $31, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: movdqa %xmm0, (%rdi)
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: ssubo_v8i16:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: psubsw %xmm1, %xmm2
; SSSE3-NEXT: psubw %xmm1, %xmm0
; SSSE3-NEXT: pcmpeqw %xmm0, %xmm2
; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: pslld $31, %xmm2
; SSSE3-NEXT: psrad $31, %xmm2
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSSE3-NEXT: pslld $31, %xmm1
; SSSE3-NEXT: psrad $31, %xmm1
; SSSE3-NEXT: movdqa %xmm0, (%rdi)
; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: ssubo_v8i16:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psubsw %xmm1, %xmm2
; SSE41-NEXT: psubw %xmm1, %xmm0
; SSE41-NEXT: pcmpeqw %xmm0, %xmm2
; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
; SSE41-NEXT: pxor %xmm2, %xmm1
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE41-NEXT: pslld $31, %xmm2
; SSE41-NEXT: psrad $31, %xmm2
; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE41-NEXT: pslld $31, %xmm1
; SSE41-NEXT: psrad $31, %xmm1
; SSE41-NEXT: movdqa %xmm0, (%rdi)
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: ssubo_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsubsw %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: ssubo_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsubsw %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm0
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: ssubo_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsubsw %xmm1, %xmm0, %xmm2
; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpcmpneqw %xmm2, %xmm1, %k1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
; AVX512-NEXT: retq
  %t = call {<8 x i16>, <8 x i1>} @llvm.ssub.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1)
  %val = extractvalue {<8 x i16>, <8 x i1>} %t, 0
  %obit = extractvalue {<8 x i16>, <8 x i1>} %t, 1
  %res = sext <8 x i1> %obit to <8 x i32>
  store <8 x i16> %val, <8 x i16>* %p2
  ret <8 x i32> %res
}

define <2 x i32> @ssubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind {
; SSE-LABEL: ssubo_v2i64:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: pxor %xmm2, %xmm3
; SSE-NEXT: psubq %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm0, (%rdi)
; SSE-NEXT: pxor %xmm2, %xmm0
; SSE-NEXT: movdqa %xmm3, %xmm4
; SSE-NEXT: pcmpgtd %xmm0, %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE-NEXT: pcmpeqd %xmm3, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-NEXT: pand %xmm5, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; SSE-NEXT: por %xmm0, %xmm3
; SSE-NEXT: pxor %xmm2, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: pcmpgtd %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
; SSE-NEXT: pcmpeqd %xmm2, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: pxor %xmm3, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: retq
;
; AVX1-LABEL: ssubo_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: ssubo_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2
; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: ssubo_v2i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vpcmpgtq %xmm2, %xmm1, %k0
; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
; AVX512-NEXT: kxorw %k1, %k0, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
; AVX512-NEXT: retq
  %t = call {<2 x i64>, <2 x i1>} @llvm.ssub.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
  %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
  %obit = extractvalue {<2 x i64>, <2 x i1>} %t, 1
  %res = sext <2 x i1> %obit to <2 x i32>
  store <2 x i64> %val, <2 x i64>* %p2
  ret <2 x i32> %res
}

define <4 x i32> @ssubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) nounwind {
; SSE2-LABEL: ssubo_v4i24:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pslld $8, %xmm1
; SSE2-NEXT: psrad $8, %xmm1
; SSE2-NEXT: pslld $8, %xmm2
; SSE2-NEXT: psrad $8, %xmm2
; SSE2-NEXT: psubd %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: pslld $8, %xmm0
; SSE2-NEXT: psrad $8, %xmm0
; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: movd %xmm2, %eax
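; NOTE (reader annotation): from here the <4 x i24> result is stored
; scalarized; each 24-bit lane is written as a 16-bit low half (offsets
; 0, 3, 6, 9) plus its high byte after a shrl $16 (offsets 2, 5, 8, 11).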
; SSE2-NEXT: movw %ax, (%rdi)
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3]
; SSE2-NEXT: movd %xmm1, %ecx
; SSE2-NEXT: movw %cx, 9(%rdi)
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSE2-NEXT: movd %xmm1, %edx
; SSE2-NEXT: movw %dx, 6(%rdi)
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
; SSE2-NEXT: movd %xmm1, %esi
; SSE2-NEXT: movw %si, 3(%rdi)
; SSE2-NEXT: shrl $16, %eax
; SSE2-NEXT: movb %al, 2(%rdi)
; SSE2-NEXT: shrl $16, %ecx
; SSE2-NEXT: movb %cl, 11(%rdi)
; SSE2-NEXT: shrl $16, %edx
; SSE2-NEXT: movb %dl, 8(%rdi)
; SSE2-NEXT: shrl $16, %esi
; SSE2-NEXT: movb %sil, 5(%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: ssubo_v4i24:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pslld $8, %xmm1
; SSSE3-NEXT: psrad $8, %xmm1
; SSSE3-NEXT: pslld $8, %xmm2
; SSSE3-NEXT: psrad $8, %xmm2
; SSSE3-NEXT: psubd %xmm1, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: pslld $8, %xmm0
; SSSE3-NEXT: psrad $8, %xmm0
; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0
; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
; SSSE3-NEXT: pxor %xmm1, %xmm0
; SSSE3-NEXT: movd %xmm2, %eax
; SSSE3-NEXT: movw %ax, (%rdi)
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3]
; SSSE3-NEXT: movd %xmm1, %ecx
; SSSE3-NEXT: movw %cx, 9(%rdi)
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSSE3-NEXT: movd %xmm1, %edx
; SSSE3-NEXT: movw %dx, 6(%rdi)
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
; SSSE3-NEXT: movd %xmm1, %esi
; SSSE3-NEXT: movw %si, 3(%rdi)
; SSSE3-NEXT: shrl $16, %eax
; SSSE3-NEXT: movb %al, 2(%rdi)
; SSSE3-NEXT: shrl $16, %ecx
; SSSE3-NEXT: movb %cl, 11(%rdi)
; SSSE3-NEXT: shrl $16, %edx
; SSSE3-NEXT: movb %dl, 8(%rdi)
; SSSE3-NEXT: shrl $16, %esi
; SSSE3-NEXT: movb %sil, 5(%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: ssubo_v4i24:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pslld $8, %xmm1
; SSE41-NEXT: psrad $8, %xmm1
; SSE41-NEXT: pslld $8, %xmm2
; SSE41-NEXT: psrad $8, %xmm2
; SSE41-NEXT: psubd %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pslld $8, %xmm0
; SSE41-NEXT: psrad $8, %xmm0
; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
; SSE41-NEXT: pxor %xmm1, %xmm0
; SSE41-NEXT: pextrd $3, %xmm2, %eax
; SSE41-NEXT: movw %ax, 9(%rdi)
; SSE41-NEXT: pextrd $2, %xmm2, %ecx
; SSE41-NEXT: movw %cx, 6(%rdi)
; SSE41-NEXT: pextrd $1, %xmm2, %edx
; SSE41-NEXT: movw %dx, 3(%rdi)
; SSE41-NEXT: movd %xmm2, %esi
; SSE41-NEXT: movw %si, (%rdi)
; SSE41-NEXT: shrl $16, %eax
; SSE41-NEXT: movb %al, 11(%rdi)
; SSE41-NEXT: shrl $16, %ecx
; SSE41-NEXT: movb %cl, 8(%rdi)
; SSE41-NEXT: shrl $16, %edx
; SSE41-NEXT: movb %dl, 5(%rdi)
; SSE41-NEXT: shrl $16, %esi
; SSE41-NEXT: movb %sil, 2(%rdi)
; SSE41-NEXT: retq
;
; AVX1-LABEL: ssubo_v4i24:
; AVX1: # %bb.0:
; AVX1-NEXT: vpslld $8, %xmm1, %xmm1
; AVX1-NEXT: vpsrad $8, %xmm1, %xmm1
; AVX1-NEXT: vpslld $8, %xmm0, %xmm0
; AVX1-NEXT: vpsrad $8, %xmm0, %xmm0
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpslld $8, %xmm1, %xmm0
; AVX1-NEXT: vpsrad $8, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpextrd $3, %xmm1, %eax
; AVX1-NEXT: movw %ax, 9(%rdi)
; AVX1-NEXT: vpextrd $2, %xmm1, %ecx
; AVX1-NEXT: movw %cx, 6(%rdi)
; AVX1-NEXT: vpextrd $1, %xmm1, %edx
; AVX1-NEXT: movw %dx, 3(%rdi)
; AVX1-NEXT: vmovd %xmm1, %esi
; AVX1-NEXT: movw %si, (%rdi)
; AVX1-NEXT: shrl $16, %eax
; AVX1-NEXT: movb %al, 11(%rdi)
; AVX1-NEXT: shrl $16, %ecx
; AVX1-NEXT: movb %cl, 8(%rdi)
; AVX1-NEXT: shrl $16, %edx
; AVX1-NEXT: movb %dl, 5(%rdi)
; AVX1-NEXT: shrl $16, %esi
; AVX1-NEXT: movb %sil, 2(%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: ssubo_v4i24:
; AVX2: # %bb.0:
; AVX2-NEXT: vpslld $8, %xmm1, %xmm1
; AVX2-NEXT: vpsrad $8, %xmm1, %xmm1
; AVX2-NEXT: vpslld $8, %xmm0, %xmm0
; AVX2-NEXT: vpsrad $8, %xmm0, %xmm0
; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpslld $8, %xmm1, %xmm0
; AVX2-NEXT: vpsrad $8, %xmm0, %xmm0
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpextrd $3, %xmm1, %eax
; AVX2-NEXT: movw %ax, 9(%rdi)
; AVX2-NEXT: vpextrd $2, %xmm1, %ecx
; AVX2-NEXT: movw %cx, 6(%rdi)
; AVX2-NEXT: vpextrd $1, %xmm1, %edx
; AVX2-NEXT: movw %dx, 3(%rdi)
; AVX2-NEXT: vmovd %xmm1, %esi
; AVX2-NEXT: movw %si, (%rdi)
; AVX2-NEXT: shrl $16, %eax
; AVX2-NEXT: movb %al, 11(%rdi)
; AVX2-NEXT: shrl $16, %ecx
; AVX2-NEXT: movb %cl, 8(%rdi)
; AVX2-NEXT: shrl $16, %edx
; AVX2-NEXT: movb %dl, 5(%rdi)
; AVX2-NEXT: shrl $16, %esi
; AVX2-NEXT: movb %sil, 2(%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: ssubo_v4i24:
; AVX512: # %bb.0:
; AVX512-NEXT: vpslld $8, %xmm1, %xmm1
; AVX512-NEXT: vpsrad $8, %xmm1, %xmm1
; AVX512-NEXT: vpslld $8, %xmm0, %xmm0
; AVX512-NEXT: vpsrad $8, %xmm0, %xmm0
; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpslld $8, %xmm1, %xmm0
; AVX512-NEXT: vpsrad $8, %xmm0, %xmm0
; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vpextrd $3, %xmm1, %eax
; AVX512-NEXT: movw %ax, 9(%rdi)
; AVX512-NEXT: vpextrd $2, %xmm1, %ecx
; AVX512-NEXT: movw %cx, 6(%rdi)
; AVX512-NEXT: vpextrd $1, %xmm1, %edx
; AVX512-NEXT: movw %dx, 3(%rdi)
; AVX512-NEXT: vmovd %xmm1, %esi
; AVX512-NEXT: movw %si, (%rdi)
; AVX512-NEXT: shrl $16, %eax
; AVX512-NEXT: movb %al, 11(%rdi)
; AVX512-NEXT: shrl $16, %ecx
; AVX512-NEXT: movb %cl, 8(%rdi)
; AVX512-NEXT: shrl $16, %edx
; AVX512-NEXT: movb %dl, 5(%rdi)
; AVX512-NEXT: shrl $16, %esi
; AVX512-NEXT: movb %sil, 2(%rdi)
; AVX512-NEXT: retq
  %t = call {<4 x i24>, <4 x i1>} @llvm.ssub.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1)
  %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0
  %obit = extractvalue {<4 x i24>, <4 x i1>} %t, 1
  %res = sext <4 x i1> %obit to <4 x i32>
  store <4 x i24> %val, <4 x i24>* %p2
  ret <4 x i32> %res
}

define <4 x i32> @ssubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind {
; SSE-LABEL: ssubo_v4i1:
; SSE: # %bb.0:
; SSE-NEXT: pslld $31, %xmm1
; SSE-NEXT: psrad $31, %xmm1
; SSE-NEXT: pslld $31, %xmm0
; SSE-NEXT: psrad $31, %xmm0
; SSE-NEXT: psubd %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: pslld $31, %xmm1
; SSE-NEXT: movmskps %xmm1, %eax
; SSE-NEXT: psrad $31, %xmm1
; SSE-NEXT: pcmpeqd %xmm0, %xmm1
; SSE-NEXT: pcmpeqd %xmm0, %xmm0
; SSE-NEXT: pxor %xmm0, %xmm1
; SSE-NEXT: movb %al, (%rdi)
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: ssubo_v4i1:
; AVX1: # %bb.0:
; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpslld $31, %xmm0, %xmm1
; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmovmskps %xmm1, %eax
; AVX1-NEXT: movb %al, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: ssubo_v4i1:
; AVX2: # %bb.0:
; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpslld $31, %xmm0, %xmm1
; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2
; AVX2-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vmovmskps %xmm1, %eax
; AVX2-NEXT: movb %al, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: ssubo_v4i1:
; AVX512: # %bb.0:
; AVX512-NEXT: vpslld $31, %xmm1, %xmm1
; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k0
; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1
; AVX512-NEXT: vptestnmd %xmm1, %xmm1, %k2 {%k1}
; AVX512-NEXT: kxorw %k0, %k1, %k0
; AVX512-NEXT: kxorw %k2, %k0, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: kshiftlw $12, %k0, %k0
; AVX512-NEXT: kshiftrw $12, %k0, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, (%rdi)
; AVX512-NEXT: retq
  %t = call {<4 x i1>, <4 x i1>} @llvm.ssub.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
  %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0
  %obit = extractvalue {<4 x i1>, <4 x i1>} %t, 1
  %res = sext <4 x i1> %obit to <4 x i32>
  store <4 x i1> %val, <4 x i1>* %p2
  ret <4 x i32> %res
}

define <2 x i32> @ssubo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) nounwind {
; SSE2-LABEL: ssubo_v2i128:
; SSE2: # %bb.0:
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSE2-NEXT: subq %r8, %rdi
; SSE2-NEXT: sbbq %r9, %rsi
; SSE2-NEXT: seto %r8b
; SSE2-NEXT: subq {{[0-9]+}}(%rsp), %rdx
; SSE2-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
; SSE2-NEXT: seto %al
; SSE2-NEXT: movzbl %al, %eax
; SSE2-NEXT: negl %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: movzbl %r8b, %eax
; SSE2-NEXT: negl %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: movq %rdx, 16(%r10)
; SSE2-NEXT: movq %rdi, (%r10)
; SSE2-NEXT: movq %rcx, 24(%r10)
; SSE2-NEXT: movq %rsi, 8(%r10)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: ssubo_v2i128:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSSE3-NEXT: subq %r8, %rdi
; SSSE3-NEXT: sbbq %r9, %rsi
; SSSE3-NEXT: seto %r8b
; SSSE3-NEXT: subq {{[0-9]+}}(%rsp), %rdx
; SSSE3-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
; SSSE3-NEXT: seto %al
; SSSE3-NEXT: movzbl %al, %eax
; SSSE3-NEXT: negl %eax
; SSSE3-NEXT: movd %eax, %xmm1
; SSSE3-NEXT: movzbl %r8b, %eax
; SSSE3-NEXT: negl %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: movq %rdx, 16(%r10)
; SSSE3-NEXT: movq %rdi, (%r10)
; SSSE3-NEXT: movq %rcx, 24(%r10)
; SSSE3-NEXT: movq %rsi, 8(%r10)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: ssubo_v2i128:
; SSE41: # %bb.0:
; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSE41-NEXT: subq %r8, %rdi
; SSE41-NEXT: sbbq %r9, %rsi
; SSE41-NEXT: seto %r8b
; SSE41-NEXT: subq {{[0-9]+}}(%rsp), %rdx
; SSE41-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
; SSE41-NEXT: seto %al
; SSE41-NEXT: movzbl %al, %r9d
; SSE41-NEXT: negl %r9d
; SSE41-NEXT: movzbl %r8b, %eax
; SSE41-NEXT: negl %eax
; SSE41-NEXT: movd %eax, %xmm0
; SSE41-NEXT: pinsrd $1, %r9d, %xmm0
; SSE41-NEXT: movq %rdx, 16(%r10)
; SSE41-NEXT: movq %rdi, (%r10)
; SSE41-NEXT: movq %rcx, 24(%r10)
; SSE41-NEXT: movq %rsi, 8(%r10)
; SSE41-NEXT: retq
;
; AVX1-LABEL: ssubo_v2i128:
; AVX1: # %bb.0:
; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX1-NEXT: subq %r8, %rdi
; AVX1-NEXT: sbbq %r9, %rsi
; AVX1-NEXT: seto %r8b
; AVX1-NEXT: subq {{[0-9]+}}(%rsp), %rdx
; AVX1-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
; AVX1-NEXT: seto %al
; AVX1-NEXT: movzbl %al, %r9d
; AVX1-NEXT: negl %r9d
; AVX1-NEXT: movzbl %r8b, %eax
; AVX1-NEXT: negl %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpinsrd $1, %r9d, %xmm0, %xmm0
; AVX1-NEXT: movq %rdx, 16(%r10)
; AVX1-NEXT: movq %rdi, (%r10)
; AVX1-NEXT: movq %rcx, 24(%r10)
; AVX1-NEXT: movq %rsi, 8(%r10)
; AVX1-NEXT: retq
;
; AVX2-LABEL: ssubo_v2i128:
; AVX2: # %bb.0:
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-NEXT: subq %r8, %rdi
; AVX2-NEXT: sbbq %r9, %rsi
; AVX2-NEXT: seto %r8b
; AVX2-NEXT: subq {{[0-9]+}}(%rsp), %rdx
; AVX2-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
; AVX2-NEXT: seto %al
; AVX2-NEXT: movzbl %al, %r9d
; AVX2-NEXT: negl %r9d
; AVX2-NEXT: movzbl %r8b, %eax
; AVX2-NEXT: negl %eax
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vpinsrd $1, %r9d, %xmm0, %xmm0
; AVX2-NEXT: movq %rdx, 16(%r10)
; AVX2-NEXT: movq %rdi, (%r10)
; AVX2-NEXT: movq %rcx, 24(%r10)
; AVX2-NEXT: movq %rsi, 8(%r10)
; AVX2-NEXT: retq
;
; AVX512-LABEL: ssubo_v2i128:
; AVX512: # %bb.0:
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT: subq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT: seto %al
; AVX512-NEXT: kmovd %eax, %k0
; AVX512-NEXT: subq %r8, %rdi
; AVX512-NEXT: sbbq %r9, %rsi
; AVX512-NEXT: seto %al
; AVX512-NEXT: andl $1, %eax
; AVX512-NEXT: kmovw %eax, %k1
; AVX512-NEXT: kshiftlw $1, %k0, %k0
; AVX512-NEXT: korw %k0, %k1, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: movq %rdx, 16(%r10)
; AVX512-NEXT: movq %rdi, (%r10)
; AVX512-NEXT: movq %rcx, 24(%r10)
; AVX512-NEXT: movq %rsi, 8(%r10)
; AVX512-NEXT: retq
  %t = call {<2 x i128>, <2 x i1>} @llvm.ssub.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
  %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0
  %obit = extractvalue {<2 x i128>, <2 x i1>} %t, 1
  %res = sext <2 x i1> %obit to <2 x i32>
  store <2 x i128> %val, <2 x i128>* %p2
  ret <2 x i32> %res
}
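
; NOTE (reader annotation, not from the update script): several distinct
; overflow checks appear above. i8/i16 elements compare the wrapping
; difference against the saturating one (psubb vs. psubsb, psubw vs. psubsw):
; overflow iff they differ. i32/i64 elements use the identity
;   overflow = ((y > 0) XOR (x > x - y))
; whose two sides disagree only when the subtraction wraps. Illegal element
; types are promoted (i24 and i1 to i32, via pslld/psrad sign-extension) and
; overflow is whether the result still sign-extends from the narrow type,
; while i128 is expanded to scalar subq/sbbq pairs plus seto.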