; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512

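; Tests codegen of the vector llvm.ssub.with.overflow intrinsics at each
; SSE/AVX feature level: every function stores the subtraction result
; through the pointer operand and returns the overflow bits sign-extended
; to an i32 mask vector.
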
declare {<1 x i32>, <1 x i1>} @llvm.ssub.with.overflow.v1i32(<1 x i32>, <1 x i32>)
declare {<2 x i32>, <2 x i1>} @llvm.ssub.with.overflow.v2i32(<2 x i32>, <2 x i32>)
declare {<3 x i32>, <3 x i1>} @llvm.ssub.with.overflow.v3i32(<3 x i32>, <3 x i32>)
declare {<4 x i32>, <4 x i1>} @llvm.ssub.with.overflow.v4i32(<4 x i32>, <4 x i32>)
declare {<6 x i32>, <6 x i1>} @llvm.ssub.with.overflow.v6i32(<6 x i32>, <6 x i32>)
declare {<8 x i32>, <8 x i1>} @llvm.ssub.with.overflow.v8i32(<8 x i32>, <8 x i32>)
declare {<16 x i32>, <16 x i1>} @llvm.ssub.with.overflow.v16i32(<16 x i32>, <16 x i32>)

declare {<16 x i8>, <16 x i1>} @llvm.ssub.with.overflow.v16i8(<16 x i8>, <16 x i8>)
declare {<8 x i16>, <8 x i1>} @llvm.ssub.with.overflow.v8i16(<8 x i16>, <8 x i16>)
declare {<2 x i64>, <2 x i1>} @llvm.ssub.with.overflow.v2i64(<2 x i64>, <2 x i64>)

declare {<4 x i24>, <4 x i1>} @llvm.ssub.with.overflow.v4i24(<4 x i24>, <4 x i24>)
declare {<4 x i1>, <4 x i1>} @llvm.ssub.with.overflow.v4i1(<4 x i1>, <4 x i1>)
declare {<2 x i128>, <2 x i1>} @llvm.ssub.with.overflow.v2i128(<2 x i128>, <2 x i128>)

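; For legal i32 element types, signed overflow of %a0 - %a1 is computed as
; (%a1 > 0) ^ (%a0 > result): subtracting a positive value must shrink the
; result and subtracting a non-positive one must not, hence the paired
; pcmpgtd feeding a pxor in the checks below.
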
define <1 x i32> @ssubo_v1i32(<1 x i32> %a0, <1 x i32> %a1, <1 x i32>* %p2) nounwind {
; SSE-LABEL: ssubo_v1i32:
; SSE:       # %bb.0:
; SSE-NEXT:    xorl %eax, %eax
; SSE-NEXT:    subl %esi, %edi
; SSE-NEXT:    seto %al
; SSE-NEXT:    negl %eax
; SSE-NEXT:    movl %edi, (%rdx)
; SSE-NEXT:    retq
;
; AVX-LABEL: ssubo_v1i32:
; AVX:       # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    subl %esi, %edi
; AVX-NEXT:    seto %al
; AVX-NEXT:    negl %eax
; AVX-NEXT:    movl %edi, (%rdx)
; AVX-NEXT:    retq
  %t = call {<1 x i32>, <1 x i1>} @llvm.ssub.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1)
  %val = extractvalue {<1 x i32>, <1 x i1>} %t, 0
  %obit = extractvalue {<1 x i32>, <1 x i1>} %t, 1
  %res = sext <1 x i1> %obit to <1 x i32>
  store <1 x i32> %val, <1 x i32>* %p2
  ret <1 x i32> %res
}

define <2 x i32> @ssubo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind {
; SSE-LABEL: ssubo_v2i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psubd %xmm1, %xmm3
; SSE-NEXT:    pcmpgtd %xmm2, %xmm1
; SSE-NEXT:    pcmpgtd %xmm3, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    movq %xmm3, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: ssubo_v2i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vmovq %xmm1, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ssubo_v2i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    vmovq %xmm1, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: ssubo_v2i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtd %xmm2, %xmm1, %k0
; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    vmovq %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<2 x i32>, <2 x i1>} @llvm.ssub.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
  %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
  %obit = extractvalue {<2 x i32>, <2 x i1>} %t, 1
  %res = sext <2 x i1> %obit to <2 x i32>
  store <2 x i32> %val, <2 x i32>* %p2
  ret <2 x i32> %res
}

define <3 x i32> @ssubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) nounwind {
; SSE2-LABEL: ssubo_v3i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psubd %xmm1, %xmm3
; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
; SSE2-NEXT:    pcmpgtd %xmm3, %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm0
; SSE2-NEXT:    movq %xmm3, (%rdi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; SSE2-NEXT:    movd %xmm1, 8(%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: ssubo_v3i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pxor %xmm2, %xmm2
; SSSE3-NEXT:    movdqa %xmm0, %xmm3
; SSSE3-NEXT:    psubd %xmm1, %xmm3
; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm1
; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
; SSSE3-NEXT:    pxor %xmm1, %xmm0
; SSSE3-NEXT:    movq %xmm3, (%rdi)
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; SSSE3-NEXT:    movd %xmm1, 8(%rdi)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: ssubo_v3i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psubd %xmm1, %xmm3
; SSE41-NEXT:    pcmpgtd %xmm2, %xmm1
; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
; SSE41-NEXT:    pxor %xmm1, %xmm0
; SSE41-NEXT:    pextrd $2, %xmm3, 8(%rdi)
; SSE41-NEXT:    movq %xmm3, (%rdi)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: ssubo_v3i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
; AVX1-NEXT:    vmovq %xmm1, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ssubo_v3i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
; AVX2-NEXT:    vmovq %xmm1, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: ssubo_v3i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtd %xmm2, %xmm1, %k0
; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
; AVX512-NEXT:    vmovq %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<3 x i32>, <3 x i1>} @llvm.ssub.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1)
  %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0
  %obit = extractvalue {<3 x i32>, <3 x i1>} %t, 1
  %res = sext <3 x i1> %obit to <3 x i32>
  store <3 x i32> %val, <3 x i32>* %p2
  ret <3 x i32> %res
}

define <4 x i32> @ssubo_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>* %p2) nounwind {
; SSE-LABEL: ssubo_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psubd %xmm1, %xmm3
; SSE-NEXT:    pcmpgtd %xmm2, %xmm1
; SSE-NEXT:    pcmpgtd %xmm3, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm3, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: ssubo_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ssubo_v4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm2
; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: ssubo_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtd %xmm2, %xmm1, %k0
; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<4 x i32>, <4 x i1>} @llvm.ssub.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1)
  %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0
  %obit = extractvalue {<4 x i32>, <4 x i1>} %t, 1
  %res = sext <4 x i1> %obit to <4 x i32>
  store <4 x i32> %val, <4 x i32>* %p2
  ret <4 x i32> %res
}

define <6 x i32> @ssubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) nounwind {
; SSE2-LABEL: ssubo_v6i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq %rdi, %rax
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    movd %r8d, %xmm1
; SSE2-NEXT:    movd %ecx, %xmm2
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT:    movd %edx, %xmm1
; SSE2-NEXT:    movd %esi, %xmm3
; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT:    movd %r9d, %xmm1
; SSE2-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    psubd %xmm0, %xmm4
; SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
; SSE2-NEXT:    pxor %xmm5, %xmm5
; SSE2-NEXT:    pcmpgtd %xmm5, %xmm0
; SSE2-NEXT:    pxor %xmm3, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    psubd %xmm2, %xmm3
; SSE2-NEXT:    pcmpgtd %xmm3, %xmm1
; SSE2-NEXT:    pcmpgtd %xmm5, %xmm2
; SSE2-NEXT:    pxor %xmm1, %xmm2
; SSE2-NEXT:    movq %xmm3, 16(%rcx)
; SSE2-NEXT:    movdqa %xmm4, (%rcx)
; SSE2-NEXT:    movq %xmm2, 16(%rdi)
; SSE2-NEXT:    movdqa %xmm0, (%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: ssubo_v6i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movq %rdi, %rax
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    movd %r8d, %xmm1
; SSSE3-NEXT:    movd %ecx, %xmm2
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSSE3-NEXT:    movd %edx, %xmm1
; SSSE3-NEXT:    movd %esi, %xmm3
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSSE3-NEXT:    movd %r9d, %xmm1
; SSSE3-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; SSSE3-NEXT:    movdqa %xmm3, %xmm4
; SSSE3-NEXT:    psubd %xmm0, %xmm4
; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm3
; SSSE3-NEXT:    pxor %xmm5, %xmm5
; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm0
; SSSE3-NEXT:    pxor %xmm3, %xmm0
; SSSE3-NEXT:    movdqa %xmm1, %xmm3
; SSSE3-NEXT:    psubd %xmm2, %xmm3
; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm1
; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm2
; SSSE3-NEXT:    pxor %xmm1, %xmm2
; SSSE3-NEXT:    movq %xmm3, 16(%rcx)
; SSSE3-NEXT:    movdqa %xmm4, (%rcx)
; SSSE3-NEXT:    movq %xmm2, 16(%rdi)
; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: ssubo_v6i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movq %rdi, %rax
; SSE41-NEXT:    movd %esi, %xmm1
; SSE41-NEXT:    pinsrd $1, %edx, %xmm1
; SSE41-NEXT:    pinsrd $2, %ecx, %xmm1
; SSE41-NEXT:    pinsrd $3, %r8d, %xmm1
; SSE41-NEXT:    movd %r9d, %xmm0
; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm0
; SSE41-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm2
; SSE41-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT:    pinsrd $2, {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT:    pinsrd $3, {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; SSE41-NEXT:    movdqa %xmm1, %xmm4
; SSE41-NEXT:    psubd %xmm3, %xmm4
; SSE41-NEXT:    pcmpgtd %xmm4, %xmm1
; SSE41-NEXT:    pxor %xmm5, %xmm5
; SSE41-NEXT:    pcmpgtd %xmm5, %xmm3
; SSE41-NEXT:    pxor %xmm1, %xmm3
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psubd %xmm2, %xmm1
; SSE41-NEXT:    pcmpgtd %xmm5, %xmm2
; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
; SSE41-NEXT:    pxor %xmm2, %xmm0
; SSE41-NEXT:    movq %xmm1, 16(%rcx)
; SSE41-NEXT:    movdqa %xmm4, (%rcx)
; SSE41-NEXT:    movq %xmm0, 16(%rdi)
; SSE41-NEXT:    movdqa %xmm3, (%rdi)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: ssubo_v6i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm2, %xmm4
; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm1, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpsubd %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm4, %xmm4
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT:    vxorps %ymm0, %ymm3, %ymm0
; AVX1-NEXT:    vmovq %xmm2, 16(%rdi)
; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ssubo_v6i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm1, %ymm2
; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vmovq %xmm2, 16(%rdi)
; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: ssubo_v6i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtd %ymm2, %ymm1, %k0
; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
; AVX512-NEXT:    vpcmpgtd %ymm1, %ymm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vmovq %xmm2, 16(%rdi)
; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<6 x i32>, <6 x i1>} @llvm.ssub.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1)
  %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0
  %obit = extractvalue {<6 x i32>, <6 x i1>} %t, 1
  %res = sext <6 x i1> %obit to <6 x i32>
  store <6 x i32> %val, <6 x i32>* %p2
  ret <6 x i32> %res
}

define <8 x i32> @ssubo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) nounwind {
; SSE-LABEL: ssubo_v8i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm4, %xmm4
; SSE-NEXT:    movdqa %xmm0, %xmm5
; SSE-NEXT:    psubd %xmm2, %xmm5
; SSE-NEXT:    pcmpgtd %xmm4, %xmm2
; SSE-NEXT:    pcmpgtd %xmm5, %xmm0
; SSE-NEXT:    pxor %xmm2, %xmm0
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    psubd %xmm3, %xmm2
; SSE-NEXT:    pcmpgtd %xmm4, %xmm3
; SSE-NEXT:    pcmpgtd %xmm2, %xmm1
; SSE-NEXT:    pxor %xmm3, %xmm1
; SSE-NEXT:    movdqa %xmm2, 16(%rdi)
; SSE-NEXT:    movdqa %xmm5, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: ssubo_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm2, %xmm4
; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm1, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpsubd %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm4, %xmm4
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT:    vxorps %ymm0, %ymm3, %ymm0
; AVX1-NEXT:    vmovdqa %xmm2, 16(%rdi)
; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ssubo_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm1, %ymm2
; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vmovdqa %ymm1, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: ssubo_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtd %ymm2, %ymm1, %k0
; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
; AVX512-NEXT:    vpcmpgtd %ymm1, %ymm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa %ymm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<8 x i32>, <8 x i1>} @llvm.ssub.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1)
  %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0
  %obit = extractvalue {<8 x i32>, <8 x i1>} %t, 1
  %res = sext <8 x i1> %obit to <8 x i32>
  store <8 x i32> %val, <8 x i32>* %p2
  ret <8 x i32> %res
}

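; v16i32 is split into four xmm subtractions on SSE/AVX1 and two ymm
; subtractions on AVX2; AVX512 covers the full zmm width and materializes
; the sign-extended mask with vpternlogd under %k1.
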
define <16 x i32> @ssubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) nounwind {
; SSE-LABEL: ssubo_v16i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm9, %xmm9
; SSE-NEXT:    movdqa %xmm0, %xmm8
; SSE-NEXT:    psubd %xmm4, %xmm8
; SSE-NEXT:    pcmpgtd %xmm9, %xmm4
; SSE-NEXT:    pcmpgtd %xmm8, %xmm0
; SSE-NEXT:    pxor %xmm4, %xmm0
; SSE-NEXT:    movdqa %xmm1, %xmm4
; SSE-NEXT:    psubd %xmm5, %xmm4
; SSE-NEXT:    pcmpgtd %xmm9, %xmm5
; SSE-NEXT:    pcmpgtd %xmm4, %xmm1
; SSE-NEXT:    pxor %xmm5, %xmm1
; SSE-NEXT:    movdqa %xmm2, %xmm5
; SSE-NEXT:    psubd %xmm6, %xmm5
; SSE-NEXT:    pcmpgtd %xmm9, %xmm6
; SSE-NEXT:    pcmpgtd %xmm5, %xmm2
; SSE-NEXT:    pxor %xmm6, %xmm2
; SSE-NEXT:    movdqa %xmm3, %xmm6
; SSE-NEXT:    psubd %xmm7, %xmm6
; SSE-NEXT:    pcmpgtd %xmm9, %xmm7
; SSE-NEXT:    pcmpgtd %xmm6, %xmm3
; SSE-NEXT:    pxor %xmm7, %xmm3
; SSE-NEXT:    movdqa %xmm6, 48(%rdi)
; SSE-NEXT:    movdqa %xmm5, 32(%rdi)
; SSE-NEXT:    movdqa %xmm4, 16(%rdi)
; SSE-NEXT:    movdqa %xmm8, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: ssubo_v16i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpcmpgtd %xmm5, %xmm4, %xmm6
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
; AVX1-NEXT:    vpsubd %xmm4, %xmm7, %xmm8
; AVX1-NEXT:    vpcmpgtd %xmm8, %xmm7, %xmm7
; AVX1-NEXT:    vpxor %xmm7, %xmm6, %xmm6
; AVX1-NEXT:    vpcmpgtd %xmm5, %xmm3, %xmm7
; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm3
; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm1, %xmm7, %xmm1
; AVX1-NEXT:    vpackssdw %xmm6, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
; AVX1-NEXT:    vpcmpgtd %xmm5, %xmm6, %xmm7
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpsubd %xmm6, %xmm4, %xmm6
; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm4, %xmm4
; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm4
; AVX1-NEXT:    vpcmpgtd %xmm5, %xmm2, %xmm5
; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpackssdw %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm4, %ymm0
; AVX1-NEXT:    vpacksswb %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm4, %ymm1
; AVX1-NEXT:    vmovdqa %xmm8, 48(%rdi)
; AVX1-NEXT:    vmovdqa %xmm3, 32(%rdi)
; AVX1-NEXT:    vmovdqa %xmm6, 16(%rdi)
; AVX1-NEXT:    vmovdqa %xmm2, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ssubo_v16i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX2-NEXT:    vpcmpgtd %ymm4, %ymm3, %ymm5
; AVX2-NEXT:    vpsubd %ymm3, %ymm1, %ymm3
; AVX2-NEXT:    vpcmpgtd %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpxor %ymm1, %ymm5, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm5
; AVX2-NEXT:    vpackssdw %xmm5, %xmm1, %xmm1
; AVX2-NEXT:    vpcmpgtd %ymm4, %ymm2, %ymm4
; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm2
; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpxor %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
; AVX2-NEXT:    vpackssdw %xmm4, %xmm0, %xmm0
; AVX2-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT:    vpacksswb %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT:    vmovdqa %ymm3, 32(%rdi)
; AVX2-NEXT:    vmovdqa %ymm2, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: ssubo_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtd %zmm2, %zmm1, %k0
; AVX512-NEXT:    vpsubd %zmm1, %zmm0, %zmm1
; AVX512-NEXT:    vpcmpgtd %zmm1, %zmm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa64 %zmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<16 x i32>, <16 x i1>} @llvm.ssub.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1)
  %val = extractvalue {<16 x i32>, <16 x i1>} %t, 0
  %obit = extractvalue {<16 x i32>, <16 x i1>} %t, 1
  %res = sext <16 x i1> %obit to <16 x i32>
  store <16 x i32> %val, <16 x i32>* %p2
  ret <16 x i32> %res
}

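; For i8/i16 elements the lowering compares a saturating subtraction
; (psubsb/psubsw) against the wrapping one (psubb/psubw): lanes where the
; two results disagree have overflowed.
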
define <16 x i32> @ssubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nounwind {
; SSE2-LABEL: ssubo_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psubsb %xmm1, %xmm2
; SSE2-NEXT:    psubb %xmm1, %xmm0
; SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
; SSE2-NEXT:    pxor %xmm2, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    pslld $31, %xmm4
; SSE2-NEXT:    psrad $31, %xmm4
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $31, %xmm1
; SSE2-NEXT:    psrad $31, %xmm1
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    movdqa %xmm3, %xmm2
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    pslld $31, %xmm2
; SSE2-NEXT:    psrad $31, %xmm2
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $31, %xmm3
; SSE2-NEXT:    psrad $31, %xmm3
; SSE2-NEXT:    movdqa %xmm0, (%rdi)
; SSE2-NEXT:    movdqa %xmm4, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: ssubo_v16i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    psubsb %xmm1, %xmm2
; SSSE3-NEXT:    psubb %xmm1, %xmm0
; SSSE3-NEXT:    pcmpeqb %xmm0, %xmm2
; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm3
; SSSE3-NEXT:    pxor %xmm2, %xmm3
; SSSE3-NEXT:    movdqa %xmm3, %xmm1
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    movdqa %xmm1, %xmm4
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    pslld $31, %xmm4
; SSSE3-NEXT:    psrad $31, %xmm4
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pslld $31, %xmm1
; SSSE3-NEXT:    psrad $31, %xmm1
; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSSE3-NEXT:    movdqa %xmm3, %xmm2
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    pslld $31, %xmm2
; SSSE3-NEXT:    psrad $31, %xmm2
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pslld $31, %xmm3
; SSSE3-NEXT:    psrad $31, %xmm3
; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
; SSSE3-NEXT:    movdqa %xmm4, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: ssubo_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psubsb %xmm1, %xmm2
; SSE41-NEXT:    psubb %xmm1, %xmm0
; SSE41-NEXT:    pcmpeqb %xmm0, %xmm2
; SSE41-NEXT:    pcmpeqd %xmm3, %xmm3
; SSE41-NEXT:    pxor %xmm2, %xmm3
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm4
; SSE41-NEXT:    psrad $31, %xmm4
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm1
; SSE41-NEXT:    psrad $31, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm2
; SSE41-NEXT:    psrad $31, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm3
; SSE41-NEXT:    psrad $31, %xmm3
; SSE41-NEXT:    movdqa %xmm0, (%rdi)
; SSE41-NEXT:    movdqa %xmm4, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: ssubo_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsubsb %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm3, %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; AVX1-NEXT:    vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vmovdqa %xmm3, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ssubo_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsubsb %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm3
; AVX2-NEXT:    vpcmpeqb %xmm2, %xmm3, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT:    vmovdqa %xmm3, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: ssubo_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsubsb %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpneqb %xmm2, %xmm1, %k1
; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<16 x i8>, <16 x i1>} @llvm.ssub.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
  %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0
  %obit = extractvalue {<16 x i8>, <16 x i1>} %t, 1
  %res = sext <16 x i1> %obit to <16 x i32>
  store <16 x i8> %val, <16 x i8>* %p2
  ret <16 x i32> %res
}

define <8 x i32> @ssubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) nounwind {
; SSE2-LABEL: ssubo_v8i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psubsw %xmm1, %xmm2
; SSE2-NEXT:    psubw %xmm1, %xmm0
; SSE2-NEXT:    pcmpeqw %xmm0, %xmm2
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    pslld $31, %xmm2
; SSE2-NEXT:    psrad $31, %xmm2
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $31, %xmm1
; SSE2-NEXT:    psrad $31, %xmm1
; SSE2-NEXT:    movdqa %xmm0, (%rdi)
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: ssubo_v8i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    psubsw %xmm1, %xmm2
; SSSE3-NEXT:    psubw %xmm1, %xmm0
; SSSE3-NEXT:    pcmpeqw %xmm0, %xmm2
; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
; SSSE3-NEXT:    pxor %xmm2, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm2
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    pslld $31, %xmm2
; SSSE3-NEXT:    psrad $31, %xmm2
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pslld $31, %xmm1
; SSSE3-NEXT:    psrad $31, %xmm1
; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
; SSSE3-NEXT:    movdqa %xmm2, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: ssubo_v8i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psubsw %xmm1, %xmm2
; SSE41-NEXT:    psubw %xmm1, %xmm0
; SSE41-NEXT:    pcmpeqw %xmm0, %xmm2
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE41-NEXT:    pxor %xmm2, %xmm1
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE41-NEXT:    pslld $31, %xmm2
; SSE41-NEXT:    psrad $31, %xmm2
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE41-NEXT:    pslld $31, %xmm1
; SSE41-NEXT:    psrad $31, %xmm1
; SSE41-NEXT:    movdqa %xmm0, (%rdi)
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: ssubo_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsubsw %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ssubo_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsubsw %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpsubw %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: ssubo_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsubsw %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vpsubw %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpneqw %xmm2, %xmm1, %k1
; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<8 x i16>, <8 x i1>} @llvm.ssub.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1)
  %val = extractvalue {<8 x i16>, <8 x i1>} %t, 0
  %obit = extractvalue {<8 x i16>, <8 x i1>} %t, 1
  %res = sext <8 x i1> %obit to <8 x i32>
  store <8 x i16> %val, <8 x i16>* %p2
  ret <8 x i32> %res
}

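; SSE has no 64-bit signed compare (pcmpgtq), so the v2i64 path biases the
; operands by 0x80000000 and assembles the compare from 32-bit
; pcmpgtd/pcmpeqd pieces plus shuffles; AVX targets use vpcmpgtq directly.
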
define <2 x i32> @ssubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind {
; SSE-LABEL: ssubo_v2i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    pxor %xmm2, %xmm3
; SSE-NEXT:    psubq %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, (%rdi)
; SSE-NEXT:    pxor %xmm2, %xmm0
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    pcmpgtd %xmm0, %xmm4
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE-NEXT:    pcmpeqd %xmm3, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-NEXT:    pand %xmm5, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; SSE-NEXT:    por %xmm0, %xmm3
; SSE-NEXT:    pxor %xmm2, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    pcmpgtd %xmm2, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
; SSE-NEXT:    pcmpeqd %xmm2, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE-NEXT:    pand %xmm4, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-NEXT:    por %xmm1, %xmm0
; SSE-NEXT:    pxor %xmm3, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT:    retq
;
; AVX1-LABEL: ssubo_v2i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ssubo_v2i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpgtq %xmm2, %xmm1, %xmm2
; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: ssubo_v2i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtq %xmm2, %xmm1, %k0
; AVX512-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpgtq %xmm1, %xmm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<2 x i64>, <2 x i1>} @llvm.ssub.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
  %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
  %obit = extractvalue {<2 x i64>, <2 x i1>} %t, 1
  %res = sext <2 x i1> %obit to <2 x i32>
  store <2 x i64> %val, <2 x i64>* %p2
  ret <2 x i32> %res
}

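; The illegal i24 type is promoted to i32: inputs are sign-extended in
; register with a pslld/psrad pair by 8, and overflow is flagged where
; re-extending the 32-bit difference changes its value; the 3-byte lanes
; are stored back as word+byte pairs.
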
define <4 x i32> @ssubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) nounwind {
; SSE2-LABEL: ssubo_v4i24:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pslld $8, %xmm1
; SSE2-NEXT:    psrad $8, %xmm1
; SSE2-NEXT:    pslld $8, %xmm2
; SSE2-NEXT:    psrad $8, %xmm2
; SSE2-NEXT:    psubd %xmm1, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    pslld $8, %xmm0
; SSE2-NEXT:    psrad $8, %xmm0
; SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm2, %eax
; SSE2-NEXT:    movw %ax, (%rdi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3]
; SSE2-NEXT:    movd %xmm1, %ecx
; SSE2-NEXT:    movw %cx, 9(%rdi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSE2-NEXT:    movd %xmm1, %edx
; SSE2-NEXT:    movw %dx, 6(%rdi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
; SSE2-NEXT:    movd %xmm1, %esi
; SSE2-NEXT:    movw %si, 3(%rdi)
; SSE2-NEXT:    shrl $16, %eax
; SSE2-NEXT:    movb %al, 2(%rdi)
; SSE2-NEXT:    shrl $16, %ecx
; SSE2-NEXT:    movb %cl, 11(%rdi)
; SSE2-NEXT:    shrl $16, %edx
; SSE2-NEXT:    movb %dl, 8(%rdi)
; SSE2-NEXT:    shrl $16, %esi
; SSE2-NEXT:    movb %sil, 5(%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: ssubo_v4i24:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pslld $8, %xmm1
; SSSE3-NEXT:    psrad $8, %xmm1
; SSSE3-NEXT:    pslld $8, %xmm2
; SSSE3-NEXT:    psrad $8, %xmm2
; SSSE3-NEXT:    psubd %xmm1, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm0
; SSSE3-NEXT:    pslld $8, %xmm0
; SSSE3-NEXT:    psrad $8, %xmm0
; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm0
; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
; SSSE3-NEXT:    pxor %xmm1, %xmm0
; SSSE3-NEXT:    movd %xmm2, %eax
; SSSE3-NEXT:    movw %ax, (%rdi)
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3]
; SSSE3-NEXT:    movd %xmm1, %ecx
; SSSE3-NEXT:    movw %cx, 9(%rdi)
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSSE3-NEXT:    movd %xmm1, %edx
; SSSE3-NEXT:    movw %dx, 6(%rdi)
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
; SSSE3-NEXT:    movd %xmm1, %esi
; SSSE3-NEXT:    movw %si, 3(%rdi)
; SSSE3-NEXT:    shrl $16, %eax
; SSSE3-NEXT:    movb %al, 2(%rdi)
; SSSE3-NEXT:    shrl $16, %ecx
; SSSE3-NEXT:    movb %cl, 11(%rdi)
; SSSE3-NEXT:    shrl $16, %edx
; SSSE3-NEXT:    movb %dl, 8(%rdi)
; SSSE3-NEXT:    shrl $16, %esi
; SSSE3-NEXT:    movb %sil, 5(%rdi)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: ssubo_v4i24:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pslld $8, %xmm1
; SSE41-NEXT:    psrad $8, %xmm1
; SSE41-NEXT:    pslld $8, %xmm2
; SSE41-NEXT:    psrad $8, %xmm2
; SSE41-NEXT:    psubd %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    pslld $8, %xmm0
; SSE41-NEXT:    psrad $8, %xmm0
; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE41-NEXT:    pxor %xmm1, %xmm0
; SSE41-NEXT:    pextrd $3, %xmm2, %eax
; SSE41-NEXT:    movw %ax, 9(%rdi)
; SSE41-NEXT:    pextrd $2, %xmm2, %ecx
; SSE41-NEXT:    movw %cx, 6(%rdi)
; SSE41-NEXT:    pextrd $1, %xmm2, %edx
; SSE41-NEXT:    movw %dx, 3(%rdi)
; SSE41-NEXT:    movd %xmm2, %esi
; SSE41-NEXT:    movw %si, (%rdi)
; SSE41-NEXT:    shrl $16, %eax
; SSE41-NEXT:    movb %al, 11(%rdi)
; SSE41-NEXT:    shrl $16, %ecx
; SSE41-NEXT:    movb %cl, 8(%rdi)
; SSE41-NEXT:    shrl $16, %edx
; SSE41-NEXT:    movb %dl, 5(%rdi)
; SSE41-NEXT:    shrl $16, %esi
; SSE41-NEXT:    movb %sil, 2(%rdi)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: ssubo_v4i24:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpslld $8, %xmm1, %xmm1
; AVX1-NEXT:    vpsrad $8, %xmm1, %xmm1
; AVX1-NEXT:    vpslld $8, %xmm0, %xmm0
; AVX1-NEXT:    vpsrad $8, %xmm0, %xmm0
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpslld $8, %xmm1, %xmm0
; AVX1-NEXT:    vpsrad $8, %xmm0, %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpextrd $3, %xmm1, %eax
; AVX1-NEXT:    movw %ax, 9(%rdi)
; AVX1-NEXT:    vpextrd $2, %xmm1, %ecx
; AVX1-NEXT:    movw %cx, 6(%rdi)
; AVX1-NEXT:    vpextrd $1, %xmm1, %edx
; AVX1-NEXT:    movw %dx, 3(%rdi)
; AVX1-NEXT:    vmovd %xmm1, %esi
; AVX1-NEXT:    movw %si, (%rdi)
; AVX1-NEXT:    shrl $16, %eax
; AVX1-NEXT:    movb %al, 11(%rdi)
; AVX1-NEXT:    shrl $16, %ecx
; AVX1-NEXT:    movb %cl, 8(%rdi)
; AVX1-NEXT:    shrl $16, %edx
; AVX1-NEXT:    movb %dl, 5(%rdi)
; AVX1-NEXT:    shrl $16, %esi
; AVX1-NEXT:    movb %sil, 2(%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ssubo_v4i24:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpslld $8, %xmm1, %xmm1
; AVX2-NEXT:    vpsrad $8, %xmm1, %xmm1
; AVX2-NEXT:    vpslld $8, %xmm0, %xmm0
; AVX2-NEXT:    vpsrad $8, %xmm0, %xmm0
; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpslld $8, %xmm1, %xmm0
; AVX2-NEXT:    vpsrad $8, %xmm0, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpextrd $3, %xmm1, %eax
; AVX2-NEXT:    movw %ax, 9(%rdi)
; AVX2-NEXT:    vpextrd $2, %xmm1, %ecx
; AVX2-NEXT:    movw %cx, 6(%rdi)
; AVX2-NEXT:    vpextrd $1, %xmm1, %edx
; AVX2-NEXT:    movw %dx, 3(%rdi)
; AVX2-NEXT:    vmovd %xmm1, %esi
; AVX2-NEXT:    movw %si, (%rdi)
; AVX2-NEXT:    shrl $16, %eax
; AVX2-NEXT:    movb %al, 11(%rdi)
; AVX2-NEXT:    shrl $16, %ecx
; AVX2-NEXT:    movb %cl, 8(%rdi)
; AVX2-NEXT:    shrl $16, %edx
; AVX2-NEXT:    movb %dl, 5(%rdi)
; AVX2-NEXT:    shrl $16, %esi
; AVX2-NEXT:    movb %sil, 2(%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: ssubo_v4i24:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpslld $8, %xmm1, %xmm1
; AVX512-NEXT:    vpsrad $8, %xmm1, %xmm1
; AVX512-NEXT:    vpslld $8, %xmm0, %xmm0
; AVX512-NEXT:    vpsrad $8, %xmm0, %xmm0
; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpslld $8, %xmm1, %xmm0
; AVX512-NEXT:    vpsrad $8, %xmm0, %xmm0
; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vpextrd $3, %xmm1, %eax
; AVX512-NEXT:    movw %ax, 9(%rdi)
; AVX512-NEXT:    vpextrd $2, %xmm1, %ecx
; AVX512-NEXT:    movw %cx, 6(%rdi)
; AVX512-NEXT:    vpextrd $1, %xmm1, %edx
; AVX512-NEXT:    movw %dx, 3(%rdi)
; AVX512-NEXT:    vmovd %xmm1, %esi
; AVX512-NEXT:    movw %si, (%rdi)
; AVX512-NEXT:    shrl $16, %eax
; AVX512-NEXT:    movb %al, 11(%rdi)
; AVX512-NEXT:    shrl $16, %ecx
; AVX512-NEXT:    movb %cl, 8(%rdi)
; AVX512-NEXT:    shrl $16, %edx
; AVX512-NEXT:    movb %dl, 5(%rdi)
; AVX512-NEXT:    shrl $16, %esi
; AVX512-NEXT:    movb %sil, 2(%rdi)
; AVX512-NEXT:    retq
  %t = call {<4 x i24>, <4 x i1>} @llvm.ssub.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1)
  %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0
  %obit = extractvalue {<4 x i24>, <4 x i1>} %t, 1
  %res = sext <4 x i1> %obit to <4 x i32>
  store <4 x i24> %val, <4 x i24>* %p2
  ret <4 x i32> %res
}

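; v4i1 uses the same promote-and-recheck scheme with a shift amount of 31;
; AVX512 instead performs the logic in k-registers and extracts the packed
; result via kmovd.
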
define <4 x i32> @ssubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind {
; SSE-LABEL: ssubo_v4i1:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $31, %xmm1
; SSE-NEXT:    psrad $31, %xmm1
; SSE-NEXT:    pslld $31, %xmm0
; SSE-NEXT:    psrad $31, %xmm0
; SSE-NEXT:    psubd %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    pslld $31, %xmm1
; SSE-NEXT:    movmskps %xmm1, %eax
; SSE-NEXT:    psrad $31, %xmm1
; SSE-NEXT:    pcmpeqd %xmm0, %xmm1
; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE-NEXT:    pxor %xmm0, %xmm1
; SSE-NEXT:    movb %al, (%rdi)
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: ssubo_v4i1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpslld $31, %xmm0, %xmm1
; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm2
; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vmovmskps %xmm1, %eax
; AVX1-NEXT:    movb %al, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ssubo_v4i1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1
; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpslld $31, %xmm0, %xmm1
; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm2
; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vmovmskps %xmm1, %eax
; AVX2-NEXT:    movb %al, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: ssubo_v4i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX512-NEXT:    vptestmd %xmm1, %xmm1, %k0
; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k1
; AVX512-NEXT:    vptestnmd %xmm1, %xmm1, %k2 {%k1}
; AVX512-NEXT:    kxorw %k0, %k1, %k0
; AVX512-NEXT:    kxorw %k2, %k0, %k1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    kshiftlw $12, %k0, %k0
; AVX512-NEXT:    kshiftrw $12, %k0, %k0
; AVX512-NEXT:    kmovd %k0, %eax
; AVX512-NEXT:    movb %al, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<4 x i1>, <4 x i1>} @llvm.ssub.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
  %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0
  %obit = extractvalue {<4 x i1>, <4 x i1>} %t, 1
  %res = sext <4 x i1> %obit to <4 x i32>
  store <4 x i1> %val, <4 x i1>* %p2
  ret <4 x i32> %res
}

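; v2i128 is fully scalarized: each lane becomes a subq/sbbq pair with seto
; capturing the signed-overflow flag, and the flags are then widened back
; into the returned <2 x i32> mask.
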
define <2 x i32> @ssubo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) nounwind {
; SSE2-LABEL: ssubo_v2i128:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
; SSE2-NEXT:    subq %r8, %rdi
; SSE2-NEXT:    sbbq %r9, %rsi
; SSE2-NEXT:    seto %r8b
; SSE2-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
; SSE2-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
; SSE2-NEXT:    seto %al
; SSE2-NEXT:    movzbl %al, %eax
; SSE2-NEXT:    negl %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    movzbl %r8b, %eax
; SSE2-NEXT:    negl %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    movq %rdx, 16(%r10)
; SSE2-NEXT:    movq %rdi, (%r10)
; SSE2-NEXT:    movq %rcx, 24(%r10)
; SSE2-NEXT:    movq %rsi, 8(%r10)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: ssubo_v2i128:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %r10
; SSSE3-NEXT:    subq %r8, %rdi
; SSSE3-NEXT:    sbbq %r9, %rsi
; SSSE3-NEXT:    seto %r8b
; SSSE3-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
; SSSE3-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
; SSSE3-NEXT:    seto %al
; SSSE3-NEXT:    movzbl %al, %eax
; SSSE3-NEXT:    negl %eax
; SSSE3-NEXT:    movd %eax, %xmm1
; SSSE3-NEXT:    movzbl %r8b, %eax
; SSSE3-NEXT:    negl %eax
; SSSE3-NEXT:    movd %eax, %xmm0
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    movq %rdx, 16(%r10)
; SSSE3-NEXT:    movq %rdi, (%r10)
; SSSE3-NEXT:    movq %rcx, 24(%r10)
; SSSE3-NEXT:    movq %rsi, 8(%r10)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: ssubo_v2i128:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %r10
; SSE41-NEXT:    subq %r8, %rdi
; SSE41-NEXT:    sbbq %r9, %rsi
; SSE41-NEXT:    seto %r8b
; SSE41-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
; SSE41-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
; SSE41-NEXT:    seto %al
; SSE41-NEXT:    movzbl %al, %r9d
; SSE41-NEXT:    negl %r9d
; SSE41-NEXT:    movzbl %r8b, %eax
; SSE41-NEXT:    negl %eax
; SSE41-NEXT:    movd %eax, %xmm0
; SSE41-NEXT:    pinsrd $1, %r9d, %xmm0
; SSE41-NEXT:    movq %rdx, 16(%r10)
; SSE41-NEXT:    movq %rdi, (%r10)
; SSE41-NEXT:    movq %rcx, 24(%r10)
; SSE41-NEXT:    movq %rsi, 8(%r10)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: ssubo_v2i128:
; AVX1:       # %bb.0:
; AVX1-NEXT:    movq {{[0-9]+}}(%rsp), %r10
; AVX1-NEXT:    subq %r8, %rdi
; AVX1-NEXT:    sbbq %r9, %rsi
; AVX1-NEXT:    seto %r8b
; AVX1-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
; AVX1-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
; AVX1-NEXT:    seto %al
; AVX1-NEXT:    movzbl %al, %r9d
; AVX1-NEXT:    negl %r9d
; AVX1-NEXT:    movzbl %r8b, %eax
; AVX1-NEXT:    negl %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vpinsrd $1, %r9d, %xmm0, %xmm0
; AVX1-NEXT:    movq %rdx, 16(%r10)
; AVX1-NEXT:    movq %rdi, (%r10)
; AVX1-NEXT:    movq %rcx, 24(%r10)
; AVX1-NEXT:    movq %rsi, 8(%r10)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ssubo_v2i128:
; AVX2:       # %bb.0:
; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
; AVX2-NEXT:    subq %r8, %rdi
; AVX2-NEXT:    sbbq %r9, %rsi
; AVX2-NEXT:    seto %r8b
; AVX2-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
; AVX2-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
; AVX2-NEXT:    seto %al
; AVX2-NEXT:    movzbl %al, %r9d
; AVX2-NEXT:    negl %r9d
; AVX2-NEXT:    movzbl %r8b, %eax
; AVX2-NEXT:    negl %eax
; AVX2-NEXT:    vmovd %eax, %xmm0
; AVX2-NEXT:    vpinsrd $1, %r9d, %xmm0, %xmm0
; AVX2-NEXT:    movq %rdx, 16(%r10)
; AVX2-NEXT:    movq %rdi, (%r10)
; AVX2-NEXT:    movq %rcx, 24(%r10)
; AVX2-NEXT:    movq %rsi, 8(%r10)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: ssubo_v2i128:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT:    seto %al
; AVX512-NEXT:    kmovd %eax, %k0
; AVX512-NEXT:    subq %r8, %rdi
; AVX512-NEXT:    sbbq %r9, %rsi
; AVX512-NEXT:    seto %al
; AVX512-NEXT:    andl $1, %eax
; AVX512-NEXT:    kmovw %eax, %k1
; AVX512-NEXT:    kshiftlw $1, %k0, %k0
; AVX512-NEXT:    korw %k0, %k1, %k1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    movq %rdx, 16(%r10)
; AVX512-NEXT:    movq %rdi, (%r10)
; AVX512-NEXT:    movq %rcx, 24(%r10)
; AVX512-NEXT:    movq %rsi, 8(%r10)
; AVX512-NEXT:    retq
  %t = call {<2 x i128>, <2 x i1>} @llvm.ssub.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
  %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0
  %obit = extractvalue {<2 x i128>, <2 x i1>} %t, 1
  %res = sext <2 x i1> %obit to <2 x i32>
  store <2 x i128> %val, <2 x i128>* %p2
  ret <2 x i32> %res
}
