1; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
2; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
3; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
4; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
5
; usub.sat(x, 0x8000) for <8 x i16>, written as compare+xor+select:
; (x <s 0) holds exactly when the sign bit is set, i.e. x u>= 0x8000, and in
; that case (x ^ 0x8000) == (x - 0x8000); otherwise the result is 0. The whole
; pattern is a saturating unsigned subtract of 32768 and should select to a
; single (v)psubusw with a constant-pool operand on both SSE and AVX.
define void @test1(i16* nocapture %head) nounwind {
; SSE-LABEL: test1:
; SSE:       ## BB#0: ## %vector.ph
; SSE-NEXT:    movdqu (%rdi), %xmm0
; SSE-NEXT:    psubusw {{.*}}(%rip), %xmm0
; SSE-NEXT:    movdqu %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test1:
; AVX:       ## BB#0: ## %vector.ph
; AVX-NEXT:    vmovdqu (%rdi), %xmm0
; AVX-NEXT:    vpsubusw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX-NEXT:    retq
vector.ph:
  %0 = getelementptr inbounds i16, i16* %head, i64 0
  %1 = bitcast i16* %0 to <8 x i16>*
  %2 = load <8 x i16>, <8 x i16>* %1, align 2
  %3 = icmp slt <8 x i16> %2, zeroinitializer
  %4 = xor <8 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
  %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer
  store <8 x i16> %5, <8 x i16>* %1, align 2
  ret void
}
30
; usub.sat(x, 32767) for <8 x i16>, written as compare+add+select:
; x u> 32766 ? x + (-32767) : 0, i.e. x u>= 32767 ? x - 32767 : 0.
; Should select to a single (v)psubusw with a constant-pool operand.
define void @test2(i16* nocapture %head) nounwind {
; SSE-LABEL: test2:
; SSE:       ## BB#0: ## %vector.ph
; SSE-NEXT:    movdqu (%rdi), %xmm0
; SSE-NEXT:    psubusw {{.*}}(%rip), %xmm0
; SSE-NEXT:    movdqu %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test2:
; AVX:       ## BB#0: ## %vector.ph
; AVX-NEXT:    vmovdqu (%rdi), %xmm0
; AVX-NEXT:    vpsubusw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX-NEXT:    retq
vector.ph:
  %0 = getelementptr inbounds i16, i16* %head, i64 0
  %1 = bitcast i16* %0 to <8 x i16>*
  %2 = load <8 x i16>, <8 x i16>* %1, align 2
  %3 = icmp ugt <8 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
  %4 = add <8 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
  %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer
  store <8 x i16> %5, <8 x i16>* %1, align 2
  ret void
}
55
; usub.sat(x, w) for <8 x i16> with a variable splat subtrahend:
; x u< w ? 0 : x - w. The splat of %w must first be materialized
; (SSE2: pshufd/pshuflw/pshufhw; SSSE3/AVX1: pshufb; AVX2: vpbroadcastw),
; after which the select pattern should collapse to a single (v)psubusw.
define void @test3(i16* nocapture %head, i16 zeroext %w) nounwind {
; SSE2-LABEL: test3:
; SSE2:       ## BB#0: ## %vector.ph
; SSE2-NEXT:    movd %esi, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT:    movdqu (%rdi), %xmm1
; SSE2-NEXT:    psubusw %xmm0, %xmm1
; SSE2-NEXT:    movdqu %xmm1, (%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test3:
; SSSE3:       ## BB#0: ## %vector.ph
; SSSE3-NEXT:    movd %esi, %xmm0
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; SSSE3-NEXT:    movdqu (%rdi), %xmm1
; SSSE3-NEXT:    psubusw %xmm0, %xmm1
; SSSE3-NEXT:    movdqu %xmm1, (%rdi)
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test3:
; AVX1:       ## BB#0: ## %vector.ph
; AVX1-NEXT:    vmovd %esi, %xmm0
; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT:    vmovdqu (%rdi), %xmm1
; AVX1-NEXT:    vpsubusw %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test3:
; AVX2:       ## BB#0: ## %vector.ph
; AVX2-NEXT:    vmovd %esi, %xmm0
; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
; AVX2-NEXT:    vmovdqu (%rdi), %xmm1
; AVX2-NEXT:    vpsubusw %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX2-NEXT:    retq
vector.ph:
  %0 = insertelement <8 x i16> undef, i16 %w, i32 0
  %broadcast15 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
  %1 = getelementptr inbounds i16, i16* %head, i64 0
  %2 = bitcast i16* %1 to <8 x i16>*
  %3 = load <8 x i16>, <8 x i16>* %2, align 2
  %4 = icmp ult <8 x i16> %3, %broadcast15
  %5 = sub <8 x i16> %3, %broadcast15
  %6 = select <8 x i1> %4, <8 x i16> zeroinitializer, <8 x i16> %5
  store <8 x i16> %6, <8 x i16>* %2, align 2
  ret void
}
106
; Byte variant of test1: usub.sat(x, 0x80) for <16 x i8>.
; (x <s 0) <=> x u>= 0x80, and then (x ^ 0x80) == (x - 0x80).
; Should select to a single (v)psubusb with a constant-pool operand.
define void @test4(i8* nocapture %head) nounwind {
; SSE-LABEL: test4:
; SSE:       ## BB#0: ## %vector.ph
; SSE-NEXT:    movdqu (%rdi), %xmm0
; SSE-NEXT:    psubusb {{.*}}(%rip), %xmm0
; SSE-NEXT:    movdqu %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test4:
; AVX:       ## BB#0: ## %vector.ph
; AVX-NEXT:    vmovdqu (%rdi), %xmm0
; AVX-NEXT:    vpsubusb {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX-NEXT:    retq
vector.ph:
  %0 = getelementptr inbounds i8, i8* %head, i64 0
  %1 = bitcast i8* %0 to <16 x i8>*
  %2 = load <16 x i8>, <16 x i8>* %1, align 1
  %3 = icmp slt <16 x i8> %2, zeroinitializer
  %4 = xor <16 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
  %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer
  store <16 x i8> %5, <16 x i8>* %1, align 1
  ret void
}
131
; Byte variant of test2: usub.sat(x, 127) for <16 x i8>.
; x u> 126 ? x + (-127) : 0, i.e. x u>= 127 ? x - 127 : 0.
; Should select to a single (v)psubusb with a constant-pool operand.
define void @test5(i8* nocapture %head) nounwind {
; SSE-LABEL: test5:
; SSE:       ## BB#0: ## %vector.ph
; SSE-NEXT:    movdqu (%rdi), %xmm0
; SSE-NEXT:    psubusb {{.*}}(%rip), %xmm0
; SSE-NEXT:    movdqu %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test5:
; AVX:       ## BB#0: ## %vector.ph
; AVX-NEXT:    vmovdqu (%rdi), %xmm0
; AVX-NEXT:    vpsubusb {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX-NEXT:    retq
vector.ph:
  %0 = getelementptr inbounds i8, i8* %head, i64 0
  %1 = bitcast i8* %0 to <16 x i8>*
  %2 = load <16 x i8>, <16 x i8>* %1, align 1
  %3 = icmp ugt <16 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
  %4 = add <16 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
  %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer
  store <16 x i8> %5, <16 x i8>* %1, align 1
  ret void
}
156
; Byte variant of test3: usub.sat(x, w) for <16 x i8> with a variable splat.
; x u< w ? 0 : x - w. The byte splat is built per-subtarget
; (SSE2: punpcklbw+shuffles; SSSE3/AVX1: pshufb with a zero mask;
; AVX2: vpbroadcastb), then a single (v)psubusb performs the saturating sub.
define void @test6(i8* nocapture %head, i8 zeroext %w) nounwind {
; SSE2-LABEL: test6:
; SSE2:       ## BB#0: ## %vector.ph
; SSE2-NEXT:    movd %esi, %xmm0
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT:    movdqu (%rdi), %xmm1
; SSE2-NEXT:    psubusb %xmm0, %xmm1
; SSE2-NEXT:    movdqu %xmm1, (%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test6:
; SSSE3:       ## BB#0: ## %vector.ph
; SSSE3-NEXT:    movd %esi, %xmm0
; SSSE3-NEXT:    pxor %xmm1, %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm0
; SSSE3-NEXT:    movdqu (%rdi), %xmm1
; SSSE3-NEXT:    psubusb %xmm0, %xmm1
; SSSE3-NEXT:    movdqu %xmm1, (%rdi)
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test6:
; AVX1:       ## BB#0: ## %vector.ph
; AVX1-NEXT:    vmovd %esi, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqu (%rdi), %xmm1
; AVX1-NEXT:    vpsubusb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test6:
; AVX2:       ## BB#0: ## %vector.ph
; AVX2-NEXT:    vmovd %esi, %xmm0
; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT:    vmovdqu (%rdi), %xmm1
; AVX2-NEXT:    vpsubusb %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX2-NEXT:    retq
vector.ph:
  %0 = insertelement <16 x i8> undef, i8 %w, i32 0
  %broadcast15 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
  %1 = getelementptr inbounds i8, i8* %head, i64 0
  %2 = bitcast i8* %1 to <16 x i8>*
  %3 = load <16 x i8>, <16 x i8>* %2, align 1
  %4 = icmp ult <16 x i8> %3, %broadcast15
  %5 = sub <16 x i8> %3, %broadcast15
  %6 = select <16 x i1> %4, <16 x i8> zeroinitializer, <16 x i8> %5
  store <16 x i8> %6, <16 x i8>* %2, align 1
  ret void
}
210
; 256-bit version of test1: usub.sat(x, 0x8000) for <16 x i16>.
; SSE legalizes the v16i16 op by splitting into two psubusw on xmm halves.
; AVX1 (per the CHECK lines below) does NOT form vpsubusw: it expands the
; select into a pcmpgt-based mask, an xor, and an and across ymm halves.
; AVX2 emits a single 256-bit vpsubusw.
define void @test7(i16* nocapture %head) nounwind {
; SSE-LABEL: test7:
; SSE:       ## BB#0: ## %vector.ph
; SSE-NEXT:    movdqu (%rdi), %xmm0
; SSE-NEXT:    movdqu 16(%rdi), %xmm1
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE-NEXT:    psubusw %xmm2, %xmm0
; SSE-NEXT:    psubusw %xmm2, %xmm1
; SSE-NEXT:    movdqu %xmm1, 16(%rdi)
; SSE-NEXT:    movdqu %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: test7:
; AVX1:       ## BB#0: ## %vector.ph
; AVX1-NEXT:    vmovups (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpgtw %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpcmpgtw %xmm0, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vxorps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vandps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test7:
; AVX2:       ## BB#0: ## %vector.ph
; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
; AVX2-NEXT:    vpsubusw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
vector.ph:
  %0 = getelementptr inbounds i16, i16* %head, i64 0
  %1 = bitcast i16* %0 to <16 x i16>*
  %2 = load <16 x i16>, <16 x i16>* %1, align 2
  %3 = icmp slt <16 x i16> %2, zeroinitializer
  %4 = xor <16 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
  %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer
  store <16 x i16> %5, <16 x i16>* %1, align 2
  ret void
}
254
; 256-bit version of test2: usub.sat(x, 32767) for <16 x i16>.
; SSE splits into two psubusw with a shared 32767 splat. AVX1 (per the CHECK
; lines) expands instead of forming vpsubusw: it biases both halves with an
; 0x8000 xor for a signed vpcmpgtw compare, adds 32769 (== -32767 as u16),
; and masks with vandps. AVX2 emits a single 256-bit vpsubusw.
define void @test8(i16* nocapture %head) nounwind {
; SSE-LABEL: test8:
; SSE:       ## BB#0: ## %vector.ph
; SSE-NEXT:    movdqu (%rdi), %xmm0
; SSE-NEXT:    movdqu 16(%rdi), %xmm1
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [32767,32767,32767,32767,32767,32767,32767,32767]
; SSE-NEXT:    psubusw %xmm2, %xmm0
; SSE-NEXT:    psubusw %xmm2, %xmm1
; SSE-NEXT:    movdqu %xmm1, 16(%rdi)
; SSE-NEXT:    movdqu %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: test8:
; AVX1:       ## BB#0: ## %vector.ph
; AVX1-NEXT:    vmovups (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; AVX1-NEXT:    vxorps %xmm2, %xmm1, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [65534,65534,65534,65534,65534,65534,65534,65534]
; AVX1-NEXT:    vpcmpgtw %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vxorps %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpcmpgtw %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [32769,32769,32769,32769,32769,32769,32769,32769]
; AVX1-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpaddw %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vandps %ymm0, %ymm2, %ymm0
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test8:
; AVX2:       ## BB#0: ## %vector.ph
; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
; AVX2-NEXT:    vpsubusw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
vector.ph:
  %0 = getelementptr inbounds i16, i16* %head, i64 0
  %1 = bitcast i16* %0 to <16 x i16>*
  %2 = load <16 x i16>, <16 x i16>* %1, align 2
  %3 = icmp ugt <16 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
  %4 = add <16 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
  %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer
  store <16 x i16> %5, <16 x i16>* %1, align 2
  ret void

}
305
; 256-bit version of test3: usub.sat(x, splat w) for <16 x i16>.
; SSE2/SSSE3 build the word splat and split into two psubusw. AVX1 (per the
; CHECK lines) expands: plain vpsubw on each half, then a vpmaxuw/vpcmpeqw
; mask (x == max(x, w) <=> x u>= w) selects between the difference and zero.
; AVX2 broadcasts w and emits a single 256-bit vpsubusw.
define void @test9(i16* nocapture %head, i16 zeroext %w) nounwind {
; SSE2-LABEL: test9:
; SSE2:       ## BB#0: ## %vector.ph
; SSE2-NEXT:    movd %esi, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT:    movdqu (%rdi), %xmm1
; SSE2-NEXT:    movdqu 16(%rdi), %xmm2
; SSE2-NEXT:    psubusw %xmm0, %xmm1
; SSE2-NEXT:    psubusw %xmm0, %xmm2
; SSE2-NEXT:    movdqu %xmm2, 16(%rdi)
; SSE2-NEXT:    movdqu %xmm1, (%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test9:
; SSSE3:       ## BB#0: ## %vector.ph
; SSSE3-NEXT:    movd %esi, %xmm0
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; SSSE3-NEXT:    movdqu (%rdi), %xmm1
; SSSE3-NEXT:    movdqu 16(%rdi), %xmm2
; SSSE3-NEXT:    psubusw %xmm0, %xmm1
; SSSE3-NEXT:    psubusw %xmm0, %xmm2
; SSSE3-NEXT:    movdqu %xmm2, 16(%rdi)
; SSSE3-NEXT:    movdqu %xmm1, (%rdi)
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test9:
; AVX1:       ## BB#0: ## %vector.ph
; AVX1-NEXT:    vmovups (%rdi), %ymm0
; AVX1-NEXT:    vmovd %esi, %xmm1
; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsubw %xmm1, %xmm2, %xmm3
; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm4
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX1-NEXT:    vpmaxuw %xmm1, %xmm2, %xmm4
; AVX1-NEXT:    vpcmpeqw %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vandps %ymm3, %ymm0, %ymm0
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test9:
; AVX2:       ## BB#0: ## %vector.ph
; AVX2-NEXT:    vmovd %esi, %xmm0
; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
; AVX2-NEXT:    vmovdqu (%rdi), %ymm1
; AVX2-NEXT:    vpsubusw %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
vector.ph:
  %0 = insertelement <16 x i16> undef, i16 %w, i32 0
  %broadcast15 = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> zeroinitializer
  %1 = getelementptr inbounds i16, i16* %head, i64 0
  %2 = bitcast i16* %1 to <16 x i16>*
  %3 = load <16 x i16>, <16 x i16>* %2, align 2
  %4 = icmp ult <16 x i16> %3, %broadcast15
  %5 = sub <16 x i16> %3, %broadcast15
  %6 = select <16 x i1> %4, <16 x i16> zeroinitializer, <16 x i16> %5
  store <16 x i16> %6, <16 x i16>* %2, align 2
  ret void
}
373
; 256-bit version of test4: usub.sat(x, 0x80) for <32 x i8>.
; SSE splits into two psubusb with a shared 0x80 splat. AVX1 (per the CHECK
; lines) expands to a vpcmpgtb sign mask, an xor, and an and on ymm halves
; rather than forming vpsubusb. AVX2 emits a single 256-bit vpsubusb.
define void @test10(i8* nocapture %head) nounwind {
; SSE-LABEL: test10:
; SSE:       ## BB#0: ## %vector.ph
; SSE-NEXT:    movdqu (%rdi), %xmm0
; SSE-NEXT:    movdqu 16(%rdi), %xmm1
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; SSE-NEXT:    psubusb %xmm2, %xmm0
; SSE-NEXT:    psubusb %xmm2, %xmm1
; SSE-NEXT:    movdqu %xmm1, 16(%rdi)
; SSE-NEXT:    movdqu %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: test10:
; AVX1:       ## BB#0: ## %vector.ph
; AVX1-NEXT:    vmovups (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpgtb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vxorps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vandps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test10:
; AVX2:       ## BB#0: ## %vector.ph
; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
; AVX2-NEXT:    vpsubusb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
vector.ph:
  %0 = getelementptr inbounds i8, i8* %head, i64 0
  %1 = bitcast i8* %0 to <32 x i8>*
  %2 = load <32 x i8>, <32 x i8>* %1, align 1
  %3 = icmp slt <32 x i8> %2, zeroinitializer
  %4 = xor <32 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
  %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer
  store <32 x i8> %5, <32 x i8>* %1, align 1
  ret void

}
418
; 256-bit version of test5: usub.sat(x, 127) for <32 x i8>.
; SSE splits into two psubusb with a shared 127 splat. AVX1 (per the CHECK
; lines) expands: biases both halves with an 0x80 xor for a signed vpcmpgtb
; compare, adds 129 (== -127 as u8), and masks with vandps. AVX2 emits a
; single 256-bit vpsubusb.
define void @test11(i8* nocapture %head) nounwind {
; SSE-LABEL: test11:
; SSE:       ## BB#0: ## %vector.ph
; SSE-NEXT:    movdqu (%rdi), %xmm0
; SSE-NEXT:    movdqu 16(%rdi), %xmm1
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; SSE-NEXT:    psubusb %xmm2, %xmm0
; SSE-NEXT:    psubusb %xmm2, %xmm1
; SSE-NEXT:    movdqu %xmm1, 16(%rdi)
; SSE-NEXT:    movdqu %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: test11:
; AVX1:       ## BB#0: ## %vector.ph
; AVX1-NEXT:    vmovups (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX1-NEXT:    vxorps %xmm2, %xmm1, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254]
; AVX1-NEXT:    vpcmpgtb %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vxorps %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpcmpgtb %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129]
; AVX1-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vandps %ymm0, %ymm2, %ymm0
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test11:
; AVX2:       ## BB#0: ## %vector.ph
; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
; AVX2-NEXT:    vpsubusb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
vector.ph:
  %0 = getelementptr inbounds i8, i8* %head, i64 0
  %1 = bitcast i8* %0 to <32 x i8>*
  %2 = load <32 x i8>, <32 x i8>* %1, align 1
  %3 = icmp ugt <32 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
  %4 = add <32 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
  %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer
  store <32 x i8> %5, <32 x i8>* %1, align 1
  ret void
}
468
; 256-bit version of test6: usub.sat(x, splat w) for <32 x i8>.
; SSE2/SSSE3 build the byte splat and split into two psubusb. AVX1 (per the
; CHECK lines) expands: plain vpsubb on each half, then a vpmaxub/vpcmpeqb
; mask (x == max(x, w) <=> x u>= w) selects between the difference and zero.
; AVX2 broadcasts w and emits a single 256-bit vpsubusb.
define void @test12(i8* nocapture %head, i8 zeroext %w) nounwind {
; SSE2-LABEL: test12:
; SSE2:       ## BB#0: ## %vector.ph
; SSE2-NEXT:    movd %esi, %xmm0
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT:    movdqu (%rdi), %xmm1
; SSE2-NEXT:    movdqu 16(%rdi), %xmm2
; SSE2-NEXT:    psubusb %xmm0, %xmm1
; SSE2-NEXT:    psubusb %xmm0, %xmm2
; SSE2-NEXT:    movdqu %xmm2, 16(%rdi)
; SSE2-NEXT:    movdqu %xmm1, (%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test12:
; SSSE3:       ## BB#0: ## %vector.ph
; SSSE3-NEXT:    movd %esi, %xmm0
; SSSE3-NEXT:    pxor %xmm1, %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm0
; SSSE3-NEXT:    movdqu (%rdi), %xmm1
; SSSE3-NEXT:    movdqu 16(%rdi), %xmm2
; SSSE3-NEXT:    psubusb %xmm0, %xmm1
; SSSE3-NEXT:    psubusb %xmm0, %xmm2
; SSSE3-NEXT:    movdqu %xmm2, 16(%rdi)
; SSSE3-NEXT:    movdqu %xmm1, (%rdi)
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test12:
; AVX1:       ## BB#0: ## %vector.ph
; AVX1-NEXT:    vmovups (%rdi), %ymm0
; AVX1-NEXT:    vmovd %esi, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm3
; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm4
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX1-NEXT:    vpmaxub %xmm1, %xmm2, %xmm4
; AVX1-NEXT:    vpcmpeqb %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vandps %ymm3, %ymm0, %ymm0
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test12:
; AVX2:       ## BB#0: ## %vector.ph
; AVX2-NEXT:    vmovd %esi, %xmm0
; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT:    vmovdqu (%rdi), %ymm1
; AVX2-NEXT:    vpsubusb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
vector.ph:
  %0 = insertelement <32 x i8> undef, i8 %w, i32 0
  %broadcast15 = shufflevector <32 x i8> %0, <32 x i8> undef, <32 x i32> zeroinitializer
  %1 = getelementptr inbounds i8, i8* %head, i64 0
  %2 = bitcast i8* %1 to <32 x i8>*
  %3 = load <32 x i8>, <32 x i8>* %2, align 1
  %4 = icmp ult <32 x i8> %3, %broadcast15
  %5 = sub <32 x i8> %3, %broadcast15
  %6 = select <32 x i1> %4, <32 x i8> zeroinitializer, <32 x i8> %5
  store <32 x i8> %6, <32 x i8>* %2, align 1
  ret void
}
539