; RUN: llc -march=x86-64 -mcpu=core2 -mattr=+sse4.1 < %s | FileCheck %s --check-prefix=SSE41
; RUN: llc -march=x86-64 -mcpu=core2 < %s | FileCheck %s --check-prefix=SSE
; RUN: llc -march=x86-64 -mcpu=core-avx2 < %s | FileCheck %s --check-prefix=AVX
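; Tests that vector integer division and remainder by a uniform constant is
; lowered to multiply-by-magic-constant and shift sequences (or scalarized
; where no suitable vector multiply exists) on SSE2, SSE4.1 and AVX2.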

target triple = "x86_64-unknown-unknown"

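; udiv <4 x i32> by 7: expanded to a pmuludq by the magic constant 613566757
; followed by a subtract/shift/add/shift fixup instead of scalar divisions.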
define <4 x i32> @test1(<4 x i32> %a) #0 {
; SSE41-LABEL: test1:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE41-NEXT:    pmuludq %xmm2, %xmm3
; SSE41-NEXT:    pmuludq %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; SSE41-NEXT:    psubd %xmm1, %xmm0
; SSE41-NEXT:    psrld $1, %xmm0
; SSE41-NEXT:    paddd %xmm1, %xmm0
; SSE41-NEXT:    psrld $2, %xmm0
; SSE41-NEXT:    retq
;
; SSE-LABEL: test1:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    pmuludq %xmm1, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm1, %xmm3
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT:    psubd %xmm2, %xmm0
; SSE-NEXT:    psrld $1, %xmm0
; SSE-NEXT:    paddd %xmm2, %xmm0
; SSE-NEXT:    psrld $2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test1:
; AVX:       # BB#0:
; AVX-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrld $1, %xmm0, %xmm0
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrld $2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %div = udiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
  ret <4 x i32> %div
}

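; Same udiv by 7 on <8 x i32>: split across two 128-bit halves on SSE, kept as
; a single 256-bit operation with AVX2.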
define <8 x i32> @test2(<8 x i32> %a) #0 {
; SSE41-LABEL: test2:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE41-NEXT:    pmuludq %xmm3, %xmm4
; SSE41-NEXT:    movdqa %xmm0, %xmm5
; SSE41-NEXT:    pmuludq %xmm2, %xmm5
; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
; SSE41-NEXT:    psubd %xmm5, %xmm0
; SSE41-NEXT:    psrld $1, %xmm0
; SSE41-NEXT:    paddd %xmm5, %xmm0
; SSE41-NEXT:    psrld $2, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE41-NEXT:    pmuludq %xmm3, %xmm4
; SSE41-NEXT:    pmuludq %xmm1, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
; SSE41-NEXT:    psubd %xmm2, %xmm1
; SSE41-NEXT:    psrld $1, %xmm1
; SSE41-NEXT:    paddd %xmm2, %xmm1
; SSE41-NEXT:    psrld $2, %xmm1
; SSE41-NEXT:    retq
;
; SSE-LABEL: test2:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    pmuludq %xmm2, %xmm3
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm4, %xmm5
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
; SSE-NEXT:    psubd %xmm3, %xmm0
; SSE-NEXT:    psrld $1, %xmm0
; SSE-NEXT:    paddd %xmm3, %xmm0
; SSE-NEXT:    psrld $2, %xmm0
; SSE-NEXT:    pmuludq %xmm1, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm4, %xmm3
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT:    psubd %xmm2, %xmm1
; SSE-NEXT:    psrld $1, %xmm1
; SSE-NEXT:    paddd %xmm2, %xmm1
; SSE-NEXT:    psrld $2, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test2:
; AVX:       # BB#0:
; AVX-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
; AVX-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
; AVX-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
; AVX-NEXT:    vpmuludq %ymm2, %ymm3, %ymm2
; AVX-NEXT:    vpmuludq %ymm1, %ymm0, %ymm1
; AVX-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
; AVX-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; AVX-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vpsrld $1, %ymm0, %ymm0
; AVX-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vpsrld $2, %ymm0, %ymm0
; AVX-NEXT:    retq
  %div = udiv <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
  ret <8 x i32> %div
}

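; udiv <8 x i16> by 7: the high half of the widening multiply comes directly
; from pmulhuw, so only the shift/add fixup is needed afterwards.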
define <8 x i16> @test3(<8 x i16> %a) #0 {
; SSE41-LABEL: test3:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
; SSE41-NEXT:    pmulhuw %xmm0, %xmm1
; SSE41-NEXT:    psubw %xmm1, %xmm0
; SSE41-NEXT:    psrlw $1, %xmm0
; SSE41-NEXT:    paddw %xmm1, %xmm0
; SSE41-NEXT:    psrlw $2, %xmm0
; SSE41-NEXT:    retq
;
; SSE-LABEL: test3:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
; SSE-NEXT:    pmulhuw %xmm0, %xmm1
; SSE-NEXT:    psubw %xmm1, %xmm0
; SSE-NEXT:    psrlw $1, %xmm0
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    psrlw $2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test3:
; AVX:       # BB#0:
; AVX-NEXT:    vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
; AVX-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $1, %xmm0, %xmm0
; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %div = udiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <8 x i16> %div
}

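; udiv <16 x i16> by 7: two pmulhuw-based sequences on SSE, one ymm sequence on AVX2.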
define <16 x i16> @test4(<16 x i16> %a) #0 {
; SSE41-LABEL: test4:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [9363,9363,9363,9363,9363,9363,9363,9363]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    pmulhuw %xmm2, %xmm3
; SSE41-NEXT:    psubw %xmm3, %xmm0
; SSE41-NEXT:    psrlw $1, %xmm0
; SSE41-NEXT:    paddw %xmm3, %xmm0
; SSE41-NEXT:    psrlw $2, %xmm0
; SSE41-NEXT:    pmulhuw %xmm1, %xmm2
; SSE41-NEXT:    psubw %xmm2, %xmm1
; SSE41-NEXT:    psrlw $1, %xmm1
; SSE41-NEXT:    paddw %xmm2, %xmm1
; SSE41-NEXT:    psrlw $2, %xmm1
; SSE41-NEXT:    retq
;
; SSE-LABEL: test4:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [9363,9363,9363,9363,9363,9363,9363,9363]
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    pmulhuw %xmm2, %xmm3
; SSE-NEXT:    psubw %xmm3, %xmm0
; SSE-NEXT:    psrlw $1, %xmm0
; SSE-NEXT:    paddw %xmm3, %xmm0
; SSE-NEXT:    psrlw $2, %xmm0
; SSE-NEXT:    pmulhuw %xmm1, %xmm2
; SSE-NEXT:    psubw %xmm2, %xmm1
; SSE-NEXT:    psrlw $1, %xmm1
; SSE-NEXT:    paddw %xmm2, %xmm1
; SSE-NEXT:    psrlw $2, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test4:
; AVX:       # BB#0:
; AVX-NEXT:    vpmulhuw {{.*}}(%rip), %ymm0, %ymm1
; AVX-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vpsrlw $1, %ymm0, %ymm0
; AVX-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vpsrlw $2, %ymm0, %ymm0
; AVX-NEXT:    retq
  %div = udiv <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7>
  ret <16 x i16> %div
}

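; sdiv <8 x i16> by 7: signed high multiply via pmulhw, then add the sign bit
; (psrlw $15) to the arithmetically shifted quotient.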
define <8 x i16> @test5(<8 x i16> %a) #0 {
; SSE41-LABEL: test5:
; SSE41:       # BB#0:
; SSE41-NEXT:    pmulhw {{.*}}(%rip), %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrlw $15, %xmm1
; SSE41-NEXT:    psraw $1, %xmm0
; SSE41-NEXT:    paddw %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; SSE-LABEL: test5:
; SSE:       # BB#0:
; SSE-NEXT:    pmulhw {{.*}}(%rip), %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrlw $15, %xmm1
; SSE-NEXT:    psraw $1, %xmm0
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test5:
; AVX:       # BB#0:
; AVX-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $15, %xmm0, %xmm1
; AVX-NEXT:    vpsraw $1, %xmm0, %xmm0
; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %div = sdiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <8 x i16> %div
}

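; sdiv <16 x i16> by 7: the same pmulhw sequence applied per 128-bit half on SSE.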
define <16 x i16> @test6(<16 x i16> %a) #0 {
; SSE41-LABEL: test6:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725]
; SSE41-NEXT:    pmulhw %xmm2, %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psrlw $15, %xmm3
; SSE41-NEXT:    psraw $1, %xmm0
; SSE41-NEXT:    paddw %xmm3, %xmm0
; SSE41-NEXT:    pmulhw %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psrlw $15, %xmm2
; SSE41-NEXT:    psraw $1, %xmm1
; SSE41-NEXT:    paddw %xmm2, %xmm1
; SSE41-NEXT:    retq
;
; SSE-LABEL: test6:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725]
; SSE-NEXT:    pmulhw %xmm2, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psrlw $15, %xmm3
; SSE-NEXT:    psraw $1, %xmm0
; SSE-NEXT:    paddw %xmm3, %xmm0
; SSE-NEXT:    pmulhw %xmm2, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    psrlw $15, %xmm2
; SSE-NEXT:    psraw $1, %xmm1
; SSE-NEXT:    paddw %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test6:
; AVX:       # BB#0:
; AVX-NEXT:    vpmulhw {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    vpsrlw $15, %ymm0, %ymm1
; AVX-NEXT:    vpsraw $1, %ymm0, %ymm0
; AVX-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %div = sdiv <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7>
  ret <16 x i16> %div
}

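; sdiv <16 x i8> by 7: there is no vector multiply for i8 elements, so the
; division is scalarized element by element.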
define <16 x i8> @test7(<16 x i8> %a) #0 {
; SSE41-LABEL: test7:
; SSE41:       # BB#0:
; SSE41-NEXT:    pextrb $1, %xmm0, %eax
; SSE41-NEXT:    movsbl %al, %eax
; SSE41-NEXT:    imull $-109, %eax, %ecx
; SSE41-NEXT:    shrl $8, %ecx
; SSE41-NEXT:    addb %cl, %al
; SSE41-NEXT:    movb %al, %cl
; SSE41-NEXT:    shrb $7, %cl
; SSE41-NEXT:    sarb $2, %al
; SSE41-NEXT:    addb %cl, %al
; SSE41-NEXT:    movzbl %al, %eax
; SSE41-NEXT:    pextrb $0, %xmm0, %ecx
; SSE41-NEXT:    movsbl %cl, %ecx
; SSE41-NEXT:    imull $-109, %ecx, %edx
; SSE41-NEXT:    shrl $8, %edx
; SSE41-NEXT:    addb %dl, %cl
; SSE41-NEXT:    movb %cl, %dl
; SSE41-NEXT:    shrb $7, %dl
; SSE41-NEXT:    sarb $2, %cl
; SSE41-NEXT:    addb %dl, %cl
; SSE41-NEXT:    movzbl %cl, %ecx
; SSE41-NEXT:    movd %ecx, %xmm1
; SSE41-NEXT:    pinsrb $1, %eax, %xmm1
; SSE41-NEXT:    pextrb $2, %xmm0, %eax
; SSE41-NEXT:    movsbl %al, %eax
; SSE41-NEXT:    imull $-109, %eax, %ecx
; SSE41-NEXT:    shrl $8, %ecx
; SSE41-NEXT:    addb %cl, %al
; SSE41-NEXT:    movb %al, %cl
; SSE41-NEXT:    shrb $7, %cl
; SSE41-NEXT:    sarb $2, %al
; SSE41-NEXT:    addb %cl, %al
; SSE41-NEXT:    movzbl %al, %eax
; SSE41-NEXT:    pinsrb $2, %eax, %xmm1
; SSE41-NEXT:    pextrb $3, %xmm0, %eax
; SSE41-NEXT:    movsbl %al, %eax
; SSE41-NEXT:    imull $-109, %eax, %ecx
; SSE41-NEXT:    shrl $8, %ecx
; SSE41-NEXT:    addb %cl, %al
; SSE41-NEXT:    movb %al, %cl
; SSE41-NEXT:    shrb $7, %cl
; SSE41-NEXT:    sarb $2, %al
; SSE41-NEXT:    addb %cl, %al
; SSE41-NEXT:    movzbl %al, %eax
; SSE41-NEXT:    pinsrb $3, %eax, %xmm1
; SSE41-NEXT:    pextrb $4, %xmm0, %eax
; SSE41-NEXT:    movsbl %al, %eax
; SSE41-NEXT:    imull $-109, %eax, %ecx
; SSE41-NEXT:    shrl $8, %ecx
; SSE41-NEXT:    addb %cl, %al
; SSE41-NEXT:    movb %al, %cl
; SSE41-NEXT:    shrb $7, %cl
; SSE41-NEXT:    sarb $2, %al
; SSE41-NEXT:    addb %cl, %al
; SSE41-NEXT:    movzbl %al, %eax
; SSE41-NEXT:    pinsrb $4, %eax, %xmm1
; SSE41-NEXT:    pextrb $5, %xmm0, %eax
; SSE41-NEXT:    movsbl %al, %eax
; SSE41-NEXT:    imull $-109, %eax, %ecx
; SSE41-NEXT:    shrl $8, %ecx
; SSE41-NEXT:    addb %cl, %al
; SSE41-NEXT:    movb %al, %cl
; SSE41-NEXT:    shrb $7, %cl
; SSE41-NEXT:    sarb $2, %al
; SSE41-NEXT:    addb %cl, %al
; SSE41-NEXT:    movzbl %al, %eax
; SSE41-NEXT:    pinsrb $5, %eax, %xmm1
; SSE41-NEXT:    pextrb $6, %xmm0, %eax
; SSE41-NEXT:    movsbl %al, %eax
; SSE41-NEXT:    imull $-109, %eax, %ecx
; SSE41-NEXT:    shrl $8, %ecx
; SSE41-NEXT:    addb %cl, %al
; SSE41-NEXT:    movb %al, %cl
; SSE41-NEXT:    shrb $7, %cl
; SSE41-NEXT:    sarb $2, %al
; SSE41-NEXT:    addb %cl, %al
; SSE41-NEXT:    movzbl %al, %eax
; SSE41-NEXT:    pinsrb $6, %eax, %xmm1
; SSE41-NEXT:    pextrb $7, %xmm0, %eax
; SSE41-NEXT:    movsbl %al, %eax
; SSE41-NEXT:    imull $-109, %eax, %ecx
; SSE41-NEXT:    shrl $8, %ecx
; SSE41-NEXT:    addb %cl, %al
; SSE41-NEXT:    movb %al, %cl
; SSE41-NEXT:    shrb $7, %cl
; SSE41-NEXT:    sarb $2, %al
; SSE41-NEXT:    addb %cl, %al
; SSE41-NEXT:    movzbl %al, %eax
; SSE41-NEXT:    pinsrb $7, %eax, %xmm1
; SSE41-NEXT:    pextrb $8, %xmm0, %eax
; SSE41-NEXT:    movsbl %al, %eax
; SSE41-NEXT:    imull $-109, %eax, %ecx
; SSE41-NEXT:    shrl $8, %ecx
; SSE41-NEXT:    addb %cl, %al
; SSE41-NEXT:    movb %al, %cl
; SSE41-NEXT:    shrb $7, %cl
; SSE41-NEXT:    sarb $2, %al
; SSE41-NEXT:    addb %cl, %al
; SSE41-NEXT:    movzbl %al, %eax
; SSE41-NEXT:    pinsrb $8, %eax, %xmm1
; SSE41-NEXT:    pextrb $9, %xmm0, %eax
; SSE41-NEXT:    movsbl %al, %eax
; SSE41-NEXT:    imull $-109, %eax, %ecx
; SSE41-NEXT:    shrl $8, %ecx
; SSE41-NEXT:    addb %cl, %al
; SSE41-NEXT:    movb %al, %cl
; SSE41-NEXT:    shrb $7, %cl
; SSE41-NEXT:    sarb $2, %al
; SSE41-NEXT:    addb %cl, %al
; SSE41-NEXT:    movzbl %al, %eax
; SSE41-NEXT:    pinsrb $9, %eax, %xmm1
; SSE41-NEXT:    pextrb $10, %xmm0, %eax
; SSE41-NEXT:    movsbl %al, %eax
; SSE41-NEXT:    imull $-109, %eax, %ecx
; SSE41-NEXT:    shrl $8, %ecx
; SSE41-NEXT:    addb %cl, %al
; SSE41-NEXT:    movb %al, %cl
; SSE41-NEXT:    shrb $7, %cl
; SSE41-NEXT:    sarb $2, %al
; SSE41-NEXT:    addb %cl, %al
; SSE41-NEXT:    movzbl %al, %eax
; SSE41-NEXT:    pinsrb $10, %eax, %xmm1
; SSE41-NEXT:    pextrb $11, %xmm0, %eax
; SSE41-NEXT:    movsbl %al, %eax
; SSE41-NEXT:    imull $-109, %eax, %ecx
; SSE41-NEXT:    shrl $8, %ecx
; SSE41-NEXT:    addb %cl, %al
; SSE41-NEXT:    movb %al, %cl
; SSE41-NEXT:    shrb $7, %cl
; SSE41-NEXT:    sarb $2, %al
; SSE41-NEXT:    addb %cl, %al
; SSE41-NEXT:    movzbl %al, %eax
; SSE41-NEXT:    pinsrb $11, %eax, %xmm1
; SSE41-NEXT:    pextrb $12, %xmm0, %eax
; SSE41-NEXT:    movsbl %al, %eax
; SSE41-NEXT:    imull $-109, %eax, %ecx
; SSE41-NEXT:    shrl $8, %ecx
; SSE41-NEXT:    addb %cl, %al
; SSE41-NEXT:    movb %al, %cl
; SSE41-NEXT:    shrb $7, %cl
; SSE41-NEXT:    sarb $2, %al
; SSE41-NEXT:    addb %cl, %al
; SSE41-NEXT:    movzbl %al, %eax
; SSE41-NEXT:    pinsrb $12, %eax, %xmm1
; SSE41-NEXT:    pextrb $13, %xmm0, %eax
; SSE41-NEXT:    movsbl %al, %eax
; SSE41-NEXT:    imull $-109, %eax, %ecx
; SSE41-NEXT:    shrl $8, %ecx
; SSE41-NEXT:    addb %cl, %al
; SSE41-NEXT:    movb %al, %cl
; SSE41-NEXT:    shrb $7, %cl
; SSE41-NEXT:    sarb $2, %al
; SSE41-NEXT:    addb %cl, %al
; SSE41-NEXT:    movzbl %al, %eax
; SSE41-NEXT:    pinsrb $13, %eax, %xmm1
; SSE41-NEXT:    pextrb $14, %xmm0, %eax
; SSE41-NEXT:    movsbl %al, %eax
; SSE41-NEXT:    imull $-109, %eax, %ecx
; SSE41-NEXT:    shrl $8, %ecx
; SSE41-NEXT:    addb %cl, %al
; SSE41-NEXT:    movb %al, %cl
; SSE41-NEXT:    shrb $7, %cl
; SSE41-NEXT:    sarb $2, %al
; SSE41-NEXT:    addb %cl, %al
; SSE41-NEXT:    movzbl %al, %eax
; SSE41-NEXT:    pinsrb $14, %eax, %xmm1
; SSE41-NEXT:    pextrb $15, %xmm0, %eax
; SSE41-NEXT:    movsbl %al, %eax
; SSE41-NEXT:    imull $-109, %eax, %ecx
; SSE41-NEXT:    shrl $8, %ecx
; SSE41-NEXT:    addb %cl, %al
; SSE41-NEXT:    movb %al, %cl
; SSE41-NEXT:    shrb $7, %cl
; SSE41-NEXT:    sarb $2, %al
; SSE41-NEXT:    addb %cl, %al
; SSE41-NEXT:    movzbl %al, %eax
; SSE41-NEXT:    pinsrb $15, %eax, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; SSE-LABEL: test7:
; SSE:       # BB#0:
; SSE-NEXT:    pushq %rbp
; SSE-NEXT:    pushq %r14
; SSE-NEXT:    pushq %rbx
; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT:    imull $-109, %eax, %ecx
; SSE-NEXT:    shrl $8, %ecx
; SSE-NEXT:    addb %al, %cl
; SSE-NEXT:    movb %cl, %al
; SSE-NEXT:    shrb $7, %al
; SSE-NEXT:    sarb $2, %cl
; SSE-NEXT:    addb %al, %cl
; SSE-NEXT:    movzbl %cl, %eax
; SSE-NEXT:    movd %eax, %xmm0
; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %r14d
; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %edx
; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %r9d
; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %r11d
; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %ecx
; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %r8d
; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %esi
; SSE-NEXT:    imull $-109, %esi, %edi
; SSE-NEXT:    shrl $8, %edi
; SSE-NEXT:    addb %sil, %dil
; SSE-NEXT:    movb %dil, %bl
; SSE-NEXT:    shrb $7, %bl
; SSE-NEXT:    sarb $2, %dil
; SSE-NEXT:    addb %bl, %dil
; SSE-NEXT:    movzbl %dil, %esi
; SSE-NEXT:    movd %esi, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE-NEXT:    imull $-109, %eax, %esi
; SSE-NEXT:    shrl $8, %esi
; SSE-NEXT:    addb %al, %sil
; SSE-NEXT:    movb %sil, %al
; SSE-NEXT:    shrb $7, %al
; SSE-NEXT:    sarb $2, %sil
; SSE-NEXT:    addb %al, %sil
; SSE-NEXT:    movzbl %sil, %eax
; SSE-NEXT:    movd %eax, %xmm2
; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %ebp
; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %esi
; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %r10d
; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %edi
; SSE-NEXT:    imull $-109, %edi, %ebx
; SSE-NEXT:    shrl $8, %ebx
; SSE-NEXT:    addb %dil, %bl
; SSE-NEXT:    movb %bl, %al
; SSE-NEXT:    shrb $7, %al
; SSE-NEXT:    sarb $2, %bl
; SSE-NEXT:    addb %al, %bl
; SSE-NEXT:    movzbl %bl, %eax
; SSE-NEXT:    movd %eax, %xmm0
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT:    imull $-109, %edx, %eax
; SSE-NEXT:    shrl $8, %eax
; SSE-NEXT:    addb %dl, %al
; SSE-NEXT:    movb %al, %dl
; SSE-NEXT:    shrb $7, %dl
; SSE-NEXT:    sarb $2, %al
; SSE-NEXT:    addb %dl, %al
; SSE-NEXT:    movzbl %al, %eax
; SSE-NEXT:    movd %eax, %xmm1
; SSE-NEXT:    imull $-109, %esi, %eax
; SSE-NEXT:    shrl $8, %eax
; SSE-NEXT:    addb %sil, %al
; SSE-NEXT:    movb %al, %dl
; SSE-NEXT:    shrb $7, %dl
; SSE-NEXT:    sarb $2, %al
; SSE-NEXT:    addb %dl, %al
; SSE-NEXT:    movzbl %al, %eax
; SSE-NEXT:    movd %eax, %xmm2
; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE-NEXT:    imull $-109, %ecx, %eax
; SSE-NEXT:    shrl $8, %eax
; SSE-NEXT:    addb %cl, %al
; SSE-NEXT:    movb %al, %cl
; SSE-NEXT:    shrb $7, %cl
; SSE-NEXT:    sarb $2, %al
; SSE-NEXT:    addb %cl, %al
; SSE-NEXT:    movzbl %al, %eax
; SSE-NEXT:    movd %eax, %xmm3
; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %ecx
; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT:    imull $-109, %eax, %edx
; SSE-NEXT:    shrl $8, %edx
; SSE-NEXT:    addb %al, %dl
; SSE-NEXT:    movb %dl, %al
; SSE-NEXT:    shrb $7, %al
; SSE-NEXT:    sarb $2, %dl
; SSE-NEXT:    addb %al, %dl
; SSE-NEXT:    movzbl %dl, %eax
; SSE-NEXT:    movd %eax, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE-NEXT:    imull $-109, %r14d, %eax
; SSE-NEXT:    shrl $8, %eax
; SSE-NEXT:    addb %r14b, %al
; SSE-NEXT:    movb %al, %dl
; SSE-NEXT:    shrb $7, %dl
; SSE-NEXT:    sarb $2, %al
; SSE-NEXT:    addb %dl, %al
; SSE-NEXT:    movzbl %al, %eax
; SSE-NEXT:    movd %eax, %xmm2
; SSE-NEXT:    imull $-109, %ebp, %eax
; SSE-NEXT:    shrl $8, %eax
; SSE-NEXT:    addb %bpl, %al
; SSE-NEXT:    movb %al, %dl
; SSE-NEXT:    shrb $7, %dl
; SSE-NEXT:    sarb $2, %al
; SSE-NEXT:    addb %dl, %al
; SSE-NEXT:    movzbl %al, %eax
; SSE-NEXT:    movd %eax, %xmm0
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE-NEXT:    imull $-109, %r11d, %eax
; SSE-NEXT:    shrl $8, %eax
; SSE-NEXT:    addb %r11b, %al
; SSE-NEXT:    movb %al, %dl
; SSE-NEXT:    shrb $7, %dl
; SSE-NEXT:    sarb $2, %al
; SSE-NEXT:    addb %dl, %al
; SSE-NEXT:    movzbl %al, %eax
; SSE-NEXT:    movd %eax, %xmm3
; SSE-NEXT:    imull $-109, %ecx, %eax
; SSE-NEXT:    shrl $8, %eax
; SSE-NEXT:    addb %cl, %al
; SSE-NEXT:    movb %al, %cl
; SSE-NEXT:    shrb $7, %cl
; SSE-NEXT:    sarb $2, %al
; SSE-NEXT:    addb %cl, %al
; SSE-NEXT:    movzbl %al, %eax
; SSE-NEXT:    movd %eax, %xmm2
; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE-NEXT:    imull $-109, %r9d, %eax
; SSE-NEXT:    shrl $8, %eax
; SSE-NEXT:    addb %r9b, %al
; SSE-NEXT:    movb %al, %cl
; SSE-NEXT:    shrb $7, %cl
; SSE-NEXT:    sarb $2, %al
; SSE-NEXT:    addb %cl, %al
; SSE-NEXT:    movzbl %al, %eax
; SSE-NEXT:    movd %eax, %xmm0
; SSE-NEXT:    imull $-109, %r10d, %eax
; SSE-NEXT:    shrl $8, %eax
; SSE-NEXT:    addb %r10b, %al
; SSE-NEXT:    movb %al, %cl
; SSE-NEXT:    shrb $7, %cl
; SSE-NEXT:    sarb $2, %al
; SSE-NEXT:    addb %cl, %al
; SSE-NEXT:    movzbl %al, %eax
; SSE-NEXT:    movd %eax, %xmm3
; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SSE-NEXT:    imull $-109, %r8d, %eax
; SSE-NEXT:    shrl $8, %eax
; SSE-NEXT:    addb %r8b, %al
; SSE-NEXT:    movb %al, %cl
; SSE-NEXT:    shrb $7, %cl
; SSE-NEXT:    sarb $2, %al
; SSE-NEXT:    addb %cl, %al
; SSE-NEXT:    movzbl %al, %eax
; SSE-NEXT:    movd %eax, %xmm4
; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT:    imull $-109, %eax, %ecx
; SSE-NEXT:    shrl $8, %ecx
; SSE-NEXT:    addb %al, %cl
; SSE-NEXT:    movb %cl, %al
; SSE-NEXT:    shrb $7, %al
; SSE-NEXT:    sarb $2, %cl
; SSE-NEXT:    addb %al, %cl
; SSE-NEXT:    movzbl %cl, %eax
; SSE-NEXT:    movd %eax, %xmm0
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT:    popq %rbx
; SSE-NEXT:    popq %r14
; SSE-NEXT:    popq %rbp
; SSE-NEXT:    retq
;
; AVX-LABEL: test7:
; AVX:       # BB#0:
; AVX-NEXT:    vpextrb $1, %xmm0, %eax
; AVX-NEXT:    movsbl %al, %eax
; AVX-NEXT:    imull $-109, %eax, %ecx
; AVX-NEXT:    shrl $8, %ecx
; AVX-NEXT:    addb %cl, %al
; AVX-NEXT:    movb %al, %cl
; AVX-NEXT:    shrb $7, %cl
; AVX-NEXT:    sarb $2, %al
; AVX-NEXT:    addb %cl, %al
; AVX-NEXT:    movzbl %al, %eax
; AVX-NEXT:    vpextrb $0, %xmm0, %ecx
; AVX-NEXT:    movsbl %cl, %ecx
; AVX-NEXT:    imull $-109, %ecx, %edx
; AVX-NEXT:    shrl $8, %edx
; AVX-NEXT:    addb %dl, %cl
; AVX-NEXT:    movb %cl, %dl
; AVX-NEXT:    shrb $7, %dl
; AVX-NEXT:    sarb $2, %cl
; AVX-NEXT:    addb %dl, %cl
; AVX-NEXT:    movzbl %cl, %ecx
; AVX-NEXT:    vmovd %ecx, %xmm1
; AVX-NEXT:    vpextrb $2, %xmm0, %ecx
; AVX-NEXT:    movsbl %cl, %ecx
; AVX-NEXT:    imull $-109, %ecx, %edx
; AVX-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
; AVX-NEXT:    shrl $8, %edx
; AVX-NEXT:    addb %dl, %cl
; AVX-NEXT:    movb %cl, %al
; AVX-NEXT:    shrb $7, %al
; AVX-NEXT:    sarb $2, %cl
; AVX-NEXT:    addb %al, %cl
; AVX-NEXT:    movzbl %cl, %eax
; AVX-NEXT:    vpextrb $3, %xmm0, %ecx
; AVX-NEXT:    movsbl %cl, %ecx
; AVX-NEXT:    imull $-109, %ecx, %edx
; AVX-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
; AVX-NEXT:    shrl $8, %edx
; AVX-NEXT:    addb %dl, %cl
; AVX-NEXT:    movb %cl, %al
; AVX-NEXT:    shrb $7, %al
; AVX-NEXT:    sarb $2, %cl
; AVX-NEXT:    addb %al, %cl
; AVX-NEXT:    movzbl %cl, %eax
; AVX-NEXT:    vpextrb $4, %xmm0, %ecx
; AVX-NEXT:    movsbl %cl, %ecx
; AVX-NEXT:    imull $-109, %ecx, %edx
; AVX-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
; AVX-NEXT:    shrl $8, %edx
; AVX-NEXT:    addb %dl, %cl
; AVX-NEXT:    movb %cl, %al
; AVX-NEXT:    shrb $7, %al
; AVX-NEXT:    sarb $2, %cl
; AVX-NEXT:    addb %al, %cl
; AVX-NEXT:    movzbl %cl, %eax
; AVX-NEXT:    vpextrb $5, %xmm0, %ecx
; AVX-NEXT:    movsbl %cl, %ecx
; AVX-NEXT:    imull $-109, %ecx, %edx
; AVX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
; AVX-NEXT:    shrl $8, %edx
; AVX-NEXT:    addb %dl, %cl
; AVX-NEXT:    movb %cl, %al
; AVX-NEXT:    shrb $7, %al
; AVX-NEXT:    sarb $2, %cl
; AVX-NEXT:    addb %al, %cl
; AVX-NEXT:    movzbl %cl, %eax
; AVX-NEXT:    vpextrb $6, %xmm0, %ecx
; AVX-NEXT:    movsbl %cl, %ecx
; AVX-NEXT:    imull $-109, %ecx, %edx
; AVX-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
; AVX-NEXT:    shrl $8, %edx
; AVX-NEXT:    addb %dl, %cl
; AVX-NEXT:    movb %cl, %al
; AVX-NEXT:    shrb $7, %al
; AVX-NEXT:    sarb $2, %cl
; AVX-NEXT:    addb %al, %cl
; AVX-NEXT:    movzbl %cl, %eax
; AVX-NEXT:    vpextrb $7, %xmm0, %ecx
; AVX-NEXT:    movsbl %cl, %ecx
; AVX-NEXT:    imull $-109, %ecx, %edx
; AVX-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
; AVX-NEXT:    shrl $8, %edx
; AVX-NEXT:    addb %dl, %cl
; AVX-NEXT:    movb %cl, %al
; AVX-NEXT:    shrb $7, %al
; AVX-NEXT:    sarb $2, %cl
; AVX-NEXT:    addb %al, %cl
; AVX-NEXT:    movzbl %cl, %eax
; AVX-NEXT:    vpextrb $8, %xmm0, %ecx
; AVX-NEXT:    movsbl %cl, %ecx
; AVX-NEXT:    imull $-109, %ecx, %edx
; AVX-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
; AVX-NEXT:    shrl $8, %edx
; AVX-NEXT:    addb %dl, %cl
; AVX-NEXT:    movb %cl, %al
; AVX-NEXT:    shrb $7, %al
; AVX-NEXT:    sarb $2, %cl
; AVX-NEXT:    addb %al, %cl
; AVX-NEXT:    movzbl %cl, %eax
; AVX-NEXT:    vpextrb $9, %xmm0, %ecx
; AVX-NEXT:    movsbl %cl, %ecx
; AVX-NEXT:    imull $-109, %ecx, %edx
; AVX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
; AVX-NEXT:    shrl $8, %edx
; AVX-NEXT:    addb %dl, %cl
; AVX-NEXT:    movb %cl, %al
; AVX-NEXT:    shrb $7, %al
; AVX-NEXT:    sarb $2, %cl
; AVX-NEXT:    addb %al, %cl
; AVX-NEXT:    movzbl %cl, %eax
; AVX-NEXT:    vpextrb $10, %xmm0, %ecx
; AVX-NEXT:    movsbl %cl, %ecx
; AVX-NEXT:    imull $-109, %ecx, %edx
; AVX-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
; AVX-NEXT:    shrl $8, %edx
; AVX-NEXT:    addb %dl, %cl
; AVX-NEXT:    movb %cl, %al
; AVX-NEXT:    shrb $7, %al
; AVX-NEXT:    sarb $2, %cl
; AVX-NEXT:    addb %al, %cl
; AVX-NEXT:    movzbl %cl, %eax
; AVX-NEXT:    vpextrb $11, %xmm0, %ecx
; AVX-NEXT:    movsbl %cl, %ecx
; AVX-NEXT:    imull $-109, %ecx, %edx
; AVX-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
; AVX-NEXT:    shrl $8, %edx
; AVX-NEXT:    addb %dl, %cl
; AVX-NEXT:    movb %cl, %al
; AVX-NEXT:    shrb $7, %al
; AVX-NEXT:    sarb $2, %cl
; AVX-NEXT:    addb %al, %cl
; AVX-NEXT:    movzbl %cl, %eax
; AVX-NEXT:    vpextrb $12, %xmm0, %ecx
; AVX-NEXT:    movsbl %cl, %ecx
; AVX-NEXT:    imull $-109, %ecx, %edx
; AVX-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
; AVX-NEXT:    shrl $8, %edx
; AVX-NEXT:    addb %dl, %cl
; AVX-NEXT:    movb %cl, %al
; AVX-NEXT:    shrb $7, %al
; AVX-NEXT:    sarb $2, %cl
; AVX-NEXT:    addb %al, %cl
; AVX-NEXT:    movzbl %cl, %eax
; AVX-NEXT:    vpextrb $13, %xmm0, %ecx
; AVX-NEXT:    movsbl %cl, %ecx
; AVX-NEXT:    imull $-109, %ecx, %edx
; AVX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
; AVX-NEXT:    shrl $8, %edx
; AVX-NEXT:    addb %dl, %cl
; AVX-NEXT:    movb %cl, %al
; AVX-NEXT:    shrb $7, %al
; AVX-NEXT:    sarb $2, %cl
; AVX-NEXT:    addb %al, %cl
; AVX-NEXT:    movzbl %cl, %eax
; AVX-NEXT:    vpextrb $14, %xmm0, %ecx
; AVX-NEXT:    movsbl %cl, %ecx
; AVX-NEXT:    imull $-109, %ecx, %edx
; AVX-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
; AVX-NEXT:    shrl $8, %edx
; AVX-NEXT:    addb %dl, %cl
; AVX-NEXT:    movb %cl, %al
; AVX-NEXT:    shrb $7, %al
; AVX-NEXT:    sarb $2, %cl
; AVX-NEXT:    addb %al, %cl
; AVX-NEXT:    movzbl %cl, %eax
; AVX-NEXT:    vpextrb $15, %xmm0, %ecx
; AVX-NEXT:    movsbl %cl, %ecx
; AVX-NEXT:    imull $-109, %ecx, %edx
; AVX-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm0
; AVX-NEXT:    shrl $8, %edx
; AVX-NEXT:    addb %dl, %cl
; AVX-NEXT:    movb %cl, %al
; AVX-NEXT:    shrb $7, %al
; AVX-NEXT:    sarb $2, %cl
; AVX-NEXT:    addb %al, %cl
; AVX-NEXT:    movzbl %cl, %eax
; AVX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
; AVX-NEXT:    retq
  %div = sdiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
  ret <16 x i8> %div
}

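; sdiv <4 x i32> by 7: uses pmuldq where available (SSE4.1/AVX2); plain SSE2
; has to emulate the signed widening multiply with pmuludq plus sign fixups.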
define <4 x i32> @test8(<4 x i32> %a) #0 {
; SSE41-LABEL: test8:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE41-NEXT:    pmuldq %xmm2, %xmm3
; SSE41-NEXT:    pmuldq %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; SSE41-NEXT:    paddd %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psrld $31, %xmm0
; SSE41-NEXT:    psrad $2, %xmm1
; SSE41-NEXT:    paddd %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; SSE-LABEL: test8:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psrad $31, %xmm2
; SSE-NEXT:    pand %xmm1, %xmm2
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    pmuludq %xmm1, %xmm3
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE-NEXT:    psrad $31, %xmm1
; SSE-NEXT:    pand %xmm0, %xmm1
; SSE-NEXT:    paddd %xmm1, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm4, %xmm3
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE-NEXT:    psubd %xmm2, %xmm1
; SSE-NEXT:    paddd %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    psrld $31, %xmm0
; SSE-NEXT:    psrad $2, %xmm1
; SSE-NEXT:    paddd %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test8:
; AVX:       # BB#0:
; AVX-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX-NEXT:    vpmuldq %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vpmuldq %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpsrld $31, %xmm0, %xmm1
; AVX-NEXT:    vpsrad $2, %xmm0, %xmm0
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %div = sdiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
  ret <4 x i32> %div
}

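; sdiv <8 x i32> by 7: the same signed expansion applied per 128-bit half on SSE,
; as one 256-bit sequence on AVX2.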
define <8 x i32> @test9(<8 x i32> %a) #0 {
; SSE41-LABEL: test9:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027]
; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE41-NEXT:    pmuldq %xmm4, %xmm5
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pmuldq %xmm3, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
; SSE41-NEXT:    paddd %xmm0, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    psrld $31, %xmm0
; SSE41-NEXT:    psrad $2, %xmm2
; SSE41-NEXT:    paddd %xmm0, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT:    pmuldq %xmm4, %xmm0
; SSE41-NEXT:    pmuldq %xmm1, %xmm3
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3],xmm3[4,5],xmm0[6,7]
; SSE41-NEXT:    paddd %xmm1, %xmm3
; SSE41-NEXT:    movdqa %xmm3, %xmm0
; SSE41-NEXT:    psrld $31, %xmm0
; SSE41-NEXT:    psrad $2, %xmm3
; SSE41-NEXT:    paddd %xmm0, %xmm3
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    movdqa %xmm3, %xmm1
; SSE41-NEXT:    retq
;
; SSE-LABEL: test9:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027]
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    psrad $31, %xmm4
; SSE-NEXT:    movdqa %xmm4, %xmm0
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    movdqa %xmm2, %xmm5
; SSE-NEXT:    psrad $31, %xmm5
; SSE-NEXT:    pand %xmm3, %xmm5
; SSE-NEXT:    paddd %xmm0, %xmm5
; SSE-NEXT:    movdqa %xmm2, %xmm0
; SSE-NEXT:    pmuludq %xmm3, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm6, %xmm7
; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
; SSE-NEXT:    psubd %xmm5, %xmm0
; SSE-NEXT:    paddd %xmm2, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psrld $31, %xmm2
; SSE-NEXT:    psrad $2, %xmm0
; SSE-NEXT:    paddd %xmm2, %xmm0
; SSE-NEXT:    pand %xmm1, %xmm4
; SSE-NEXT:    movdqa %xmm1, %xmm5
; SSE-NEXT:    psrad $31, %xmm5
; SSE-NEXT:    pand %xmm3, %xmm5
; SSE-NEXT:    paddd %xmm4, %xmm5
; SSE-NEXT:    pmuludq %xmm1, %xmm3
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm6, %xmm3
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT:    psubd %xmm5, %xmm2
; SSE-NEXT:    paddd %xmm1, %xmm2
; SSE-NEXT:    movdqa %xmm2, %xmm1
; SSE-NEXT:    psrld $31, %xmm1
; SSE-NEXT:    psrad $2, %xmm2
; SSE-NEXT:    paddd %xmm1, %xmm2
; SSE-NEXT:    movdqa %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test9:
; AVX:       # BB#0:
; AVX-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
; AVX-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
; AVX-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
; AVX-NEXT:    vpmuldq %ymm2, %ymm3, %ymm2
; AVX-NEXT:    vpmuldq %ymm1, %ymm0, %ymm1
; AVX-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
; AVX-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; AVX-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; AVX-NEXT:    vpsrld $31, %ymm0, %ymm1
; AVX-NEXT:    vpsrad $2, %ymm0, %ymm0
; AVX-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %div = sdiv <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
  ret <8 x i32> %div
}

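; urem <8 x i32> by 7: computed as a - (a/7)*7, reusing the udiv-by-7 expansion
; and a multiply by the splat constant 7.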
define <8 x i32> @test10(<8 x i32> %a) #0 {
; SSE41-LABEL: test10:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE41-NEXT:    pmuludq %xmm3, %xmm4
; SSE41-NEXT:    movdqa %xmm0, %xmm5
; SSE41-NEXT:    pmuludq %xmm2, %xmm5
; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm4
; SSE41-NEXT:    psubd %xmm5, %xmm4
; SSE41-NEXT:    psrld $1, %xmm4
; SSE41-NEXT:    paddd %xmm5, %xmm4
; SSE41-NEXT:    psrld $2, %xmm4
; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [7,7,7,7]
; SSE41-NEXT:    pmulld %xmm5, %xmm4
; SSE41-NEXT:    psubd %xmm4, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE41-NEXT:    pmuludq %xmm3, %xmm4
; SSE41-NEXT:    pmuludq %xmm1, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
; SSE41-NEXT:    movdqa %xmm1, %xmm3
; SSE41-NEXT:    psubd %xmm2, %xmm3
; SSE41-NEXT:    psrld $1, %xmm3
; SSE41-NEXT:    paddd %xmm2, %xmm3
; SSE41-NEXT:    psrld $2, %xmm3
; SSE41-NEXT:    pmulld %xmm5, %xmm3
; SSE41-NEXT:    psubd %xmm3, %xmm1
; SSE41-NEXT:    retq
;
; SSE-LABEL: test10:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [613566757,613566757,613566757,613566757]
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    pmuludq %xmm3, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm4, %xmm5
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSE-NEXT:    movdqa %xmm0, %xmm5
; SSE-NEXT:    psubd %xmm2, %xmm5
; SSE-NEXT:    psrld $1, %xmm5
; SSE-NEXT:    paddd %xmm2, %xmm5
; SSE-NEXT:    psrld $2, %xmm5
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [7,7,7,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm2, %xmm5
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE-NEXT:    pmuludq %xmm2, %xmm6
; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; SSE-NEXT:    psubd %xmm5, %xmm0
; SSE-NEXT:    pmuludq %xmm1, %xmm3
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm4, %xmm5
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT:    movdqa %xmm1, %xmm4
; SSE-NEXT:    psubd %xmm3, %xmm4
; SSE-NEXT:    psrld $1, %xmm4
; SSE-NEXT:    paddd %xmm3, %xmm4
; SSE-NEXT:    psrld $2, %xmm4
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm2, %xmm4
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE-NEXT:    pmuludq %xmm2, %xmm3
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
; SSE-NEXT:    psubd %xmm4, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test10:
; AVX:       # BB#0:
; AVX-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
; AVX-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
; AVX-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
; AVX-NEXT:    vpmuludq %ymm2, %ymm3, %ymm2
; AVX-NEXT:    vpmuludq %ymm1, %ymm0, %ymm1
; AVX-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
; AVX-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; AVX-NEXT:    vpsubd %ymm1, %ymm0, %ymm2
; AVX-NEXT:    vpsrld $1, %ymm2, %ymm2
; AVX-NEXT:    vpaddd %ymm1, %ymm2, %ymm1
; AVX-NEXT:    vpsrld $2, %ymm1, %ymm1
; AVX-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm2
; AVX-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
; AVX-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %rem = urem <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
  ret <8 x i32> %rem
}

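; srem <8 x i32> by 7: computed as a - (a/7)*7, reusing the sdiv-by-7 expansion.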
define <8 x i32> @test11(<8 x i32> %a) #0 {
; SSE41-LABEL: test11:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE41-NEXT:    pmuldq %xmm3, %xmm4
; SSE41-NEXT:    movdqa %xmm0, %xmm5
; SSE41-NEXT:    pmuldq %xmm2, %xmm5
; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
; SSE41-NEXT:    paddd %xmm0, %xmm5
; SSE41-NEXT:    movdqa %xmm5, %xmm4
; SSE41-NEXT:    psrld $31, %xmm4
; SSE41-NEXT:    psrad $2, %xmm5
; SSE41-NEXT:    paddd %xmm4, %xmm5
; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [7,7,7,7]
; SSE41-NEXT:    pmulld %xmm4, %xmm5
; SSE41-NEXT:    psubd %xmm5, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
; SSE41-NEXT:    pmuldq %xmm3, %xmm5
; SSE41-NEXT:    pmuldq %xmm1, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
; SSE41-NEXT:    paddd %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm3
; SSE41-NEXT:    psrld $31, %xmm3
; SSE41-NEXT:    psrad $2, %xmm2
; SSE41-NEXT:    paddd %xmm3, %xmm2
; SSE41-NEXT:    pmulld %xmm4, %xmm2
; SSE41-NEXT:    psubd %xmm2, %xmm1
; SSE41-NEXT:    retq
;
; SSE-LABEL: test11:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; SSE-NEXT:    movdqa %xmm2, %xmm3
; SSE-NEXT:    psrad $31, %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    pand %xmm0, %xmm4
; SSE-NEXT:    movdqa %xmm0, %xmm6
; SSE-NEXT:    psrad $31, %xmm6
; SSE-NEXT:    pand %xmm2, %xmm6
; SSE-NEXT:    paddd %xmm4, %xmm6
; SSE-NEXT:    movdqa %xmm0, %xmm4
; SSE-NEXT:    pmuludq %xmm2, %xmm4
; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[1,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm5, %xmm4
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
; SSE-NEXT:    psubd %xmm6, %xmm7
; SSE-NEXT:    paddd %xmm0, %xmm7
; SSE-NEXT:    movdqa %xmm7, %xmm4
; SSE-NEXT:    psrld $31, %xmm4
; SSE-NEXT:    psrad $2, %xmm7
; SSE-NEXT:    paddd %xmm4, %xmm7
; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [7,7,7,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm4, %xmm7
; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
; SSE-NEXT:    pmuludq %xmm4, %xmm6
; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
; SSE-NEXT:    psubd %xmm7, %xmm0
; SSE-NEXT:    pand %xmm1, %xmm3
; SSE-NEXT:    movdqa %xmm1, %xmm6
; SSE-NEXT:    psrad $31, %xmm6
; SSE-NEXT:    pand %xmm2, %xmm6
; SSE-NEXT:    paddd %xmm3, %xmm6
; SSE-NEXT:    pmuludq %xmm1, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm5, %xmm3
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT:    psubd %xmm6, %xmm2
; SSE-NEXT:    paddd %xmm1, %xmm2
; SSE-NEXT:    movdqa %xmm2, %xmm3
; SSE-NEXT:    psrld $31, %xmm3
; SSE-NEXT:    psrad $2, %xmm2
; SSE-NEXT:    paddd %xmm3, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm4, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE-NEXT:    pmuludq %xmm4, %xmm3
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT:    psubd %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test11:
; AVX:       # BB#0:
; AVX-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
; AVX-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
; AVX-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
; AVX-NEXT:    vpmuldq %ymm2, %ymm3, %ymm2
; AVX-NEXT:    vpmuldq %ymm1, %ymm0, %ymm1
; AVX-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
; AVX-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; AVX-NEXT:    vpaddd %ymm0, %ymm1, %ymm1
; AVX-NEXT:    vpsrld $31, %ymm1, %ymm2
; AVX-NEXT:    vpsrad $2, %ymm1, %ymm1
; AVX-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
; AVX-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm2
; AVX-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
; AVX-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %rem = srem <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
  ret <8 x i32> %rem
}

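; urem of a value by itself is always zero, so this folds to an all-zero vector.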
define <2 x i16> @test12() #0 {
; SSE41-LABEL: test12:
; SSE41:       # BB#0:
; SSE41-NEXT:    xorps %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; SSE-LABEL: test12:
; SSE:       # BB#0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test12:
; AVX:       # BB#0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %I8 = insertelement <2 x i16> zeroinitializer, i16 -1, i32 0
  %I9 = insertelement <2 x i16> %I8, i16 -1, i32 1
  %B9 = urem <2 x i16> %I9, %I9
  ret <2 x i16> %B9
}

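; sdiv <4 x i32> by 3 (PR20355): only the signed high multiply and a sign-bit
; add are needed; no final arithmetic shift is required for this divisor.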
define <4 x i32> @PR20355(<4 x i32> %a) #0 {
; SSE41-LABEL: PR20355:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766]
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE41-NEXT:    pmuldq %xmm2, %xmm3
; SSE41-NEXT:    pmuldq %xmm1, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psrld $31, %xmm0
; SSE41-NEXT:    paddd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; SSE-LABEL: PR20355:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766]
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    psrad $31, %xmm2
; SSE-NEXT:    pand %xmm0, %xmm2
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psrad $31, %xmm3
; SSE-NEXT:    pand %xmm1, %xmm3
; SSE-NEXT:    paddd %xmm2, %xmm3
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm2, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
; SSE-NEXT:    psubd %xmm3, %xmm4
; SSE-NEXT:    movdqa %xmm4, %xmm0
; SSE-NEXT:    psrld $31, %xmm0
; SSE-NEXT:    paddd %xmm4, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: PR20355:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX-NEXT:    vpmuldq %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX-NEXT:    vpsrld $31, %xmm0, %xmm1
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
entry:
  %sdiv = sdiv <4 x i32> %a, <i32 3, i32 3, i32 3, i32 3>
  ret <4 x i32> %sdiv
}

attributes #0 = { nounwind }