1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,X86
3; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,X64
4
5; Check that under certain conditions we can factor out a rotate
6; from the following idioms:
7;   (a*c0) >> s1 | (a*c1)
8;   (a/c0) << s1 | (a/c1)
9; This targets cases where instcombine has folded a shl/srl/mul/udiv
10; with one of the shifts from the rotate idiom, so the two sides of the
; 'or' no longer share an identical operand.
11
; ((i << 3) >> 25) | (i << 10): 10 = 3 + 7 and 25 = 32 - 7, so the result is
; (i << 3) rotated left by 7. The assertions expect the shared shift to be
; emitted once (vpslld $3) followed by a single vprold $7.
12define <4 x i32> @vroll_v4i32_extract_shl(<4 x i32> %i) {
13; CHECK-LABEL: vroll_v4i32_extract_shl:
14; CHECK:       # %bb.0:
15; CHECK-NEXT:    vpslld $3, %xmm0, %xmm0
16; CHECK-NEXT:    vprold $7, %zmm0, %zmm0
17; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
18; CHECK-NEXT:    vzeroupper
19; CHECK-NEXT:    ret{{[l|q]}}
20  %lhs_mul = shl <4 x i32> %i, <i32 3, i32 3, i32 3, i32 3>
21  %rhs_mul = shl <4 x i32> %i, <i32 10, i32 10, i32 10, i32 10>
22  %lhs_shift = lshr <4 x i32> %lhs_mul, <i32 25, i32 25, i32 25, i32 25>
23  %out = or <4 x i32> %lhs_shift, %rhs_mul
24  ret <4 x i32> %out
25}
26
; (i >> 40) | ((i >> 5) << 29): 40 = 5 + 35 and 35 + 29 = 64, so the result is
; (i >> 5) rotated left by 29. The assertions expect vpsrlq $5 followed by a
; single vprolq $29.
27define <4 x i64> @vrolq_v4i64_extract_shrl(<4 x i64> %i) nounwind {
28; CHECK-LABEL: vrolq_v4i64_extract_shrl:
29; CHECK:       # %bb.0:
30; CHECK-NEXT:    vpsrlq $5, %ymm0, %ymm0
31; CHECK-NEXT:    vprolq $29, %zmm0, %zmm0
32; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
33; CHECK-NEXT:    ret{{[l|q]}}
34  %lhs_div = lshr <4 x i64> %i, <i64 40, i64 40, i64 40, i64 40>
35  %rhs_div = lshr <4 x i64> %i, <i64 5, i64 5, i64 5, i64 5>
36  %rhs_shift = shl <4 x i64> %rhs_div, <i64 29, i64 29, i64 29, i64 29>
37  %out = or <4 x i64> %lhs_div, %rhs_shift
38  ret <4 x i64> %out
39}
40
; (i * 640) | ((i * 10) >> 26): 640 = 10 * 64 and 26 = 32 - 6, so the result is
; (i * 10) rotated left by 6. The assertions expect a single multiply by 10
; (vpmulld against a broadcast constant) followed by vprold $6.
41define <8 x i32> @vroll_extract_mul(<8 x i32> %i) nounwind {
42; CHECK-LABEL: vroll_extract_mul:
43; CHECK:       # %bb.0:
44; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10]
45; CHECK-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
46; CHECK-NEXT:    vprold $6, %zmm0, %zmm0
47; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
48; CHECK-NEXT:    ret{{[l|q]}}
49  %lhs_mul = mul <8 x i32> %i, <i32 640, i32 640, i32 640, i32 640, i32 640, i32 640, i32 640, i32 640>
50  %rhs_mul = mul <8 x i32> %i, <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
51  %rhs_shift = lshr <8 x i32> %rhs_mul, <i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26>
52  %out = or <8 x i32> %lhs_mul, %rhs_shift
53  ret <8 x i32> %out
54}
55
; ((i / 3) << 57) | (i / 384): 384 = 3 * 128 and 57 = 64 - 7, so the result is
; (i / 3) rotated left by 57. Both targets are expected to perform only the
; division by 3 (two __udivdi3 libcalls with divisor 3 on i686; a multiply by
; 0xAAAAAAAAAAAAAAAB plus vpsrlq $1 on x86_64) and then a single vprolq $57.
56define <2 x i64> @vrolq_extract_udiv(<2 x i64> %i) nounwind {
57; X86-LABEL: vrolq_extract_udiv:
58; X86:       # %bb.0:
59; X86-NEXT:    subl $44, %esp
60; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
61; X86-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
62; X86-NEXT:    vmovss %xmm0, (%esp)
63; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
64; X86-NEXT:    movl $3, {{[0-9]+}}(%esp)
65; X86-NEXT:    calll __udivdi3
66; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
67; X86-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp)
68; X86-NEXT:    vextractps $2, %xmm0, (%esp)
69; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
70; X86-NEXT:    movl $3, {{[0-9]+}}(%esp)
71; X86-NEXT:    vmovd %eax, %xmm0
72; X86-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
73; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
74; X86-NEXT:    calll __udivdi3
75; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
76; X86-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
77; X86-NEXT:    vpinsrd $3, %edx, %xmm0, %xmm0
78; X86-NEXT:    vprolq $57, %zmm0, %zmm0
79; X86-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
80; X86-NEXT:    addl $44, %esp
81; X86-NEXT:    vzeroupper
82; X86-NEXT:    retl
83;
84; X64-LABEL: vrolq_extract_udiv:
85; X64:       # %bb.0:
86; X64-NEXT:    vpextrq $1, %xmm0, %rax
87; X64-NEXT:    movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
88; X64-NEXT:    mulq %rcx
89; X64-NEXT:    vmovq %rdx, %xmm1
90; X64-NEXT:    vmovq %xmm0, %rax
91; X64-NEXT:    mulq %rcx
92; X64-NEXT:    vmovq %rdx, %xmm0
93; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
94; X64-NEXT:    vpsrlq $1, %xmm0, %xmm0
95; X64-NEXT:    vprolq $57, %zmm0, %zmm0
96; X64-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
97; X64-NEXT:    vzeroupper
98; X64-NEXT:    retq
99  %lhs_div = udiv <2 x i64> %i, <i64 3, i64 3>
100  %rhs_div = udiv <2 x i64> %i, <i64 384, i64 384>
101  %lhs_shift = shl <2 x i64> %lhs_div, <i64 57, i64 57>
102  %out = or <2 x i64> %lhs_shift, %rhs_div
103  ret <2 x i64> %out
104}
105
; ((i * 1152) & 160) | ((i * 9) >> 25): 1152 = 9 * 128 and 25 = 32 - 7, so
; apart from the mask this is (i * 9) rotated left by 7. The assertions expect
; one multiply by 9, vprold $7, and then a vpand with a constant-pool mask
; applied to the rotated value.
106define <4 x i32> @vrolw_extract_mul_with_mask(<4 x i32> %i) nounwind {
107; X86-LABEL: vrolw_extract_mul_with_mask:
108; X86:       # %bb.0:
109; X86-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [9,9,9,9]
110; X86-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
111; X86-NEXT:    vprold $7, %zmm0, %zmm0
112; X86-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
113; X86-NEXT:    vzeroupper
114; X86-NEXT:    retl
115;
116; X64-LABEL: vrolw_extract_mul_with_mask:
117; X64:       # %bb.0:
118; X64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [9,9,9,9]
119; X64-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
120; X64-NEXT:    vprold $7, %zmm0, %zmm0
121; X64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
122; X64-NEXT:    vzeroupper
123; X64-NEXT:    retq
124  %lhs_mul = mul <4 x i32> %i, <i32 1152, i32 1152, i32 1152, i32 1152>
125  %rhs_mul = mul <4 x i32> %i, <i32 9, i32 9, i32 9, i32 9>
126  %lhs_and = and <4 x i32> %lhs_mul, <i32 160, i32 160, i32 160, i32 160>
127  %rhs_shift = lshr <4 x i32> %rhs_mul, <i32 25, i32 25, i32 25, i32 25>
128  %out = or <4 x i32> %lhs_and, %rhs_shift
129  ret <4 x i32> %out
130}
131
; (i * 640) | ((i * 10) >> 10) on <32 x i16>: 640 = 10 * 64 and 10 = 16 - 6,
; so arithmetically this is (i * 10) rotated left by 6. The assertions
; nevertheless expect no rotate: two vpmullw, a vpsrlw $10 and a vporq.
; NOTE(review): presumably because a 16-bit-element vector rotate is not legal
; with only +avx512bw, as the "illegal" in the name suggests -- confirm.
132define <32 x i16> @illegal_no_extract_mul(<32 x i16> %i) nounwind {
133; X86-LABEL: illegal_no_extract_mul:
134; X86:       # %bb.0:
135; X86-NEXT:    vpmullw {{\.LCPI.*}}, %zmm0, %zmm1
136; X86-NEXT:    vpmullw {{\.LCPI.*}}, %zmm0, %zmm0
137; X86-NEXT:    vpsrlw $10, %zmm0, %zmm0
138; X86-NEXT:    vporq %zmm0, %zmm1, %zmm0
139; X86-NEXT:    retl
140;
141; X64-LABEL: illegal_no_extract_mul:
142; X64:       # %bb.0:
143; X64-NEXT:    vpmullw {{.*}}(%rip), %zmm0, %zmm1
144; X64-NEXT:    vpmullw {{.*}}(%rip), %zmm0, %zmm0
145; X64-NEXT:    vpsrlw $10, %zmm0, %zmm0
146; X64-NEXT:    vporq %zmm0, %zmm1, %zmm0
147; X64-NEXT:    retq
148  %lhs_mul = mul <32 x i16> %i, <i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640>
149  %rhs_mul = mul <32 x i16> %i, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
150  %rhs_shift = lshr <32 x i16> %rhs_mul, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
151  %out = or <32 x i16> %lhs_mul, %rhs_shift
152  ret <32 x i16> %out
153}
154
155; Result would undershift
; ((i << 11) >> 50) | (i << 24): treating (i << 11) as the rotated value, the
; left part is a shift by 24 - 11 = 13, so a rotate would need a right shift
; of 64 - 13 = 51 rather than 50. The assertions expect the three shifts and
; the or to be emitted as written, with no rotate.
156define <4 x i64> @no_extract_shl(<4 x i64> %i) nounwind {
157; CHECK-LABEL: no_extract_shl:
158; CHECK:       # %bb.0:
159; CHECK-NEXT:    vpsllq $11, %ymm0, %ymm1
160; CHECK-NEXT:    vpsllq $24, %ymm0, %ymm0
161; CHECK-NEXT:    vpsrlq $50, %ymm1, %ymm1
162; CHECK-NEXT:    vpor %ymm0, %ymm1, %ymm0
163; CHECK-NEXT:    ret{{[l|q]}}
164  %lhs_mul = shl <4 x i64> %i, <i64 11, i64 11, i64 11, i64 11>
165  %rhs_mul = shl <4 x i64> %i, <i64 24, i64 24, i64 24, i64 24>
166  %lhs_shift = lshr <4 x i64> %lhs_mul, <i64 50, i64 50, i64 50, i64 50>
167  %out = or <4 x i64> %lhs_shift, %rhs_mul
168  ret <4 x i64> %out
169}
170
171; Result would overshift
; ((i >> 3) << 28) | (i >> 9): treating (i >> 3) as the rotated value, the
; right part is a shift by 9 - 3 = 6, so a rotate would need a left shift of
; 32 - 6 = 26 rather than 28. The assertions expect no rotate: the left part
; is emitted as (i << 25) masked with 4026531840 (0xF0000000) and or'ed with
; i >> 9.
172define <4 x i32> @no_extract_shrl(<4 x i32> %i) nounwind {
173; CHECK-LABEL: no_extract_shrl:
174; CHECK:       # %bb.0:
175; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4026531840,4026531840,4026531840,4026531840]
176; CHECK-NEXT:    vpslld $25, %xmm0, %xmm2
177; CHECK-NEXT:    vpand %xmm1, %xmm2, %xmm1
178; CHECK-NEXT:    vpsrld $9, %xmm0, %xmm0
179; CHECK-NEXT:    vpor %xmm0, %xmm1, %xmm0
180; CHECK-NEXT:    ret{{[l|q]}}
181  %lhs_div = lshr <4 x i32> %i, <i32 3, i32 3, i32 3, i32 3>
182  %rhs_div = lshr <4 x i32> %i, <i32 9, i32 9, i32 9, i32 9>
183  %lhs_shift = shl <4 x i32> %lhs_div, <i32 28, i32 28, i32 28, i32 28>
184  %out = or <4 x i32> %lhs_shift, %rhs_div
185  ret <4 x i32> %out
186}
187
188; Can factor 512 from 1536, but result is 3 instead of 9
; (i * 1536) | ((i * 9) >> 23): 23 = 32 - 9, so a rotate of (i * 9) would need
; the other multiplier to be 9 * 512 = 4608; 1536 is only 3 * 512. The
; assertions expect both multiplies, a vpsrld $23 and a vpor, with no rotate.
189define <8 x i32> @no_extract_mul(<8 x i32> %i) nounwind {
190; CHECK-LABEL: no_extract_mul:
191; CHECK:       # %bb.0:
192; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1536,1536,1536,1536,1536,1536,1536,1536]
193; CHECK-NEXT:    vpmulld %ymm1, %ymm0, %ymm1
194; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [9,9,9,9,9,9,9,9]
195; CHECK-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
196; CHECK-NEXT:    vpsrld $23, %ymm0, %ymm0
197; CHECK-NEXT:    vpor %ymm0, %ymm1, %ymm0
198; CHECK-NEXT:    ret{{[l|q]}}
199  %lhs_mul = mul <8 x i32> %i, <i32 1536, i32 1536, i32 1536, i32 1536, i32 1536, i32 1536, i32 1536, i32 1536>
200  %rhs_mul = mul <8 x i32> %i, <i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9>
201  %rhs_shift = lshr <8 x i32> %rhs_mul, <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
202  %out = or <8 x i32> %lhs_mul, %rhs_shift
203  ret <8 x i32> %out
204}
205
206; Can't evenly factor 256 from 770
; ((i / 3) << 56) | (i / 770): 56 = 64 - 8, so a rotate of (i / 3) would need
; the other divisor to be 3 * 256 = 768, not 770. The assertions expect both
; divisions to be carried out separately (by 3 and by 770) on each target,
; followed by vpsllq $56 and vpor, with no rotate.
207define <2 x i64> @no_extract_udiv(<2 x i64> %i) nounwind {
208; X86-LABEL: no_extract_udiv:
209; X86:       # %bb.0:
210; X86-NEXT:    subl $60, %esp
211; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
212; X86-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
213; X86-NEXT:    vmovss %xmm0, (%esp)
214; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
215; X86-NEXT:    movl $3, {{[0-9]+}}(%esp)
216; X86-NEXT:    calll __udivdi3
217; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
218; X86-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp)
219; X86-NEXT:    vextractps $2, %xmm0, (%esp)
220; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
221; X86-NEXT:    movl $3, {{[0-9]+}}(%esp)
222; X86-NEXT:    vmovd %eax, %xmm0
223; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
224; X86-NEXT:    calll __udivdi3
225; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
226; X86-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
227; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
228; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
229; X86-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
230; X86-NEXT:    vmovss %xmm0, (%esp)
231; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
232; X86-NEXT:    movl $770, {{[0-9]+}}(%esp) # imm = 0x302
233; X86-NEXT:    calll __udivdi3
234; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
235; X86-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp)
236; X86-NEXT:    vextractps $2, %xmm0, (%esp)
237; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
238; X86-NEXT:    movl $770, {{[0-9]+}}(%esp) # imm = 0x302
239; X86-NEXT:    vmovd %eax, %xmm0
240; X86-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
241; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
242; X86-NEXT:    calll __udivdi3
243; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
244; X86-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
245; X86-NEXT:    vpinsrd $3, %edx, %xmm0, %xmm0
246; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
247; X86-NEXT:    vpsllq $56, %xmm1, %xmm1
248; X86-NEXT:    vpor %xmm0, %xmm1, %xmm0
249; X86-NEXT:    addl $60, %esp
250; X86-NEXT:    retl
251;
252; X64-LABEL: no_extract_udiv:
253; X64:       # %bb.0:
254; X64-NEXT:    vpextrq $1, %xmm0, %rcx
255; X64-NEXT:    movabsq $-6148914691236517205, %rdi # imm = 0xAAAAAAAAAAAAAAAB
256; X64-NEXT:    movq %rcx, %rax
257; X64-NEXT:    mulq %rdi
258; X64-NEXT:    vmovq %rdx, %xmm1
259; X64-NEXT:    vmovq %xmm0, %rsi
260; X64-NEXT:    movq %rsi, %rax
261; X64-NEXT:    mulq %rdi
262; X64-NEXT:    vmovq %rdx, %xmm0
263; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
264; X64-NEXT:    vpsrlq $1, %xmm0, %xmm0
265; X64-NEXT:    movabsq $-6180857105216966645, %rdi # imm = 0xAA392F35DC17F00B
266; X64-NEXT:    movq %rcx, %rax
267; X64-NEXT:    mulq %rdi
268; X64-NEXT:    vmovq %rdx, %xmm1
269; X64-NEXT:    movq %rsi, %rax
270; X64-NEXT:    mulq %rdi
271; X64-NEXT:    vmovq %rdx, %xmm2
272; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
273; X64-NEXT:    vpsrlq $9, %xmm1, %xmm1
274; X64-NEXT:    vpsllq $56, %xmm0, %xmm0
275; X64-NEXT:    vpor %xmm1, %xmm0, %xmm0
276; X64-NEXT:    retq
277  %lhs_div = udiv <2 x i64> %i, <i64 3, i64 3>
278  %rhs_div = udiv <2 x i64> %i, <i64 770, i64 770>
279  %lhs_shift = shl <2 x i64> %lhs_div, <i64 56, i64 56>
280  %out = or <2 x i64> %lhs_shift, %rhs_div
281  ret <2 x i64> %out
282}
283
284; DAGCombiner transforms shl X, 1 into add X, X.
; (i + i) | (i >> 31): the add stands in for a shift-left by 1, and
; 1 + 31 = 32, so the assertions expect a single vprold $1 with no separate
; add, shift or or.
285define <4 x i32> @extract_add_1(<4 x i32> %i) nounwind {
286; CHECK-LABEL: extract_add_1:
287; CHECK:       # %bb.0:
288; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
289; CHECK-NEXT:    vprold $1, %zmm0, %zmm0
290; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
291; CHECK-NEXT:    vzeroupper
292; CHECK-NEXT:    ret{{[l|q]}}
293  %ii = add <4 x i32> %i, %i
294  %rhs = lshr <4 x i32> %i, <i32 31, i32 31, i32 31, i32 31>
295  %out = or <4 x i32> %ii, %rhs
296  ret <4 x i32> %out
297}
298
; Commuted form of the add-as-shift rotate: (i >> 31) | (i + i), i.e. the
; operands of the final or are swapped. The assertions expect the same single
; vprold $1.
299define <4 x i32> @extract_add_1_comut(<4 x i32> %i) nounwind {
300; CHECK-LABEL: extract_add_1_comut:
301; CHECK:       # %bb.0:
302; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
303; CHECK-NEXT:    vprold $1, %zmm0, %zmm0
304; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
305; CHECK-NEXT:    vzeroupper
306; CHECK-NEXT:    ret{{[l|q]}}
307  %ii = add <4 x i32> %i, %i
308  %lhs = lshr <4 x i32> %i, <i32 31, i32 31, i32 31, i32 31>
309  %out = or <4 x i32> %lhs, %ii
310  ret <4 x i32> %out
311}
312
; Negative test: (i + i) | (i >> 27). The add is a shift-left by 1 and
; 1 + 27 != 32, so this is not a rotate; the assertions expect vpaddd,
; vpsrld $27 and vpor.
313define <4 x i32> @no_extract_add_1(<4 x i32> %i) nounwind {
314; CHECK-LABEL: no_extract_add_1:
315; CHECK:       # %bb.0:
316; CHECK-NEXT:    vpaddd %xmm0, %xmm0, %xmm1
317; CHECK-NEXT:    vpsrld $27, %xmm0, %xmm0
318; CHECK-NEXT:    vpor %xmm0, %xmm1, %xmm0
319; CHECK-NEXT:    ret{{[l|q]}}
320  %ii = add <4 x i32> %i, %i
321  %rhs = lshr <4 x i32> %i, <i32 27, i32 27, i32 27, i32 27>
322  %out = or <4 x i32> %ii, %rhs
323  ret <4 x i32> %out
324}
325