1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=sse2 | FileCheck %s
3
4; Source file looks something like this:
5;
6; typedef int AAA[100][100];
7;
8; void testCombineMultiplies(AAA a,int lll)
9; {
10;   int LOC = lll + 5;
11;
12;   a[LOC][LOC] = 11;
13;
14;   a[LOC][20] = 22;
15;   a[LOC+20][20] = 33;
16; }
17;
18; We want to make sure we don't generate 2 multiply instructions,
19; one for a[LOC][] and one for a[LOC+20]. visitMUL in DAGCombiner.cpp
20; should combine the instructions in such a way to avoid the extra
21; multiply.
22;
23; Output looks roughly like this:
24;
25;	movl	8(%esp), %eax
26;	movl	12(%esp), %ecx
27;	imull	$400, %ecx, %edx        # imm = 0x190
28;	leal	(%edx,%eax), %esi
29;	movl	$11, 2020(%esi,%ecx,4)
30;	movl	$22, 2080(%edx,%eax)
31;	movl	$33, 10080(%edx,%eax)
32
33; Function Attrs: nounwind
;
; Scalar case. %a is indexed as rows of [100 x i32], i.e. 400 bytes per row.
; The three stores use row indices lll+5, lll+5 and lll+25, so every address
; contains the common term 400*lll. The assertions below expect exactly one
; imull by 400, with the per-store constants folded into the displacements:
;    2020 =  5*400 +  5*4   (a[lll+5][lll+5], plus lll*4 via the scaled index)
;    2080 =  5*400 + 20*4   (a[lll+5][20])
;   10080 = 25*400 + 20*4   (a[lll+25][20])
34define void @testCombineMultiplies([100 x i32]* nocapture %a, i32 %lll) nounwind {
35; CHECK-LABEL: testCombineMultiplies:
36; CHECK:       # %bb.0: # %entry
37; CHECK-NEXT:    pushl %esi
38; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
39; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
40; CHECK-NEXT:    imull $400, %ecx, %edx # imm = 0x190
41; CHECK-NEXT:    leal (%edx,%eax), %esi
42; CHECK-NEXT:    movl $11, 2020(%esi,%ecx,4)
43; CHECK-NEXT:    movl $22, 2080(%edx,%eax)
44; CHECK-NEXT:    movl $33, 10080(%edx,%eax)
45; CHECK-NEXT:    popl %esi
46; CHECK-NEXT:    retl
47entry:
  ; Index lll + 5, used as both row and column of the first store and as the
  ; row of the second store.
48  %add = add nsw i32 %lll, 5
  ; &a[lll+5][lll+5]
49  %arrayidx1 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add, i32 %add
50  store i32 11, i32* %arrayidx1, align 4
  ; &a[lll+5][20]
51  %arrayidx3 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add, i32 20
52  store i32 22, i32* %arrayidx3, align 4
  ; Row index lll + 25; it differs from %add only by a constant, which is what
  ; lets the row multiply be shared with the stores above.
53  %add4 = add nsw i32 %lll, 25
  ; &a[lll+25][20]
54  %arrayidx6 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add4, i32 20
55  store i32 33, i32* %arrayidx6, align 4
56  ret void
57}
58
59
60; Test for the same optimization on vector multiplies.
61;
62; Source looks something like this:
63;
64; typedef int v4int __attribute__((__vector_size__(16)));
65;
66; v4int x;
67; v4int v2, v3;
68; void testCombineMultiplies_splat(v4int v1) {
69;   v2 = (v1 + (v4int){ 11, 11, 11, 11 }) * (v4int) {22, 22, 22, 22};
70;   v3 = (v1 + (v4int){ 33, 33, 33, 33 }) * (v4int) {22, 22, 22, 22};
71;   x = (v1 + (v4int){ 11, 11, 11, 11 });
72; }
73;
74; Output looks something like this:
75;
76; testCombineMultiplies_splat:                              # @testCombineMultiplies_splat
77; # %bb.0:                                 # %entry
78; 	movdqa	.LCPI1_0, %xmm1         # xmm1 = [11,11,11,11]
79; 	paddd	%xmm0, %xmm1
80; 	movdqa	.LCPI1_1, %xmm2         # xmm2 = [22,22,22,22]
81; 	pshufd	$245, %xmm0, %xmm3      # xmm3 = xmm0[1,1,3,3]
82; 	pmuludq	%xmm2, %xmm0
83; 	pshufd	$232, %xmm0, %xmm0      # xmm0 = xmm0[0,2,2,3]
84; 	pmuludq	%xmm2, %xmm3
85; 	pshufd	$232, %xmm3, %xmm2      # xmm2 = xmm3[0,2,2,3]
86; 	punpckldq	%xmm2, %xmm0    # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
87; 	movdqa	.LCPI1_2, %xmm2         # xmm2 = [242,242,242,242]
88;	paddd	%xmm0, %xmm2
89;	paddd	.LCPI1_3, %xmm0
90;	movdqa	%xmm2, v2
91;	movdqa	%xmm0, v3
92;	movdqa	%xmm1, x
93;	retl
94;
95; Again, we want to make sure we don't generate two different multiplies.
96; We should have a single multiply for "v1 * {22, 22, 22, 22}" (made up of two
97; pmuludq instructions), followed by two adds. Without this optimization, we'd
98; do 2 adds, followed by 2 multiplies (i.e. 4 pmuludq instructions).
99
; Destinations for the vector tests in this file: @v2 and @v3 receive the two
; products, @x receives the first sum. Each is a zero-initialized <4 x i32>
; with 16-byte alignment.
100@v2 = common global <4 x i32> zeroinitializer, align 16
101@v3 = common global <4 x i32> zeroinitializer, align 16
102@x = common global <4 x i32> zeroinitializer, align 16
103
104; Function Attrs: nounwind
;
; Splat case. Both products multiply (%v1 + constant) by the same splat
; <22,22,22,22>; only the added constant differs (11 vs 33). The assertions
; below expect a single v1*22 sequence (two pmuludq plus shuffles) followed by
; constant adds. The visible constant [242,242,242,242] is 11*22. The other
; addend is a memory operand matched only by its .LCPI label (presumably the
; 33*22 = 726 splat; the assertions do not pin its value).
; %add1 is also stored to @x, which is why the paddd into xmm1 is still
; expected.
105define void @testCombineMultiplies_splat(<4 x i32> %v1) nounwind {
106; CHECK-LABEL: testCombineMultiplies_splat:
107; CHECK:       # %bb.0: # %entry
108; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [11,11,11,11]
109; CHECK-NEXT:    paddd %xmm0, %xmm1
110; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [22,22,22,22]
111; CHECK-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
112; CHECK-NEXT:    pmuludq %xmm2, %xmm0
113; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
114; CHECK-NEXT:    pmuludq %xmm2, %xmm3
115; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
116; CHECK-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
117; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [242,242,242,242]
118; CHECK-NEXT:    paddd %xmm0, %xmm2
119; CHECK-NEXT:    paddd {{\.LCPI.*}}, %xmm0
120; CHECK-NEXT:    movdqa %xmm2, v2
121; CHECK-NEXT:    movdqa %xmm0, v3
122; CHECK-NEXT:    movdqa %xmm1, x
123; CHECK-NEXT:    retl
124entry:
  ; v1 + 11 (also stored to @x below, so it has a use besides the multiply)
125  %add1 = add <4 x i32> %v1, <i32 11, i32 11, i32 11, i32 11>
  ; (v1 + 11) * 22
126  %mul1 = mul <4 x i32> %add1, <i32 22, i32 22, i32 22, i32 22>
  ; v1 + 33
127  %add2 = add <4 x i32> %v1, <i32 33, i32 33, i32 33, i32 33>
  ; (v1 + 33) * 22 -- same multiplier as %mul1
128  %mul2 = mul <4 x i32> %add2, <i32 22, i32 22, i32 22, i32 22>
129  store <4 x i32> %mul1, <4 x i32>* @v2, align 16
130  store <4 x i32> %mul2, <4 x i32>* @v3, align 16
131  store <4 x i32> %add1, <4 x i32>* @x, align 16
132  ret void
133}
134
135; Finally, check the non-splatted vector case. This is very similar
136; to the previous test case, except for the vector values.
137
138; Function Attrs: nounwind
;
; Non-splat case. Same shape as the splat test, but the multiplier is
; <22,33,44,55> and the addends are <11,22,33,44> and <33,44,55,66>. The
; assertions below still expect a single v1*multiplier sequence (two pmuludq),
; followed by constant adds. The visible constant [242,726,1452,2420] is the
; lane-wise product 11*22, 22*33, 33*44, 44*55. The other addend is matched
; only by its .LCPI label (presumably <726,1452,2420,3630>; the assertions do
; not pin its value).
; Because the multiplier lanes differ, the expected output has an extra pshufd
; that extracts the odd lanes of the multiplier (xmm2[1,1,3,3]) before the
; second pmuludq.
139define void @testCombineMultiplies_non_splat(<4 x i32> %v1) nounwind {
140; CHECK-LABEL: testCombineMultiplies_non_splat:
141; CHECK:       # %bb.0: # %entry
142; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [11,22,33,44]
143; CHECK-NEXT:    paddd %xmm0, %xmm1
144; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [22,33,44,55]
145; CHECK-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
146; CHECK-NEXT:    pmuludq %xmm2, %xmm0
147; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
148; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
149; CHECK-NEXT:    pmuludq %xmm3, %xmm2
150; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
151; CHECK-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
152; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [242,726,1452,2420]
153; CHECK-NEXT:    paddd %xmm0, %xmm2
154; CHECK-NEXT:    paddd {{\.LCPI.*}}, %xmm0
155; CHECK-NEXT:    movdqa %xmm2, v2
156; CHECK-NEXT:    movdqa %xmm0, v3
157; CHECK-NEXT:    movdqa %xmm1, x
158; CHECK-NEXT:    retl
159entry:
  ; v1 + <11,22,33,44> (also stored to @x below)
160  %add1 = add <4 x i32> %v1, <i32 11, i32 22, i32 33, i32 44>
  ; first product, multiplier <22,33,44,55>
161  %mul1 = mul <4 x i32> %add1, <i32 22, i32 33, i32 44, i32 55>
  ; v1 + <33,44,55,66>
162  %add2 = add <4 x i32> %v1, <i32 33, i32 44, i32 55, i32 66>
  ; second product, same multiplier as %mul1
163  %mul2 = mul <4 x i32> %add2, <i32 22, i32 33, i32 44, i32 55>
164  store <4 x i32> %mul1, <4 x i32>* @v2, align 16
165  store <4 x i32> %mul2, <4 x i32>* @v3, align 16
166  store <4 x i32> %add1, <4 x i32>* @x, align 16
167  ret void
168}
169