1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2 | FileCheck %s
3
4; Test based on PR5626: loading and storing vector types.
5;
6
; <3 x i32> with 16-byte-aligned accesses: add the vectors loaded from %ap and
; %bp and store the sum through the sret pointer %ret.
; The expected assembly reads each operand with a full 16-byte access (movdqa,
; and a memory operand folded into paddd) but writes back only 12 bytes:
; element 2 via pextrd at offset 8 and the low 8 bytes via movq.
7%i32vec3 = type <3 x i32>
8define void @add3i32(%i32vec3*  sret %ret, %i32vec3* %ap, %i32vec3* %bp)  {
9; CHECK-LABEL: add3i32:
10; CHECK:       # BB#0:
11; CHECK-NEXT:    movdqa (%rsi), %xmm0
12; CHECK-NEXT:    paddd (%rdx), %xmm0
13; CHECK-NEXT:    pextrd $2, %xmm0, 8(%rdi)
14; CHECK-NEXT:    movq %xmm0, (%rdi)
15; CHECK-NEXT:    movq %rdi, %rax
16; CHECK-NEXT:    retq
; IR under test: every load and store below is marked align 16.
17	%a = load %i32vec3, %i32vec3* %ap, align 16
18	%b = load %i32vec3, %i32vec3* %bp, align 16
19	%x = add %i32vec3 %a, %b
20	store %i32vec3 %x, %i32vec3* %ret, align 16
21	ret void
22}
23
; Same <3 x i32> addition as @add3i32, but every access is only align 8.
; The expected assembly no longer uses a single 16-byte load per operand: each
; one is assembled from an 8-byte movq plus a pinsrd of element 2 from offset 8.
; The store is again split into pextrd (offset 8) and movq (low 8 bytes).
; NOTE(review): presumably the split loads are required because align 8 does
; not permit an aligned 16-byte access - confirm against the backend.
24define void @add3i32_2(%i32vec3*  sret %ret, %i32vec3* %ap, %i32vec3* %bp)  {
25; CHECK-LABEL: add3i32_2:
26; CHECK:       # BB#0:
27; CHECK-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
28; CHECK-NEXT:    pinsrd $2, 8(%rsi), %xmm0
29; CHECK-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
30; CHECK-NEXT:    pinsrd $2, 8(%rdx), %xmm1
31; CHECK-NEXT:    paddd %xmm0, %xmm1
32; CHECK-NEXT:    pextrd $2, %xmm1, 8(%rdi)
33; CHECK-NEXT:    movq %xmm1, (%rdi)
34; CHECK-NEXT:    movq %rdi, %rax
35; CHECK-NEXT:    retq
; IR under test: identical to @add3i32 except for align 8 on each access.
36	%a = load %i32vec3, %i32vec3* %ap, align 8
37	%b = load %i32vec3, %i32vec3* %bp, align 8
38	%x = add %i32vec3 %a, %b
39	store %i32vec3 %x, %i32vec3* %ret, align 8
40	ret void
41}
42
; <7 x i32> with 16-byte-aligned accesses: add the vectors loaded from %ap and
; %bp and store the sum through the sret pointer %ret.
; The expected assembly processes the value as two 16-byte halves (movdqa /
; paddd at offsets 0 and 16). The store writes 28 bytes in total: a full movdqa
; at offset 0, a movq at offset 16, and a pextrd of element 2 at offset 24.
43%i32vec7 = type <7 x i32>
44define void @add7i32(%i32vec7*  sret %ret, %i32vec7* %ap, %i32vec7* %bp)  {
45; CHECK-LABEL: add7i32:
46; CHECK:       # BB#0:
47; CHECK-NEXT:    movdqa (%rsi), %xmm0
48; CHECK-NEXT:    movdqa 16(%rsi), %xmm1
49; CHECK-NEXT:    paddd (%rdx), %xmm0
50; CHECK-NEXT:    paddd 16(%rdx), %xmm1
51; CHECK-NEXT:    pextrd $2, %xmm1, 24(%rdi)
52; CHECK-NEXT:    movq %xmm1, 16(%rdi)
53; CHECK-NEXT:    movdqa %xmm0, (%rdi)
54; CHECK-NEXT:    movq %rdi, %rax
55; CHECK-NEXT:    retq
; IR under test: every load and store below is marked align 16.
56	%a = load %i32vec7, %i32vec7* %ap, align 16
57	%b = load %i32vec7, %i32vec7* %bp, align 16
58	%x = add %i32vec7 %a, %b
59	store %i32vec7 %x, %i32vec7* %ret, align 16
60	ret void
61}
62
; <12 x i32> with 16-byte-aligned accesses: add the vectors loaded from %ap and
; %bp and store the sum through the sret pointer %ret.
; The expected assembly handles the value as three full 16-byte pieces
; (offsets 0, 16 and 32): movdqa loads, paddd with memory operands, and movdqa
; stores, with no partial-width accesses.
63%i32vec12 = type <12 x i32>
64define void @add12i32(%i32vec12*  sret %ret, %i32vec12* %ap, %i32vec12* %bp)  {
65; CHECK-LABEL: add12i32:
66; CHECK:       # BB#0:
67; CHECK-NEXT:    movdqa (%rsi), %xmm0
68; CHECK-NEXT:    movdqa 16(%rsi), %xmm1
69; CHECK-NEXT:    movdqa 32(%rsi), %xmm2
70; CHECK-NEXT:    paddd (%rdx), %xmm0
71; CHECK-NEXT:    paddd 16(%rdx), %xmm1
72; CHECK-NEXT:    paddd 32(%rdx), %xmm2
73; CHECK-NEXT:    movdqa %xmm2, 32(%rdi)
74; CHECK-NEXT:    movdqa %xmm1, 16(%rdi)
75; CHECK-NEXT:    movdqa %xmm0, (%rdi)
76; CHECK-NEXT:    movq %rdi, %rax
77; CHECK-NEXT:    retq
; IR under test: every load and store below is marked align 16.
78	%a = load %i32vec12, %i32vec12* %ap, align 16
79	%b = load %i32vec12, %i32vec12* %bp, align 16
80	%x = add %i32vec12 %a, %b
81	store %i32vec12 %x, %i32vec12* %ret, align 16
82	ret void
83}
84
85
; <3 x i16> with 16-byte-aligned accesses: add the vectors loaded from %ap and
; %bp and store the sum through the sret pointer %ret.
; The expected assembly zero-extends each operand's 16-bit elements into 32-bit
; lanes (pmovzxwd) and adds them with paddd. The third element is stored with
; pextrw at offset 4; the first two are repacked (pshufb, pmovzxdq) and stored
; together as 4 bytes with movd.
86%i16vec3 = type <3 x i16>
87define void @add3i16(%i16vec3* nocapture sret %ret, %i16vec3* %ap, %i16vec3* %bp) nounwind {
88; CHECK-LABEL: add3i16:
89; CHECK:       # BB#0:
90; CHECK-NEXT:    pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
91; CHECK-NEXT:    pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
92; CHECK-NEXT:    paddd %xmm0, %xmm1
93; CHECK-NEXT:    pextrw $4, %xmm1, 4(%rdi)
94; CHECK-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
95; CHECK-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
96; CHECK-NEXT:    movd %xmm0, (%rdi)
97; CHECK-NEXT:    movq %rdi, %rax
98; CHECK-NEXT:    retq
; IR under test: every load and store below is marked align 16.
99	%a = load %i16vec3, %i16vec3* %ap, align 16
100	%b = load %i16vec3, %i16vec3* %bp, align 16
101	%x = add %i16vec3 %a, %b
102	store %i16vec3 %x, %i16vec3* %ret, align 16
103	ret void
104}
105
; <4 x i16> with 16-byte-aligned accesses: add the vectors loaded from %ap and
; %bp and store the sum through the sret pointer %ret.
; The expected assembly moves the whole 8-byte value with movq for both loads
; and for the store, and adds with paddw directly on 16-bit lanes.
106%i16vec4 = type <4 x i16>
107define void @add4i16(%i16vec4* nocapture sret %ret, %i16vec4* %ap, %i16vec4* %bp) nounwind {
108; CHECK-LABEL: add4i16:
109; CHECK:       # BB#0:
110; CHECK-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
111; CHECK-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
112; CHECK-NEXT:    paddw %xmm0, %xmm1
113; CHECK-NEXT:    movq %xmm1, (%rdi)
114; CHECK-NEXT:    movq %rdi, %rax
115; CHECK-NEXT:    retq
; IR under test: every load and store below is marked align 16.
116	%a = load %i16vec4, %i16vec4* %ap, align 16
117	%b = load %i16vec4, %i16vec4* %bp, align 16
118	%x = add %i16vec4 %a, %b
119	store %i16vec4 %x, %i16vec4* %ret, align 16
120	ret void
121}
122
; <12 x i16> with 16-byte-aligned accesses: add the vectors loaded from %ap and
; %bp and store the sum through the sret pointer %ret.
; The expected assembly loads and adds two full 16-byte pieces (movdqa /
; paddw at offsets 0 and 16). The store writes 24 bytes: a movdqa at offset 0
; and a movq at offset 16.
123%i16vec12 = type <12 x i16>
124define void @add12i16(%i16vec12* nocapture sret %ret, %i16vec12* %ap, %i16vec12* %bp) nounwind {
125; CHECK-LABEL: add12i16:
126; CHECK:       # BB#0:
127; CHECK-NEXT:    movdqa (%rsi), %xmm0
128; CHECK-NEXT:    movdqa 16(%rsi), %xmm1
129; CHECK-NEXT:    paddw (%rdx), %xmm0
130; CHECK-NEXT:    paddw 16(%rdx), %xmm1
131; CHECK-NEXT:    movq %xmm1, 16(%rdi)
132; CHECK-NEXT:    movdqa %xmm0, (%rdi)
133; CHECK-NEXT:    movq %rdi, %rax
134; CHECK-NEXT:    retq
; IR under test: every load and store below is marked align 16.
135	%a = load %i16vec12, %i16vec12* %ap, align 16
136	%b = load %i16vec12, %i16vec12* %bp, align 16
137	%x = add %i16vec12 %a, %b
138	store %i16vec12 %x, %i16vec12* %ret, align 16
139	ret void
140}
141
; <18 x i16> with 16-byte-aligned accesses: add the vectors loaded from %ap and
; %bp and store the sum through the sret pointer %ret.
; The expected assembly loads and adds three full 16-byte pieces (movdqa /
; paddw at offsets 0, 16 and 32). The store writes 36 bytes: movdqa at offsets
; 0 and 16, then a 4-byte movd at offset 32 for the last two elements.
142%i16vec18 = type <18 x i16>
143define void @add18i16(%i16vec18* nocapture sret %ret, %i16vec18* %ap, %i16vec18* %bp) nounwind {
144; CHECK-LABEL: add18i16:
145; CHECK:       # BB#0:
146; CHECK-NEXT:    movdqa (%rsi), %xmm0
147; CHECK-NEXT:    movdqa 16(%rsi), %xmm1
148; CHECK-NEXT:    movdqa 32(%rsi), %xmm2
149; CHECK-NEXT:    paddw (%rdx), %xmm0
150; CHECK-NEXT:    paddw 16(%rdx), %xmm1
151; CHECK-NEXT:    paddw 32(%rdx), %xmm2
152; CHECK-NEXT:    movd %xmm2, 32(%rdi)
153; CHECK-NEXT:    movdqa %xmm1, 16(%rdi)
154; CHECK-NEXT:    movdqa %xmm0, (%rdi)
155; CHECK-NEXT:    movq %rdi, %rax
156; CHECK-NEXT:    retq
; IR under test: every load and store below is marked align 16.
157	%a = load %i16vec18, %i16vec18* %ap, align 16
158	%b = load %i16vec18, %i16vec18* %bp, align 16
159	%x = add %i16vec18 %a, %b
160	store %i16vec18 %x, %i16vec18* %ret, align 16
161	ret void
162}
163
164
; <3 x i8> with 16-byte-aligned accesses: add the vectors loaded from %ap and
; %bp and store the sum through the sret pointer %ret.
; The expected assembly zero-extends each operand's bytes into 32-bit lanes
; (pmovzxbd) and adds them with paddd. The third element is stored with pextrb
; at offset 2; the first two are repacked (pshufb, pmovzxwq), moved to %eax,
; and stored together as a 2-byte movw.
165%i8vec3 = type <3 x i8>
166define void @add3i8(%i8vec3* nocapture sret %ret, %i8vec3* %ap, %i8vec3* %bp) nounwind {
167; CHECK-LABEL: add3i8:
168; CHECK:       # BB#0:
169; CHECK-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
170; CHECK-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
171; CHECK-NEXT:    paddd %xmm0, %xmm1
172; CHECK-NEXT:    pextrb $8, %xmm1, 2(%rdi)
173; CHECK-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
174; CHECK-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
175; CHECK-NEXT:    movd %xmm0, %eax
176; CHECK-NEXT:    movw %ax, (%rdi)
177; CHECK-NEXT:    movq %rdi, %rax
178; CHECK-NEXT:    retq
; IR under test: every load and store below is marked align 16.
179	%a = load %i8vec3, %i8vec3* %ap, align 16
180	%b = load %i8vec3, %i8vec3* %bp, align 16
181	%x = add %i8vec3 %a, %b
182	store %i8vec3 %x, %i8vec3* %ret, align 16
183	ret void
184}
185
; <31 x i8> with 16-byte-aligned accesses: add the vectors loaded from %ap and
; %bp and store the sum through the sret pointer %ret.
; The expected assembly loads and adds two full 16-byte pieces (movdqa /
; paddb at offsets 0 and 16). The store writes exactly 31 bytes: movdqa at
; offset 0, then for the upper piece movq (8 bytes at 16), pextrd (4 bytes at
; 24), pextrw (2 bytes at 28) and pextrb (1 byte at 30).
186%i8vec31 = type <31 x i8>
187define void @add31i8(%i8vec31* nocapture sret %ret, %i8vec31* %ap, %i8vec31* %bp) nounwind {
188; CHECK-LABEL: add31i8:
189; CHECK:       # BB#0:
190; CHECK-NEXT:    movdqa (%rsi), %xmm0
191; CHECK-NEXT:    movdqa 16(%rsi), %xmm1
192; CHECK-NEXT:    paddb (%rdx), %xmm0
193; CHECK-NEXT:    paddb 16(%rdx), %xmm1
194; CHECK-NEXT:    pextrb $14, %xmm1, 30(%rdi)
195; CHECK-NEXT:    pextrw $6, %xmm1, 28(%rdi)
196; CHECK-NEXT:    pextrd $2, %xmm1, 24(%rdi)
197; CHECK-NEXT:    movq %xmm1, 16(%rdi)
198; CHECK-NEXT:    movdqa %xmm0, (%rdi)
199; CHECK-NEXT:    movq %rdi, %rax
200; CHECK-NEXT:    retq
; IR under test: every load and store below is marked align 16.
201	%a = load %i8vec31, %i8vec31* %ap, align 16
202	%b = load %i8vec31, %i8vec31* %bp, align 16
203	%x = add %i8vec31 %a, %b
204	store %i8vec31 %x, %i8vec31* %ret, align 16
205	ret void
206}
207
208
; Struct wrapping a <3 x i8> vector followed by one i8. @rot accesses only the
; vector member, through pointers bitcast to <3 x i8>*.
; The function stores the constant <-98, -98, -98> into %X and <1, 1, 1> into
; %rot, reloads both structs, extracts the vector field of each, logically
; shifts the first right by the second, and stores the result into %result.
; None of the loads or stores here carries an explicit alignment.
; The expected assembly writes each 3-byte constant as a 2-byte movw plus a
; 1-byte movb at offset 2, performs the shift as psrld $1 on zero-extended
; 32-bit lanes, and stores the result as pextrb (offset 2) plus movw.
; NOTE(review): psrld $1 suggests the shift amount was folded from the
; constant just stored to %rot - confirm against the backend.
209%i8vec3pack = type { <3 x i8>, i8 }
210define void @rot(%i8vec3pack* nocapture sret %result, %i8vec3pack* %X, %i8vec3pack* %rot) nounwind {
211; CHECK-LABEL: rot:
212; CHECK:       # BB#0: # %entry
213; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = <0,4,8,128,u,u,u,u,u,u,u,u,u,u,u,u>
214; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = <158,158,158,u>
215; CHECK-NEXT:    pshufb %xmm0, %xmm1
216; CHECK-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
217; CHECK-NEXT:    movd %xmm1, %eax
218; CHECK-NEXT:    movw %ax, (%rsi)
219; CHECK-NEXT:    movb $-98, 2(%rsi)
220; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = <1,1,1,u>
221; CHECK-NEXT:    pshufb %xmm0, %xmm1
222; CHECK-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
223; CHECK-NEXT:    movd %xmm0, %eax
224; CHECK-NEXT:    movw %ax, (%rdx)
225; CHECK-NEXT:    movb $1, 2(%rdx)
226; CHECK-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
227; CHECK-NEXT:    movdqa %xmm0, %xmm1
228; CHECK-NEXT:    psrld $1, %xmm1
229; CHECK-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm0[6,7]
230; CHECK-NEXT:    pextrb $8, %xmm1, 2(%rdi)
231; CHECK-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
232; CHECK-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
233; CHECK-NEXT:    movd %xmm0, %eax
234; CHECK-NEXT:    movw %ax, (%rdi)
235; CHECK-NEXT:    movq %rdi, %rax
236; CHECK-NEXT:    retq
237entry:
; Initialise the vector member of %X and of %rot with constants.
238  %storetmp = bitcast %i8vec3pack* %X to <3 x i8>*
239  store <3 x i8> <i8 -98, i8 -98, i8 -98>, <3 x i8>* %storetmp
240  %storetmp1 = bitcast %i8vec3pack* %rot to <3 x i8>*
241  store <3 x i8> <i8 1, i8 1, i8 1>, <3 x i8>* %storetmp1
; Reload both values as whole structs and pull out field 0 (the vector).
242  %tmp = load %i8vec3pack, %i8vec3pack* %X
243  %extractVec = extractvalue %i8vec3pack %tmp, 0
244  %tmp2 = load %i8vec3pack, %i8vec3pack* %rot
245  %extractVec3 = extractvalue %i8vec3pack %tmp2, 0
; Element-wise logical shift right, written to the vector member of %result.
246  %shr = lshr <3 x i8> %extractVec, %extractVec3
247  %storetmp4 = bitcast %i8vec3pack* %result to <3 x i8>*
248  store <3 x i8> %shr, <3 x i8>* %storetmp4
249  ret void
250}
251
252