; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512BW
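; Check that the zext/add/lshr/trunc averaging idioms below are recognized and
; lowered to PAVGB/PAVGW on SSE2, AVX2 and AVX-512BW targets.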

define void @avg_v4i8(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: avg_v4i8:
; SSE2:       # BB#0:
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    pavgb %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, (%rax)
; SSE2-NEXT:    retq
;
; AVX2-LABEL: avg_v4i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX2-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX2-NEXT:    vpavgb %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vmovd %xmm0, (%rax)
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v4i8:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovd (%rdi), %xmm0
; AVX512BW-NEXT:    vmovd (%rsi), %xmm1
; AVX512BW-NEXT:    vpavgb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT:    vmovd %xmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = load <4 x i8>, <4 x i8>* %b
  %3 = zext <4 x i8> %1 to <4 x i32>
  %4 = zext <4 x i8> %2 to <4 x i32>
  %5 = add nuw nsw <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <4 x i32> %5, %4
  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <4 x i32> %7 to <4 x i8>
  store <4 x i8> %8, <4 x i8>* undef, align 4
  ret void
}

define void @avg_v8i8(<8 x i8>* %a, <8 x i8>* %b) {
; SSE2-LABEL: avg_v8i8:
; SSE2:       # BB#0:
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT:    pavgb %xmm0, %xmm1
; SSE2-NEXT:    movq %xmm1, (%rax)
; SSE2-NEXT:    retq
;
; AVX2-LABEL: avg_v8i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vpavgb %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vmovq %xmm0, (%rax)
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v8i8:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovq (%rdi), %xmm0
; AVX512BW-NEXT:    vmovq (%rsi), %xmm1
; AVX512BW-NEXT:    vpavgb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <8 x i8>, <8 x i8>* %a
  %2 = load <8 x i8>, <8 x i8>* %b
  %3 = zext <8 x i8> %1 to <8 x i32>
  %4 = zext <8 x i8> %2 to <8 x i32>
  %5 = add nuw nsw <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <8 x i32> %5, %4
  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <8 x i32> %7 to <8 x i8>
  store <8 x i8> %8, <8 x i8>* undef, align 4
  ret void
}

define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) {
; SSE2-LABEL: avg_v16i8:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa (%rsi), %xmm0
; SSE2-NEXT:    pavgb (%rdi), %xmm0
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; AVX-LABEL: avg_v16i8:
; AVX:       # BB#0:
; AVX-NEXT:    vmovdqa (%rsi), %xmm0
; AVX-NEXT:    vpavgb (%rdi), %xmm0, %xmm0
; AVX-NEXT:    vmovdqu %xmm0, (%rax)
; AVX-NEXT:    retq
  %1 = load <16 x i8>, <16 x i8>* %a
  %2 = load <16 x i8>, <16 x i8>* %b
  %3 = zext <16 x i8> %1 to <16 x i32>
  %4 = zext <16 x i8> %2 to <16 x i32>
  %5 = add nuw nsw <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <16 x i32> %5, %4
  %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <16 x i32> %7 to <16 x i8>
  store <16 x i8> %8, <16 x i8>* undef, align 4
  ret void
}

define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) {
; AVX2-LABEL: avg_v32i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa (%rsi), %ymm0
; AVX2-NEXT:    vpavgb (%rdi), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v32i8:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovdqa (%rsi), %ymm0
; AVX512BW-NEXT:    vpavgb (%rdi), %ymm0, %ymm0
; AVX512BW-NEXT:    vmovdqu %ymm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <32 x i8>, <32 x i8>* %a
  %2 = load <32 x i8>, <32 x i8>* %b
  %3 = zext <32 x i8> %1 to <32 x i32>
  %4 = zext <32 x i8> %2 to <32 x i32>
  %5 = add nuw nsw <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <32 x i32> %5, %4
  %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <32 x i32> %7 to <32 x i8>
  store <32 x i8> %8, <32 x i8>* undef, align 4
  ret void
}

define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) {
; AVX512BW-LABEL: avg_v64i8:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovdqu8 (%rsi), %zmm0
; AVX512BW-NEXT:    vpavgb (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqu8 %zmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <64 x i8>, <64 x i8>* %a
  %2 = load <64 x i8>, <64 x i8>* %b
  %3 = zext <64 x i8> %1 to <64 x i32>
  %4 = zext <64 x i8> %2 to <64 x i32>
  %5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <64 x i32> %5, %4
  %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <64 x i32> %7 to <64 x i8>
  store <64 x i8> %8, <64 x i8>* undef, align 4
  ret void
}

define void @avg_v4i16(<4 x i16>* %a, <4 x i16>* %b) {
; SSE2-LABEL: avg_v4i16:
; SSE2:       # BB#0:
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT:    pavgw %xmm0, %xmm1
; SSE2-NEXT:    movq %xmm1, (%rax)
; SSE2-NEXT:    retq
;
; AVX2-LABEL: avg_v4i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vpavgw %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vmovq %xmm0, (%rax)
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v4i16:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovq (%rdi), %xmm0
; AVX512BW-NEXT:    vmovq (%rsi), %xmm1
; AVX512BW-NEXT:    vpavgw %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <4 x i16>, <4 x i16>* %a
  %2 = load <4 x i16>, <4 x i16>* %b
  %3 = zext <4 x i16> %1 to <4 x i32>
  %4 = zext <4 x i16> %2 to <4 x i32>
  %5 = add nuw nsw <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <4 x i32> %5, %4
  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <4 x i32> %7 to <4 x i16>
  store <4 x i16> %8, <4 x i16>* undef, align 4
  ret void
}

define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) {
; SSE2-LABEL: avg_v8i16:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa (%rsi), %xmm0
; SSE2-NEXT:    pavgw (%rdi), %xmm0
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; AVX-LABEL: avg_v8i16:
; AVX:       # BB#0:
; AVX-NEXT:    vmovdqa (%rsi), %xmm0
; AVX-NEXT:    vpavgw (%rdi), %xmm0, %xmm0
; AVX-NEXT:    vmovdqu %xmm0, (%rax)
; AVX-NEXT:    retq
  %1 = load <8 x i16>, <8 x i16>* %a
  %2 = load <8 x i16>, <8 x i16>* %b
  %3 = zext <8 x i16> %1 to <8 x i32>
  %4 = zext <8 x i16> %2 to <8 x i32>
  %5 = add nuw nsw <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <8 x i32> %5, %4
  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <8 x i32> %7 to <8 x i16>
  store <8 x i16> %8, <8 x i16>* undef, align 4
  ret void
}

define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) {
; AVX2-LABEL: avg_v16i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa (%rsi), %ymm0
; AVX2-NEXT:    vpavgw (%rdi), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v16i16:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovdqa (%rsi), %ymm0
; AVX512BW-NEXT:    vpavgw (%rdi), %ymm0, %ymm0
; AVX512BW-NEXT:    vmovdqu %ymm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <16 x i16>, <16 x i16>* %a
  %2 = load <16 x i16>, <16 x i16>* %b
  %3 = zext <16 x i16> %1 to <16 x i32>
  %4 = zext <16 x i16> %2 to <16 x i32>
  %5 = add nuw nsw <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <16 x i32> %5, %4
  %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <16 x i32> %7 to <16 x i16>
  store <16 x i16> %8, <16 x i16>* undef, align 4
  ret void
}

define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) {
; AVX512BW-LABEL: avg_v32i16:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovdqu16 (%rsi), %zmm0
; AVX512BW-NEXT:    vpavgw (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqu16 %zmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <32 x i16>, <32 x i16>* %a
  %2 = load <32 x i16>, <32 x i16>* %b
  %3 = zext <32 x i16> %1 to <32 x i32>
  %4 = zext <32 x i16> %2 to <32 x i32>
  %5 = add nuw nsw <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <32 x i32> %5, %4
  %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <32 x i32> %7 to <32 x i16>
  store <32 x i16> %8, <32 x i16>* undef, align 4
  ret void
}

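; The same averaging patterns, but with the two operands summed first and the
; rounding constant added afterwards, i.e. (x + y) + 1 instead of (x + 1) + y.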
define void @avg_v4i8_2(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: avg_v4i8_2:
; SSE2:       # BB#0:
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    pavgb %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, (%rax)
; SSE2-NEXT:    retq
;
; AVX2-LABEL: avg_v4i8_2:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX2-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX2-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, (%rax)
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v4i8_2:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovd (%rdi), %xmm0
; AVX512BW-NEXT:    vmovd (%rsi), %xmm1
; AVX512BW-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovd %xmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = load <4 x i8>, <4 x i8>* %b
  %3 = zext <4 x i8> %1 to <4 x i32>
  %4 = zext <4 x i8> %2 to <4 x i32>
  %5 = add nuw nsw <4 x i32> %3, %4
  %6 = add nuw nsw <4 x i32> %5, <i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <4 x i32> %7 to <4 x i8>
  store <4 x i8> %8, <4 x i8>* undef, align 4
  ret void
}

define void @avg_v8i8_2(<8 x i8>* %a, <8 x i8>* %b) {
; SSE2-LABEL: avg_v8i8_2:
; SSE2:       # BB#0:
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT:    pavgb %xmm0, %xmm1
; SSE2-NEXT:    movq %xmm1, (%rax)
; SSE2-NEXT:    retq
;
; AVX2-LABEL: avg_v8i8_2:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, (%rax)
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v8i8_2:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovq (%rdi), %xmm0
; AVX512BW-NEXT:    vmovq (%rsi), %xmm1
; AVX512BW-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <8 x i8>, <8 x i8>* %a
  %2 = load <8 x i8>, <8 x i8>* %b
  %3 = zext <8 x i8> %1 to <8 x i32>
  %4 = zext <8 x i8> %2 to <8 x i32>
  %5 = add nuw nsw <8 x i32> %3, %4
  %6 = add nuw nsw <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <8 x i32> %7 to <8 x i8>
  store <8 x i8> %8, <8 x i8>* undef, align 4
  ret void
}

define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) {
; SSE2-LABEL: avg_v16i8_2:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pavgb (%rsi), %xmm0
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; AVX-LABEL: avg_v16i8_2:
; AVX:       # BB#0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpavgb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vmovdqu %xmm0, (%rax)
; AVX-NEXT:    retq
  %1 = load <16 x i8>, <16 x i8>* %a
  %2 = load <16 x i8>, <16 x i8>* %b
  %3 = zext <16 x i8> %1 to <16 x i32>
  %4 = zext <16 x i8> %2 to <16 x i32>
  %5 = add nuw nsw <16 x i32> %3, %4
  %6 = add nuw nsw <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <16 x i32> %7 to <16 x i8>
  store <16 x i8> %8, <16 x i8>* undef, align 4
  ret void
}

define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) {
; AVX2-LABEL: avg_v32i8_2:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpavgb (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v32i8_2:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpavgb (%rsi), %ymm0, %ymm0
; AVX512BW-NEXT:    vmovdqu %ymm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <32 x i8>, <32 x i8>* %a
  %2 = load <32 x i8>, <32 x i8>* %b
  %3 = zext <32 x i8> %1 to <32 x i32>
  %4 = zext <32 x i8> %2 to <32 x i32>
  %5 = add nuw nsw <32 x i32> %3, %4
  %6 = add nuw nsw <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <32 x i32> %7 to <32 x i8>
  store <32 x i8> %8, <32 x i8>* undef, align 4
  ret void
}

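; Note: the first add below uses %4 for both operands, so only the load from %b
; feeds the PAVGB and a single vector load is expected.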
define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) {
; AVX512BW-LABEL: avg_v64i8_2:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovdqu8 (%rsi), %zmm0
; AVX512BW-NEXT:    vpavgb %zmm0, %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqu8 %zmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <64 x i8>, <64 x i8>* %a
  %2 = load <64 x i8>, <64 x i8>* %b
  %3 = zext <64 x i8> %1 to <64 x i32>
  %4 = zext <64 x i8> %2 to <64 x i32>
  %5 = add nuw nsw <64 x i32> %4, %4
  %6 = add nuw nsw <64 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <64 x i32> %7 to <64 x i8>
  store <64 x i8> %8, <64 x i8>* undef, align 4
  ret void
}

define void @avg_v4i16_2(<4 x i16>* %a, <4 x i16>* %b) {
; SSE2-LABEL: avg_v4i16_2:
; SSE2:       # BB#0:
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT:    pavgw %xmm0, %xmm1
; SSE2-NEXT:    movq %xmm1, (%rax)
; SSE2-NEXT:    retq
;
; AVX2-LABEL: avg_v4i16_2:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vpavgw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, (%rax)
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v4i16_2:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovq (%rdi), %xmm0
; AVX512BW-NEXT:    vmovq (%rsi), %xmm1
; AVX512BW-NEXT:    vpavgw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <4 x i16>, <4 x i16>* %a
  %2 = load <4 x i16>, <4 x i16>* %b
  %3 = zext <4 x i16> %1 to <4 x i32>
  %4 = zext <4 x i16> %2 to <4 x i32>
  %5 = add nuw nsw <4 x i32> %3, %4
  %6 = add nuw nsw <4 x i32> %5, <i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <4 x i32> %7 to <4 x i16>
  store <4 x i16> %8, <4 x i16>* undef, align 4
  ret void
}

define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) {
; SSE2-LABEL: avg_v8i16_2:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pavgw (%rsi), %xmm0
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; AVX-LABEL: avg_v8i16_2:
; AVX:       # BB#0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpavgw (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vmovdqu %xmm0, (%rax)
; AVX-NEXT:    retq
  %1 = load <8 x i16>, <8 x i16>* %a
  %2 = load <8 x i16>, <8 x i16>* %b
  %3 = zext <8 x i16> %1 to <8 x i32>
  %4 = zext <8 x i16> %2 to <8 x i32>
  %5 = add nuw nsw <8 x i32> %3, %4
  %6 = add nuw nsw <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <8 x i32> %7 to <8 x i16>
  store <8 x i16> %8, <8 x i16>* undef, align 4
  ret void
}

define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) {
; AVX2-LABEL: avg_v16i16_2:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpavgw (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v16i16_2:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpavgw (%rsi), %ymm0, %ymm0
; AVX512BW-NEXT:    vmovdqu %ymm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <16 x i16>, <16 x i16>* %a
  %2 = load <16 x i16>, <16 x i16>* %b
  %3 = zext <16 x i16> %1 to <16 x i32>
  %4 = zext <16 x i16> %2 to <16 x i32>
  %5 = add nuw nsw <16 x i32> %3, %4
  %6 = add nuw nsw <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <16 x i32> %7 to <16 x i16>
  store <16 x i16> %8, <16 x i16>* undef, align 4
  ret void
}

define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) {
; AVX512BW-LABEL: avg_v32i16_2:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovdqu16 (%rdi), %zmm0
; AVX512BW-NEXT:    vpavgw (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqu16 %zmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <32 x i16>, <32 x i16>* %a
  %2 = load <32 x i16>, <32 x i16>* %b
  %3 = zext <32 x i16> %1 to <32 x i32>
  %4 = zext <32 x i16> %2 to <32 x i32>
  %5 = add nuw nsw <32 x i32> %3, %4
  %6 = add nuw nsw <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <32 x i32> %7 to <32 x i16>
  store <32 x i16> %8, <32 x i16>* undef, align 4
  ret void
}

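; Averages against constant vectors; the constants are expected to be folded
; into rip-relative memory operands of PAVGB/PAVGW.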
define void @avg_v4i8_const(<4 x i8>* %a) {
; SSE2-LABEL: avg_v4i8_const:
; SSE2:       # BB#0:
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    pavgb {{.*}}(%rip), %xmm0
; SSE2-NEXT:    movd %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; AVX2-LABEL: avg_v4i8_const:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX2-NEXT:    vpavgb {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, (%rax)
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v4i8_const:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovd (%rdi), %xmm0
; AVX512BW-NEXT:    vpavgb {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vmovd %xmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = zext <4 x i8> %1 to <4 x i32>
  %3 = add nuw nsw <4 x i32> %2, <i32 1, i32 2, i32 3, i32 4>
  %4 = lshr <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <4 x i32> %4 to <4 x i8>
  store <4 x i8> %5, <4 x i8>* undef, align 4
  ret void
}

define void @avg_v8i8_const(<8 x i8>* %a) {
; SSE2-LABEL: avg_v8i8_const:
; SSE2:       # BB#0:
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    pavgb {{.*}}(%rip), %xmm0
; SSE2-NEXT:    movq %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; AVX2-LABEL: avg_v8i8_const:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vpavgb {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, (%rax)
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v8i8_const:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovq (%rdi), %xmm0
; AVX512BW-NEXT:    vpavgb {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <8 x i8>, <8 x i8>* %a
  %2 = zext <8 x i8> %1 to <8 x i32>
  %3 = add nuw nsw <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <8 x i32> %4 to <8 x i8>
  store <8 x i8> %5, <8 x i8>* undef, align 4
  ret void
}

define void @avg_v16i8_const(<16 x i8>* %a) {
; SSE2-LABEL: avg_v16i8_const:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pavgb {{.*}}(%rip), %xmm0
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; AVX-LABEL: avg_v16i8_const:
; AVX:       # BB#0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpavgb {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vmovdqu %xmm0, (%rax)
; AVX-NEXT:    retq
  %1 = load <16 x i8>, <16 x i8>* %a
  %2 = zext <16 x i8> %1 to <16 x i32>
  %3 = add nuw nsw <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <16 x i32> %4 to <16 x i8>
  store <16 x i8> %5, <16 x i8>* undef, align 4
  ret void
}

define void @avg_v32i8_const(<32 x i8>* %a) {
; AVX2-LABEL: avg_v32i8_const:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpavgb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v32i8_const:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpavgb {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT:    vmovdqu %ymm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <32 x i8>, <32 x i8>* %a
  %2 = zext <32 x i8> %1 to <32 x i32>
  %3 = add nuw nsw <32 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <32 x i32> %4 to <32 x i8>
  store <32 x i8> %5, <32 x i8>* undef, align 4
  ret void
}

define void @avg_v64i8_const(<64 x i8>* %a) {
; AVX512BW-LABEL: avg_v64i8_const:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovdqu8 (%rdi), %zmm0
; AVX512BW-NEXT:    vpavgb {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqu8 %zmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <64 x i8>, <64 x i8>* %a
  %2 = zext <64 x i8> %1 to <64 x i32>
  %3 = add nuw nsw <64 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <64 x i32> %4 to <64 x i8>
  store <64 x i8> %5, <64 x i8>* undef, align 4
  ret void
}

define void @avg_v4i16_const(<4 x i16>* %a) {
; SSE2-LABEL: avg_v4i16_const:
; SSE2:       # BB#0:
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    pavgw {{.*}}(%rip), %xmm0
; SSE2-NEXT:    movq %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; AVX2-LABEL: avg_v4i16_const:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vpavgw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, (%rax)
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v4i16_const:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovq (%rdi), %xmm0
; AVX512BW-NEXT:    vpavgw {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <4 x i16>, <4 x i16>* %a
  %2 = zext <4 x i16> %1 to <4 x i32>
  %3 = add nuw nsw <4 x i32> %2, <i32 1, i32 2, i32 3, i32 4>
  %4 = lshr <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <4 x i32> %4 to <4 x i16>
  store <4 x i16> %5, <4 x i16>* undef, align 4
  ret void
}

define void @avg_v8i16_const(<8 x i16>* %a) {
; SSE2-LABEL: avg_v8i16_const:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pavgw {{.*}}(%rip), %xmm0
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; AVX-LABEL: avg_v8i16_const:
; AVX:       # BB#0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpavgw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vmovdqu %xmm0, (%rax)
; AVX-NEXT:    retq
  %1 = load <8 x i16>, <8 x i16>* %a
  %2 = zext <8 x i16> %1 to <8 x i32>
  %3 = add nuw nsw <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <8 x i32> %4 to <8 x i16>
  store <8 x i16> %5, <8 x i16>* undef, align 4
  ret void
}

define void @avg_v16i16_const(<16 x i16>* %a) {
; AVX2-LABEL: avg_v16i16_const:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpavgw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v16i16_const:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpavgw {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT:    vmovdqu %ymm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <16 x i16>, <16 x i16>* %a
  %2 = zext <16 x i16> %1 to <16 x i32>
  %3 = add nuw nsw <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <16 x i32> %4 to <16 x i16>
  store <16 x i16> %5, <16 x i16>* undef, align 4
  ret void
}

define void @avg_v32i16_const(<32 x i16>* %a) {
; AVX512BW-LABEL: avg_v32i16_const:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovdqu16 (%rdi), %zmm0
; AVX512BW-NEXT:    vpavgw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqu16 %zmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <32 x i16>, <32 x i16>* %a
  %2 = zext <32 x i16> %1 to <32 x i32>
  %3 = add nuw nsw <32 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <32 x i32> %4 to <32 x i16>
  store <32 x i16> %5, <32 x i16>* undef, align 4
  ret void
}