; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512BW

; Check that the unsigned-average idiom computed in a wider type,
;   trunc((zext(a) + zext(b) + 1) >> 1),
; is matched to pavgb/pavgw across 32/64/128/256/512-bit vector widths,
; including variants with a reassociated add and with constant operands.

define void @avg_v4i8(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: avg_v4i8:
; SSE2:       # BB#0:
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    pavgb %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, (%rax)
; SSE2-NEXT:    retq
;
; AVX2-LABEL: avg_v4i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX2-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX2-NEXT:    vpavgb %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vmovd %xmm0, (%rax)
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v4i8:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovd (%rdi), %xmm0
; AVX512BW-NEXT:    vmovd (%rsi), %xmm1
; AVX512BW-NEXT:    vpavgb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT:    vmovd %xmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = load <4 x i8>, <4 x i8>* %b
  %3 = zext <4 x i8> %1 to <4 x i32>
  %4 = zext <4 x i8> %2 to <4 x i32>
  %5 = add nuw nsw <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <4 x i32> %5, %4
  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <4 x i32> %7 to <4 x i8>
  store <4 x i8> %8, <4 x i8>* undef, align 4
  ret void
}

define void @avg_v8i8(<8 x i8>* %a, <8 x i8>* %b) {
; SSE2-LABEL: avg_v8i8:
; SSE2:       # BB#0:
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT:    pavgb %xmm0, %xmm1
; SSE2-NEXT:    movq %xmm1, (%rax)
; SSE2-NEXT:    retq
;
; AVX2-LABEL: avg_v8i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vpavgb %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vmovq %xmm0, (%rax)
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v8i8:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovq (%rdi), %xmm0
; AVX512BW-NEXT:    vmovq (%rsi), %xmm1
; AVX512BW-NEXT:    vpavgb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <8 x i8>, <8 x i8>* %a
  %2 = load <8 x i8>, <8 x i8>* %b
  %3 = zext <8 x i8> %1 to <8 x i32>
  %4 = zext <8 x i8> %2 to <8 x i32>
  %5 = add nuw nsw <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <8 x i32> %5, %4
  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <8 x i32> %7 to <8 x i8>
  store <8 x i8> %8, <8 x i8>* undef, align 4
  ret void
}

define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) {
; SSE2-LABEL: avg_v16i8:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa (%rsi), %xmm0
; SSE2-NEXT:    pavgb (%rdi), %xmm0
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; AVX-LABEL: avg_v16i8:
; AVX:       # BB#0:
; AVX-NEXT:    vmovdqa (%rsi), %xmm0
; AVX-NEXT:    vpavgb (%rdi), %xmm0, %xmm0
; AVX-NEXT:    vmovdqu %xmm0, (%rax)
; AVX-NEXT:    retq
  %1 = load <16 x i8>, <16 x i8>* %a
  %2 = load <16 x i8>, <16 x i8>* %b
  %3 = zext <16 x i8> %1 to <16 x i32>
  %4 = zext <16 x i8> %2 to <16 x i32>
  %5 = add nuw nsw <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <16 x i32> %5, %4
  %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <16 x i32> %7 to <16 x i8>
  store <16 x i8> %8, <16 x i8>* undef, align 4
  ret void
}

define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) {
; AVX2-LABEL: avg_v32i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa (%rsi), %ymm0
; AVX2-NEXT:    vpavgb (%rdi), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v32i8:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovdqa (%rsi), %ymm0
; AVX512BW-NEXT:    vpavgb (%rdi), %ymm0, %ymm0
; AVX512BW-NEXT:    vmovdqu %ymm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <32 x i8>, <32 x i8>* %a
  %2 = load <32 x i8>, <32 x i8>* %b
  %3 = zext <32 x i8> %1 to <32 x i32>
  %4 = zext <32 x i8> %2 to <32 x i32>
  %5 = add nuw nsw <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <32 x i32> %5, %4
  %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <32 x i32> %7 to <32 x i8>
  store <32 x i8> %8, <32 x i8>* undef, align 4
  ret void
}

define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) {
; AVX512BW-LABEL: avg_v64i8:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovdqu8 (%rsi), %zmm0
; AVX512BW-NEXT:    vpavgb (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqu8 %zmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <64 x i8>, <64 x i8>* %a
  %2 = load <64 x i8>, <64 x i8>* %b
  %3 = zext <64 x i8> %1 to <64 x i32>
  %4 = zext <64 x i8> %2 to <64 x i32>
  %5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <64 x i32> %5, %4
  %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <64 x i32> %7 to <64 x i8>
  store <64 x i8> %8, <64 x i8>* undef, align 4
  ret void
}

define void @avg_v4i16(<4 x i16>* %a, <4 x i16>* %b) {
; SSE2-LABEL: avg_v4i16:
; SSE2:       # BB#0:
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT:    pavgw %xmm0, %xmm1
; SSE2-NEXT:    movq %xmm1, (%rax)
; SSE2-NEXT:    retq
;
; AVX2-LABEL: avg_v4i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vpavgw %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vmovq %xmm0, (%rax)
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v4i16:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovq (%rdi), %xmm0
; AVX512BW-NEXT:    vmovq (%rsi), %xmm1
; AVX512BW-NEXT:    vpavgw %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <4 x i16>, <4 x i16>* %a
  %2 = load <4 x i16>, <4 x i16>* %b
  %3 = zext <4 x i16> %1 to <4 x i32>
  %4 = zext <4 x i16> %2 to <4 x i32>
  %5 = add nuw nsw <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <4 x i32> %5, %4
  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <4 x i32> %7 to <4 x i16>
  store <4 x i16> %8, <4 x i16>* undef, align 4
  ret void
}

define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) {
; SSE2-LABEL: avg_v8i16:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa (%rsi), %xmm0
; SSE2-NEXT:    pavgw (%rdi), %xmm0
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; AVX-LABEL: avg_v8i16:
; AVX:       # BB#0:
; AVX-NEXT:    vmovdqa (%rsi), %xmm0
; AVX-NEXT:    vpavgw (%rdi), %xmm0, %xmm0
; AVX-NEXT:    vmovdqu %xmm0, (%rax)
; AVX-NEXT:    retq
  %1 = load <8 x i16>, <8 x i16>* %a
  %2 = load <8 x i16>, <8 x i16>* %b
  %3 = zext <8 x i16> %1 to <8 x i32>
  %4 = zext <8 x i16> %2 to <8 x i32>
  %5 = add nuw nsw <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <8 x i32> %5, %4
  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <8 x i32> %7 to <8 x i16>
  store <8 x i16> %8, <8 x i16>* undef, align 4
  ret void
}

define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) {
; AVX2-LABEL: avg_v16i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa (%rsi), %ymm0
; AVX2-NEXT:    vpavgw (%rdi), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v16i16:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovdqa (%rsi), %ymm0
; AVX512BW-NEXT:    vpavgw (%rdi), %ymm0, %ymm0
; AVX512BW-NEXT:    vmovdqu %ymm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <16 x i16>, <16 x i16>* %a
  %2 = load <16 x i16>, <16 x i16>* %b
  %3 = zext <16 x i16> %1 to <16 x i32>
  %4 = zext <16 x i16> %2 to <16 x i32>
  %5 = add nuw nsw <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <16 x i32> %5, %4
  %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <16 x i32> %7 to <16 x i16>
  store <16 x i16> %8, <16 x i16>* undef, align 4
  ret void
}

define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) {
; AVX512BW-LABEL: avg_v32i16:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovdqu16 (%rsi), %zmm0
; AVX512BW-NEXT:    vpavgw (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqu16 %zmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <32 x i16>, <32 x i16>* %a
  %2 = load <32 x i16>, <32 x i16>* %b
  %3 = zext <32 x i16> %1 to <32 x i32>
  %4 = zext <32 x i16> %2 to <32 x i32>
  %5 = add nuw nsw <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <32 x i32> %5, %4
  %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <32 x i32> %7 to <32 x i16>
  store <32 x i16> %8, <32 x i16>* undef, align 4
  ret void
}

define void @avg_v4i8_2(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: avg_v4i8_2:
; SSE2:       # BB#0:
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    pavgb %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, (%rax)
; SSE2-NEXT:    retq
;
; AVX2-LABEL: avg_v4i8_2:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX2-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX2-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, (%rax)
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v4i8_2:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovd (%rdi), %xmm0
; AVX512BW-NEXT:    vmovd (%rsi), %xmm1
; AVX512BW-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovd %xmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = load <4 x i8>, <4 x i8>* %b
  %3 = zext <4 x i8> %1 to <4 x i32>
  %4 = zext <4 x i8> %2 to <4 x i32>
  %5 = add nuw nsw <4 x i32> %3, %4
  %6 = add nuw nsw <4 x i32> %5, <i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <4 x i32> %7 to <4 x i8>
  store <4 x i8> %8, <4 x i8>* undef, align 4
  ret void
}

define void @avg_v8i8_2(<8 x i8>* %a, <8 x i8>* %b) {
; SSE2-LABEL: avg_v8i8_2:
; SSE2:       # BB#0:
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT:    pavgb %xmm0, %xmm1
; SSE2-NEXT:    movq %xmm1, (%rax)
; SSE2-NEXT:    retq
;
; AVX2-LABEL: avg_v8i8_2:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, (%rax)
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v8i8_2:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovq (%rdi), %xmm0
; AVX512BW-NEXT:    vmovq (%rsi), %xmm1
; AVX512BW-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <8 x i8>, <8 x i8>* %a
  %2 = load <8 x i8>, <8 x i8>* %b
  %3 = zext <8 x i8> %1 to <8 x i32>
  %4 = zext <8 x i8> %2 to <8 x i32>
  %5 = add nuw nsw <8 x i32> %3, %4
  %6 = add nuw nsw <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <8 x i32> %7 to <8 x i8>
  store <8 x i8> %8, <8 x i8>* undef, align 4
  ret void
}

define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) {
; SSE2-LABEL: avg_v16i8_2:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pavgb (%rsi), %xmm0
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; AVX-LABEL: avg_v16i8_2:
; AVX:       # BB#0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpavgb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vmovdqu %xmm0, (%rax)
; AVX-NEXT:    retq
  %1 = load <16 x i8>, <16 x i8>* %a
  %2 = load <16 x i8>, <16 x i8>* %b
  %3 = zext <16 x i8> %1 to <16 x i32>
  %4 = zext <16 x i8> %2 to <16 x i32>
  %5 = add nuw nsw <16 x i32> %3, %4
  %6 = add nuw nsw <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <16 x i32> %7 to <16 x i8>
  store <16 x i8> %8, <16 x i8>* undef, align 4
  ret void
}

define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) {
; AVX2-LABEL: avg_v32i8_2:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpavgb (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v32i8_2:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpavgb (%rsi), %ymm0, %ymm0
; AVX512BW-NEXT:    vmovdqu %ymm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <32 x i8>, <32 x i8>* %a
  %2 = load <32 x i8>, <32 x i8>* %b
  %3 = zext <32 x i8> %1 to <32 x i32>
  %4 = zext <32 x i8> %2 to <32 x i32>
  %5 = add nuw nsw <32 x i32> %3, %4
  %6 = add nuw nsw <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <32 x i32> %7 to <32 x i8>
  store <32 x i8> %8, <32 x i8>* undef, align 4
  ret void
}

define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) {
; AVX512BW-LABEL: avg_v64i8_2:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovdqu8 (%rsi), %zmm0
; AVX512BW-NEXT:    vpavgb %zmm0, %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqu8 %zmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <64 x i8>, <64 x i8>* %a
  %2 = load <64 x i8>, <64 x i8>* %b
  %3 = zext <64 x i8> %1 to <64 x i32>
  %4 = zext <64 x i8> %2 to <64 x i32>
  %5 = add nuw nsw <64 x i32> %4, %4
  %6 = add nuw nsw <64 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <64 x i32> %7 to <64 x i8>
  store <64 x i8> %8, <64 x i8>* undef, align 4
  ret void
}


define void @avg_v4i16_2(<4 x i16>* %a, <4 x i16>* %b) {
; SSE2-LABEL: avg_v4i16_2:
; SSE2:       # BB#0:
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT:    pavgw %xmm0, %xmm1
; SSE2-NEXT:    movq %xmm1, (%rax)
; SSE2-NEXT:    retq
;
; AVX2-LABEL: avg_v4i16_2:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vpavgw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, (%rax)
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v4i16_2:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovq (%rdi), %xmm0
; AVX512BW-NEXT:    vmovq (%rsi), %xmm1
; AVX512BW-NEXT:    vpavgw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <4 x i16>, <4 x i16>* %a
  %2 = load <4 x i16>, <4 x i16>* %b
  %3 = zext <4 x i16> %1 to <4 x i32>
  %4 = zext <4 x i16> %2 to <4 x i32>
  %5 = add nuw nsw <4 x i32> %3, %4
  %6 = add nuw nsw <4 x i32> %5, <i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <4 x i32> %7 to <4 x i16>
  store <4 x i16> %8, <4 x i16>* undef, align 4
  ret void
}

define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) {
; SSE2-LABEL: avg_v8i16_2:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pavgw (%rsi), %xmm0
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; AVX-LABEL: avg_v8i16_2:
; AVX:       # BB#0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpavgw (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vmovdqu %xmm0, (%rax)
; AVX-NEXT:    retq
  %1 = load <8 x i16>, <8 x i16>* %a
  %2 = load <8 x i16>, <8 x i16>* %b
  %3 = zext <8 x i16> %1 to <8 x i32>
  %4 = zext <8 x i16> %2 to <8 x i32>
  %5 = add nuw nsw <8 x i32> %3, %4
  %6 = add nuw nsw <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <8 x i32> %7 to <8 x i16>
  store <8 x i16> %8, <8 x i16>* undef, align 4
  ret void
}

define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) {
; AVX2-LABEL: avg_v16i16_2:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpavgw (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v16i16_2:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpavgw (%rsi), %ymm0, %ymm0
; AVX512BW-NEXT:    vmovdqu %ymm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <16 x i16>, <16 x i16>* %a
  %2 = load <16 x i16>, <16 x i16>* %b
  %3 = zext <16 x i16> %1 to <16 x i32>
  %4 = zext <16 x i16> %2 to <16 x i32>
  %5 = add nuw nsw <16 x i32> %3, %4
  %6 = add nuw nsw <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <16 x i32> %7 to <16 x i16>
  store <16 x i16> %8, <16 x i16>* undef, align 4
  ret void
}

define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) {
; AVX512BW-LABEL: avg_v32i16_2:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovdqu16 (%rdi), %zmm0
; AVX512BW-NEXT:    vpavgw (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqu16 %zmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <32 x i16>, <32 x i16>* %a
  %2 = load <32 x i16>, <32 x i16>* %b
  %3 = zext <32 x i16> %1 to <32 x i32>
  %4 = zext <32 x i16> %2 to <32 x i32>
  %5 = add nuw nsw <32 x i32> %3, %4
  %6 = add nuw nsw <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <32 x i32> %7 to <32 x i16>
  store <32 x i16> %8, <32 x i16>* undef, align 4
  ret void
}

define void @avg_v4i8_const(<4 x i8>* %a) {
; SSE2-LABEL: avg_v4i8_const:
; SSE2:       # BB#0:
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    pavgb {{.*}}(%rip), %xmm0
; SSE2-NEXT:    movd %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; AVX2-LABEL: avg_v4i8_const:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX2-NEXT:    vpavgb {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, (%rax)
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v4i8_const:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovd (%rdi), %xmm0
; AVX512BW-NEXT:    vpavgb {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vmovd %xmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = zext <4 x i8> %1 to <4 x i32>
  %3 = add nuw nsw <4 x i32> %2, <i32 1, i32 2, i32 3, i32 4>
  %4 = lshr <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <4 x i32> %4 to <4 x i8>
  store <4 x i8> %5, <4 x i8>* undef, align 4
  ret void
}

define void @avg_v8i8_const(<8 x i8>* %a) {
; SSE2-LABEL: avg_v8i8_const:
; SSE2:       # BB#0:
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    pavgb {{.*}}(%rip), %xmm0
; SSE2-NEXT:    movq %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; AVX2-LABEL: avg_v8i8_const:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vpavgb {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, (%rax)
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v8i8_const:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovq (%rdi), %xmm0
; AVX512BW-NEXT:    vpavgb {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <8 x i8>, <8 x i8>* %a
  %2 = zext <8 x i8> %1 to <8 x i32>
  %3 = add nuw nsw <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <8 x i32> %4 to <8 x i8>
  store <8 x i8> %5, <8 x i8>* undef, align 4
  ret void
}

define void @avg_v16i8_const(<16 x i8>* %a) {
; SSE2-LABEL: avg_v16i8_const:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pavgb {{.*}}(%rip), %xmm0
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; AVX-LABEL: avg_v16i8_const:
; AVX:       # BB#0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpavgb {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vmovdqu %xmm0, (%rax)
; AVX-NEXT:    retq
  %1 = load <16 x i8>, <16 x i8>* %a
  %2 = zext <16 x i8> %1 to <16 x i32>
  %3 = add nuw nsw <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <16 x i32> %4 to <16 x i8>
  store <16 x i8> %5, <16 x i8>* undef, align 4
  ret void
}
define void @avg_v32i8_const(<32 x i8>* %a) {
; AVX2-LABEL: avg_v32i8_const:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpavgb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v32i8_const:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpavgb {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT:    vmovdqu %ymm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <32 x i8>, <32 x i8>* %a
  %2 = zext <32 x i8> %1 to <32 x i32>
  %3 = add nuw nsw <32 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <32 x i32> %4 to <32 x i8>
  store <32 x i8> %5, <32 x i8>* undef, align 4
  ret void
}

define void @avg_v64i8_const(<64 x i8>* %a) {
; AVX512BW-LABEL: avg_v64i8_const:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovdqu8 (%rdi), %zmm0
; AVX512BW-NEXT:    vpavgb {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqu8 %zmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <64 x i8>, <64 x i8>* %a
  %2 = zext <64 x i8> %1 to <64 x i32>
  %3 = add nuw nsw <64 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <64 x i32> %4 to <64 x i8>
  store <64 x i8> %5, <64 x i8>* undef, align 4
  ret void
}

define void @avg_v4i16_const(<4 x i16>* %a) {
; SSE2-LABEL: avg_v4i16_const:
; SSE2:       # BB#0:
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    pavgw {{.*}}(%rip), %xmm0
; SSE2-NEXT:    movq %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; AVX2-LABEL: avg_v4i16_const:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vpavgw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, (%rax)
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v4i16_const:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovq (%rdi), %xmm0
; AVX512BW-NEXT:    vpavgw {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <4 x i16>, <4 x i16>* %a
  %2 = zext <4 x i16> %1 to <4 x i32>
  %3 = add nuw nsw <4 x i32> %2, <i32 1, i32 2, i32 3, i32 4>
  %4 = lshr <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <4 x i32> %4 to <4 x i16>
  store <4 x i16> %5, <4 x i16>* undef, align 4
  ret void
}

define void @avg_v8i16_const(<8 x i16>* %a) {
; SSE2-LABEL: avg_v8i16_const:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pavgw {{.*}}(%rip), %xmm0
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; AVX-LABEL: avg_v8i16_const:
; AVX:       # BB#0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpavgw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vmovdqu %xmm0, (%rax)
; AVX-NEXT:    retq
  %1 = load <8 x i16>, <8 x i16>* %a
  %2 = zext <8 x i16> %1 to <8 x i32>
  %3 = add nuw nsw <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <8 x i32> %4 to <8 x i16>
  store <8 x i16> %5, <8 x i16>* undef, align 4
  ret void
}

define void @avg_v16i16_const(<16 x i16>* %a) {
; AVX2-LABEL: avg_v16i16_const:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpavgw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v16i16_const:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpavgw {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT:    vmovdqu %ymm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <16 x i16>, <16 x i16>* %a
  %2 = zext <16 x i16> %1 to <16 x i32>
  %3 = add nuw nsw <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <16 x i32> %4 to <16 x i16>
  store <16 x i16> %5, <16 x i16>* undef, align 4
  ret void
}

define void @avg_v32i16_const(<32 x i16>* %a) {
; AVX512BW-LABEL: avg_v32i16_const:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovdqu16 (%rdi), %zmm0
; AVX512BW-NEXT:    vpavgw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqu16 %zmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <32 x i16>, <32 x i16>* %a
  %2 = zext <32 x i16> %1 to <32 x i32>
  %3 = add nuw nsw <32 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <32 x i32> %4 to <32 x i16>
  store <32 x i16> %5, <32 x i16>* undef, align 4
  ret void
}