; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vbmi,+avx512cd,+avx512vpopcntdq,+avx512vnni < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

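; Stack reload folding tests.
;
; By including a nop call with sideeffects we can force a partial register spill of the
; relevant registers and check that the reload is correctly folded into the instruction.
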
define <16 x i32> @stack_fold_vpdpwssd(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpwssd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2)
  ret <16 x i32> %2
}

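; The dot-product multiplicands of VPDPWSSD commute, so the fold should still
; occur when %a1 and %a2 are swapped in the intrinsic call.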
define <16 x i32> @stack_fold_vpdpwssd_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpwssd_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %a0, <16 x i32> %a2, <16 x i32> %a1)
  ret <16 x i32> %2
}

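; Merge-masking: lanes disabled by %mask keep the accumulator value loaded from
; memory, so the folded reload is expected with a {%k1} writemask on the
; destination register.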
define <16 x i32> @stack_fold_vpdpwssd_mask(<16 x i32>* %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
; CHECK-LABEL: stack_fold_vpdpwssd_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <16 x i32>, <16 x i32>* %a0
  %3 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %2, <16 x i32> %a1, <16 x i32> %a2)
  %4 = bitcast i16 %mask to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %2
  ret <16 x i32> %5
}

define <16 x i32> @stack_fold_vpdpwssd_mask_commuted(<16 x i32>* %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
; CHECK-LABEL: stack_fold_vpdpwssd_mask_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <16 x i32>, <16 x i32>* %a0
  %3 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %2, <16 x i32> %a2, <16 x i32> %a1)
  %4 = bitcast i16 %mask to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %2
  ret <16 x i32> %5
}

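; Zero-masking: the select against zeroinitializer corresponds to {%k1} {z}, so
; the folded reload can stay on the original destination register.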
define <16 x i32> @stack_fold_vpdpwssd_maskz(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16* %mask) {
; CHECK-LABEL: stack_fold_vpdpwssd_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw (%rdi), %k1
; CHECK-NEXT:    vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2)
  %3 = load i16, i16* %mask
  %4 = bitcast i16 %3 to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> zeroinitializer
  ret <16 x i32> %5
}

define <16 x i32> @stack_fold_vpdpwssd_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16* %mask) {
; CHECK-LABEL: stack_fold_vpdpwssd_maskz_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw (%rdi), %k1
; CHECK-NEXT:    vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %a0, <16 x i32> %a2, <16 x i32> %a1)
  %3 = load i16, i16* %mask
  %4 = bitcast i16 %3 to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> zeroinitializer
  ret <16 x i32> %5
}

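; The same patterns, repeated for VPDPWSSDS, the signed-saturating form of
; VPDPWSSD.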
define <16 x i32> @stack_fold_vpdpwssds(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpwssds:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2)
  ret <16 x i32> %2
}

define <16 x i32> @stack_fold_vpdpwssds_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpwssds_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %a0, <16 x i32> %a2, <16 x i32> %a1)
  ret <16 x i32> %2
}

define <16 x i32> @stack_fold_vpdpwssds_mask(<16 x i32>* %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
; CHECK-LABEL: stack_fold_vpdpwssds_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <16 x i32>, <16 x i32>* %a0
  %3 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %2, <16 x i32> %a1, <16 x i32> %a2)
  %4 = bitcast i16 %mask to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %2
  ret <16 x i32> %5
}

define <16 x i32> @stack_fold_vpdpwssds_mask_commuted(<16 x i32>* %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
; CHECK-LABEL: stack_fold_vpdpwssds_mask_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <16 x i32>, <16 x i32>* %a0
  %3 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %2, <16 x i32> %a2, <16 x i32> %a1)
  %4 = bitcast i16 %mask to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %2
  ret <16 x i32> %5
}

define <16 x i32> @stack_fold_vpdpwssds_maskz(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16* %mask) {
; CHECK-LABEL: stack_fold_vpdpwssds_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw (%rdi), %k1
; CHECK-NEXT:    vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2)
  %3 = load i16, i16* %mask
  %4 = bitcast i16 %3 to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> zeroinitializer
  ret <16 x i32> %5
}

define <16 x i32> @stack_fold_vpdpwssds_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16* %mask) {
; CHECK-LABEL: stack_fold_vpdpwssds_maskz_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw (%rdi), %k1
; CHECK-NEXT:    vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %a0, <16 x i32> %a2, <16 x i32> %a1)
  %3 = load i16, i16* %mask
  %4 = bitcast i16 %3 to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> zeroinitializer
  ret <16 x i32> %5
}

declare <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>)
declare <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>)