1//===- X86InstrVecCompiler.td - Vector Compiler Patterns ---*- tablegen -*-===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file describes the various vector pseudo instructions used by the
11// compiler, as well as Pat patterns used during instruction selection.
12//
13//===----------------------------------------------------------------------===//
14
15//===----------------------------------------------------------------------===//
16// No op bitconverts
17//===----------------------------------------------------------------------===//
18
19// Bitcasts between 128-bit vector types. Each pattern selects the source
20// VR128 register unchanged, retyped as the destination type, since no
// instruction is needed for the conversion. Every ordered pair of distinct
// types among v2i64, v4i32, v8i16, v16i8, v4f32 and v2f64 is listed, grouped
// by destination type.
21def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
22def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
23def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
24def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
25def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
26def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
27def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
28def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
29def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
30def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
31def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
32def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
33def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
34def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
35def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
36def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
37def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
38def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
39def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
40def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
41def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
42def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
43def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
44def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
45def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
46def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
47def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
48def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
49def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
50def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
51
52// Bitcasts between 256-bit vector types. Each pattern selects the source
53// VR256 register unchanged, retyped as the destination type, since no
// instruction is needed for the conversion. Every ordered pair of distinct
// types among v4i64, v8i32, v16i16, v32i8, v8f32 and v4f64 is listed, grouped
// by destination type.
54def : Pat<(v4i64  (bitconvert (v8i32  VR256:$src))), (v4i64  VR256:$src)>;
55def : Pat<(v4i64  (bitconvert (v16i16 VR256:$src))), (v4i64  VR256:$src)>;
56def : Pat<(v4i64  (bitconvert (v32i8  VR256:$src))), (v4i64  VR256:$src)>;
57def : Pat<(v4i64  (bitconvert (v8f32  VR256:$src))), (v4i64  VR256:$src)>;
58def : Pat<(v4i64  (bitconvert (v4f64  VR256:$src))), (v4i64  VR256:$src)>;
59def : Pat<(v8i32  (bitconvert (v4i64  VR256:$src))), (v8i32  VR256:$src)>;
60def : Pat<(v8i32  (bitconvert (v16i16 VR256:$src))), (v8i32  VR256:$src)>;
61def : Pat<(v8i32  (bitconvert (v32i8  VR256:$src))), (v8i32  VR256:$src)>;
62def : Pat<(v8i32  (bitconvert (v4f64  VR256:$src))), (v8i32  VR256:$src)>;
63def : Pat<(v8i32  (bitconvert (v8f32  VR256:$src))), (v8i32  VR256:$src)>;
64def : Pat<(v16i16 (bitconvert (v4i64  VR256:$src))), (v16i16 VR256:$src)>;
65def : Pat<(v16i16 (bitconvert (v8i32  VR256:$src))), (v16i16 VR256:$src)>;
66def : Pat<(v16i16 (bitconvert (v32i8  VR256:$src))), (v16i16 VR256:$src)>;
67def : Pat<(v16i16 (bitconvert (v4f64  VR256:$src))), (v16i16 VR256:$src)>;
68def : Pat<(v16i16 (bitconvert (v8f32  VR256:$src))), (v16i16 VR256:$src)>;
69def : Pat<(v32i8  (bitconvert (v4i64  VR256:$src))), (v32i8  VR256:$src)>;
70def : Pat<(v32i8  (bitconvert (v8i32  VR256:$src))), (v32i8  VR256:$src)>;
71def : Pat<(v32i8  (bitconvert (v16i16 VR256:$src))), (v32i8  VR256:$src)>;
72def : Pat<(v32i8  (bitconvert (v4f64  VR256:$src))), (v32i8  VR256:$src)>;
73def : Pat<(v32i8  (bitconvert (v8f32  VR256:$src))), (v32i8  VR256:$src)>;
74def : Pat<(v8f32  (bitconvert (v4i64  VR256:$src))), (v8f32  VR256:$src)>;
75def : Pat<(v8f32  (bitconvert (v8i32  VR256:$src))), (v8f32  VR256:$src)>;
76def : Pat<(v8f32  (bitconvert (v16i16 VR256:$src))), (v8f32  VR256:$src)>;
77def : Pat<(v8f32  (bitconvert (v32i8  VR256:$src))), (v8f32  VR256:$src)>;
78def : Pat<(v8f32  (bitconvert (v4f64  VR256:$src))), (v8f32  VR256:$src)>;
79def : Pat<(v4f64  (bitconvert (v4i64  VR256:$src))), (v4f64  VR256:$src)>;
80def : Pat<(v4f64  (bitconvert (v8i32  VR256:$src))), (v4f64  VR256:$src)>;
81def : Pat<(v4f64  (bitconvert (v16i16 VR256:$src))), (v4f64  VR256:$src)>;
82def : Pat<(v4f64  (bitconvert (v32i8  VR256:$src))), (v4f64  VR256:$src)>;
83def : Pat<(v4f64  (bitconvert (v8f32  VR256:$src))), (v4f64  VR256:$src)>;
84
85// Bitcasts between 512-bit vector types. Each pattern selects the source
86// VR512 register unchanged, retyped as the destination type, since no
// instruction is needed for the conversion. Every ordered pair of distinct
// types among v8f64, v16f32, v8i64, v16i32, v32i16 and v64i8 is listed,
// grouped by destination type.
87def : Pat<(v8f64  (bitconvert (v8i64  VR512:$src))), (v8f64  VR512:$src)>;
88def : Pat<(v8f64  (bitconvert (v16i32 VR512:$src))), (v8f64  VR512:$src)>;
89def : Pat<(v8f64  (bitconvert (v32i16 VR512:$src))), (v8f64  VR512:$src)>;
90def : Pat<(v8f64  (bitconvert (v64i8  VR512:$src))), (v8f64  VR512:$src)>;
91def : Pat<(v8f64  (bitconvert (v16f32 VR512:$src))), (v8f64  VR512:$src)>;
92def : Pat<(v16f32 (bitconvert (v8i64  VR512:$src))), (v16f32 VR512:$src)>;
93def : Pat<(v16f32 (bitconvert (v16i32 VR512:$src))), (v16f32 VR512:$src)>;
94def : Pat<(v16f32 (bitconvert (v32i16 VR512:$src))), (v16f32 VR512:$src)>;
95def : Pat<(v16f32 (bitconvert (v64i8  VR512:$src))), (v16f32 VR512:$src)>;
96def : Pat<(v16f32 (bitconvert (v8f64  VR512:$src))), (v16f32 VR512:$src)>;
97def : Pat<(v8i64  (bitconvert (v16i32 VR512:$src))), (v8i64  VR512:$src)>;
98def : Pat<(v8i64  (bitconvert (v32i16 VR512:$src))), (v8i64  VR512:$src)>;
99def : Pat<(v8i64  (bitconvert (v64i8  VR512:$src))), (v8i64  VR512:$src)>;
100def : Pat<(v8i64  (bitconvert (v8f64  VR512:$src))), (v8i64  VR512:$src)>;
101def : Pat<(v8i64  (bitconvert (v16f32 VR512:$src))), (v8i64  VR512:$src)>;
102def : Pat<(v16i32 (bitconvert (v8i64  VR512:$src))), (v16i32 VR512:$src)>;
103def : Pat<(v16i32 (bitconvert (v16f32 VR512:$src))), (v16i32 VR512:$src)>;
104def : Pat<(v16i32 (bitconvert (v32i16 VR512:$src))), (v16i32 VR512:$src)>;
105def : Pat<(v16i32 (bitconvert (v64i8  VR512:$src))), (v16i32 VR512:$src)>;
106def : Pat<(v16i32 (bitconvert (v8f64  VR512:$src))), (v16i32 VR512:$src)>;
107def : Pat<(v32i16 (bitconvert (v8i64  VR512:$src))), (v32i16 VR512:$src)>;
108def : Pat<(v32i16 (bitconvert (v16i32 VR512:$src))), (v32i16 VR512:$src)>;
109def : Pat<(v32i16 (bitconvert (v64i8  VR512:$src))), (v32i16 VR512:$src)>;
110def : Pat<(v32i16 (bitconvert (v8f64  VR512:$src))), (v32i16 VR512:$src)>;
111def : Pat<(v32i16 (bitconvert (v16f32 VR512:$src))), (v32i16 VR512:$src)>;
112def : Pat<(v64i8  (bitconvert (v8i64  VR512:$src))), (v64i8  VR512:$src)>;
113def : Pat<(v64i8  (bitconvert (v16i32 VR512:$src))), (v64i8  VR512:$src)>;
114def : Pat<(v64i8  (bitconvert (v32i16 VR512:$src))), (v64i8  VR512:$src)>;
115def : Pat<(v64i8  (bitconvert (v8f64  VR512:$src))), (v64i8  VR512:$src)>;
116def : Pat<(v64i8  (bitconvert (v16f32 VR512:$src))), (v64i8  VR512:$src)>;
117
118
119//===----------------------------------------------------------------------===//
120//  Non-instruction patterns
121//===----------------------------------------------------------------------===//
122
123// A vector extract of element 0 of a v4f32/v2f64 is selected as a plain
// COPY_TO_REGCLASS of the VR128 source to FR32/FR64; no extract instruction
// is used.
124def : Pat<(f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
125          (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>;
126def : Pat<(f64 (extractelt (v2f64 VR128:$src), (iPTR 0))),
127          (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64)>;
128
129// Implicitly promote a 32-bit scalar to a vector: scalar_to_vector of an FR32
// value is selected as a COPY_TO_REGCLASS into VR128.
130def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
131          (COPY_TO_REGCLASS FR32:$src, VR128)>;
132// Implicitly promote a 64-bit scalar to a vector: scalar_to_vector of an FR64
// value is selected as a COPY_TO_REGCLASS into VR128.
133def : Pat<(v2f64 (scalar_to_vector FR64:$src)),
134          (COPY_TO_REGCLASS FR64:$src, VR128)>;
135
136
137//===----------------------------------------------------------------------===//
138// Subvector tricks
139//===----------------------------------------------------------------------===//
140
141// Patterns for insert_subvector/extract_subvector to/from index=0.
//
// Template arguments:
//   subRC, subVT - register class and value type of the subvector.
//   RC, VT       - register class and value type of the wide vector.
//   subIdx       - subregister index passed to EXTRACT_SUBREG/INSERT_SUBREG.
142multiclass subvector_subreg_lowering<RegisterClass subRC, ValueType subVT,
143                                     RegisterClass RC, ValueType VT,
144                                     SubRegIndex subIdx> {
  // Extracting the subvector at index 0 is selected as EXTRACT_SUBREG.
145  def : Pat<(subVT (extract_subvector (VT RC:$src), (iPTR 0))),
146            (subVT (EXTRACT_SUBREG RC:$src, subIdx))>;
147
  // Inserting the subvector at index 0 of an undef vector is selected as
  // INSERT_SUBREG into an IMPLICIT_DEF.
148  def : Pat<(VT (insert_subvector undef, subRC:$src, (iPTR 0))),
149            (VT (INSERT_SUBREG (IMPLICIT_DEF), subRC:$src, subIdx))>;
150}
151
152// A 128-bit subvector extract from the first 256-bit vector position is a
153// subregister copy that needs no instruction. Likewise, a 128-bit subvector
154// insert to the first 256-bit vector position is a subregister copy that needs
155// no instruction. Both directions use the sub_xmm subregister index.
156defm : subvector_subreg_lowering<VR128, v4i32, VR256, v8i32,  sub_xmm>;
157defm : subvector_subreg_lowering<VR128, v4f32, VR256, v8f32,  sub_xmm>;
158defm : subvector_subreg_lowering<VR128, v2i64, VR256, v4i64,  sub_xmm>;
159defm : subvector_subreg_lowering<VR128, v2f64, VR256, v4f64,  sub_xmm>;
160defm : subvector_subreg_lowering<VR128, v8i16, VR256, v16i16, sub_xmm>;
161defm : subvector_subreg_lowering<VR128, v16i8, VR256, v32i8,  sub_xmm>;
162
163// A 128-bit subvector extract from the first 512-bit vector position is a
164// subregister copy that needs no instruction. Likewise, a 128-bit subvector
165// insert to the first 512-bit vector position is a subregister copy that needs
166// no instruction. Both directions use the sub_xmm subregister index.
167defm : subvector_subreg_lowering<VR128, v4i32, VR512, v16i32, sub_xmm>;
168defm : subvector_subreg_lowering<VR128, v4f32, VR512, v16f32, sub_xmm>;
169defm : subvector_subreg_lowering<VR128, v2i64, VR512, v8i64,  sub_xmm>;
170defm : subvector_subreg_lowering<VR128, v2f64, VR512, v8f64,  sub_xmm>;
171defm : subvector_subreg_lowering<VR128, v8i16, VR512, v32i16, sub_xmm>;
172defm : subvector_subreg_lowering<VR128, v16i8, VR512, v64i8,  sub_xmm>;
173
174// A 256-bit subvector extract from the first 512-bit vector position is a
175// subregister copy that needs no instruction. Likewise, a 256-bit subvector
176// insert to the first 512-bit vector position is a subregister copy that needs
177// no instruction. Both directions use the sub_ymm subregister index.
178defm : subvector_subreg_lowering<VR256, v8i32,  VR512, v16i32, sub_ymm>;
179defm : subvector_subreg_lowering<VR256, v8f32,  VR512, v16f32, sub_ymm>;
180defm : subvector_subreg_lowering<VR256, v4i64,  VR512, v8i64,  sub_ymm>;
181defm : subvector_subreg_lowering<VR256, v4f64,  VR512, v8f64,  sub_ymm>;
182defm : subvector_subreg_lowering<VR256, v16i16, VR512, v32i16, sub_ymm>;
183defm : subvector_subreg_lowering<VR256, v32i8,  VR512, v64i8,  sub_ymm>;
184
185
// Patterns that fold a store of the index-0 subvector of a wider vector into
// a single store of the corresponding subregister.
//
// Template arguments:
//   AlignedStr   - instruction name fragment for the alignedstore pattern; the
//                  selected instruction is "VMOV" # AlignedStr # "mr".
//   UnalignedStr - same, for the plain store pattern
//                  ("VMOV" # UnalignedStr # "mr").
//   RC, SrcTy    - register class and value type of the wide source vector.
//   DstTy        - value type of the stored subvector.
//   SubIdx       - subregister index passed to EXTRACT_SUBREG.
186multiclass subvector_store_lowering<string AlignedStr, string UnalignedStr,
187                                    RegisterClass RC, ValueType DstTy,
188                                    ValueType SrcTy, SubRegIndex SubIdx> {
  // alignedstore of the low subvector -> aligned VMOV store of the subregister.
189  def : Pat<(alignedstore (DstTy (extract_subvector
190                                  (SrcTy RC:$src), (iPTR 0))), addr:$dst),
191            (!cast<Instruction>("VMOV"#AlignedStr#"mr") addr:$dst,
192             (DstTy (EXTRACT_SUBREG RC:$src, SubIdx)))>;
193
  // store of the low subvector -> unaligned VMOV store of the subregister.
194  def : Pat<(store (DstTy (extract_subvector
195                           (SrcTy RC:$src), (iPTR 0))), addr:$dst),
196            (!cast<Instruction>("VMOV"#UnalignedStr#"mr") addr:$dst,
197             (DstTy (EXTRACT_SUBREG RC:$src, SubIdx)))>;
198}
199
// Stores of the low 128 bits (sub_xmm) of a 256-bit vector when AVX is
// available and VLX is not. The name fragments expand to VMOVAPDmr/VMOVUPDmr,
// VMOVAPSmr/VMOVUPSmr and VMOVDQAmr/VMOVDQUmr.
200let Predicates = [HasAVX, NoVLX] in {
201  defm : subvector_store_lowering<"APD", "UPD", VR256X, v2f64, v4f64,  sub_xmm>;
202  defm : subvector_store_lowering<"APS", "UPS", VR256X, v4f32, v8f32,  sub_xmm>;
203  defm : subvector_store_lowering<"DQA", "DQU", VR256X, v2i64, v4i64,  sub_xmm>;
204  defm : subvector_store_lowering<"DQA", "DQU", VR256X, v4i32, v8i32,  sub_xmm>;
205  defm : subvector_store_lowering<"DQA", "DQU", VR256X, v8i16, v16i16, sub_xmm>;
206  defm : subvector_store_lowering<"DQA", "DQU", VR256X, v16i8, v32i8,  sub_xmm>;
207}
208
// Subvector-extract stores when VLX is available, using the Z128/Z256 forms of
// the VMOV store instructions. All integer element types use the DQA64/DQU64
// forms.
209let Predicates = [HasVLX] in {
210  // Special patterns for storing subvector extracts of lower 128-bits of 256.
211  // It's cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
212  defm : subvector_store_lowering<"APDZ128", "UPDZ128", VR256X, v2f64, v4f64,
213                                  sub_xmm>;
214  defm : subvector_store_lowering<"APSZ128", "UPSZ128", VR256X, v4f32, v8f32,
215                                  sub_xmm>;
216  defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR256X, v2i64,
217                                  v4i64, sub_xmm>;
218  defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR256X, v4i32,
219                                  v8i32, sub_xmm>;
220  defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR256X, v8i16,
221                                  v16i16, sub_xmm>;
222  defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR256X, v16i8,
223                                  v32i8, sub_xmm>;
224
225  // Special patterns for storing subvector extracts of lower 128-bits of 512.
226  // It's cheaper to just use VMOVAPS/VMOVUPS instead of an extract-to-memory
  // instruction.
227  defm : subvector_store_lowering<"APDZ128", "UPDZ128", VR512, v2f64, v8f64,
228                                  sub_xmm>;
229  defm : subvector_store_lowering<"APSZ128", "UPSZ128", VR512, v4f32, v16f32,
230                                  sub_xmm>;
231  defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR512, v2i64,
232                                  v8i64, sub_xmm>;
233  defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR512, v4i32,
234                                  v16i32, sub_xmm>;
235  defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR512, v8i16,
236                                  v32i16, sub_xmm>;
237  defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR512, v16i8,
238                                  v64i8, sub_xmm>;
239
240  // Special patterns for storing subvector extracts of lower 256-bits of 512.
241  // It's cheaper to just use VMOVAPS/VMOVUPS instead of an extract-to-memory
  // instruction.
242  defm : subvector_store_lowering<"APDZ256", "UPDZ256", VR512, v4f64, v8f64,
243                                  sub_ymm>;
244  defm : subvector_store_lowering<"APSZ256", "UPSZ256", VR512, v8f32, v16f32,
245                                  sub_ymm>;
246  defm : subvector_store_lowering<"DQA64Z256", "DQU64Z256", VR512, v4i64,
247                                  v8i64, sub_ymm>;
248  defm : subvector_store_lowering<"DQA64Z256", "DQU64Z256", VR512, v8i32,
249                                  v16i32, sub_ymm>;
250  defm : subvector_store_lowering<"DQA64Z256", "DQU64Z256", VR512, v16i16,
251                                  v32i16, sub_ymm>;
252  defm : subvector_store_lowering<"DQA64Z256", "DQU64Z256", VR512, v32i8,
253                                  v64i8, sub_ymm>;
254}
255
256// If we're inserting into an all zeros vector, just use a plain move which
257// will zero the upper bits. A post-isel hook will take care of removing
258// any moves that we can prove are unnecessary.
//
// Template arguments:
//   MoveStr      - instruction name fragment; the selected move is
//                  "VMOV" # MoveStr # "rr".
//   RC, SrcTy    - register class and value type of the inserted subvector.
//   DstTy        - value type of the resulting wide vector.
//   ZeroTy       - value type of the immAllZerosV node under the bitconvert.
//   SubIdx       - subregister index passed to SUBREG_TO_REG.
259multiclass subvec_zero_lowering<string MoveStr,
260                                RegisterClass RC, ValueType DstTy,
261                                ValueType SrcTy, ValueType ZeroTy,
262                                SubRegIndex SubIdx> {
  // insert_subvector at index 0 into a (bitconverted) all-zeros vector becomes
  // a register-to-register move wrapped in SUBREG_TO_REG.
263  def : Pat<(DstTy (insert_subvector (bitconvert (ZeroTy immAllZerosV)),
264                                     (SrcTy RC:$src), (iPTR 0))),
265            (SUBREG_TO_REG (i64 0),
266             (SrcTy (!cast<Instruction>("VMOV"#MoveStr#"rr") RC:$src)), SubIdx)>;
267}
268
// 128-bit subvector inserted into an all-zeros 256-bit vector when AVX is
// available and VLX is not. The zero vector is matched as v8i32 in every case.
269let Predicates = [HasAVX, NoVLX] in {
270  defm : subvec_zero_lowering<"APD", VR128, v4f64, v2f64, v8i32, sub_xmm>;
271  defm : subvec_zero_lowering<"APS", VR128, v8f32, v4f32, v8i32, sub_xmm>;
272  defm : subvec_zero_lowering<"DQA", VR128, v4i64, v2i64, v8i32, sub_xmm>;
273  defm : subvec_zero_lowering<"DQA", VR128, v8i32, v4i32, v8i32, sub_xmm>;
274  defm : subvec_zero_lowering<"DQA", VR128, v16i16, v8i16, v8i32, sub_xmm>;
275  defm : subvec_zero_lowering<"DQA", VR128, v32i8, v16i8, v8i32, sub_xmm>;
276}
277
// Zero-insert lowering when VLX is available, using the Z128/Z256 move forms
// on the VR128X/VR256X register classes.
278let Predicates = [HasVLX] in {
  // 128-bit subvector into an all-zeros 256-bit vector (zero matched as v8i32).
279  defm : subvec_zero_lowering<"APDZ128", VR128X, v4f64, v2f64, v8i32, sub_xmm>;
280  defm : subvec_zero_lowering<"APSZ128", VR128X, v8f32, v4f32, v8i32, sub_xmm>;
281  defm : subvec_zero_lowering<"DQA64Z128", VR128X, v4i64, v2i64, v8i32, sub_xmm>;
282  defm : subvec_zero_lowering<"DQA64Z128", VR128X, v8i32, v4i32, v8i32, sub_xmm>;
283  defm : subvec_zero_lowering<"DQA64Z128", VR128X, v16i16, v8i16, v8i32, sub_xmm>;
284  defm : subvec_zero_lowering<"DQA64Z128", VR128X, v32i8, v16i8, v8i32, sub_xmm>;
285
  // 128-bit subvector into an all-zeros 512-bit vector (zero matched as v16i32).
286  defm : subvec_zero_lowering<"APDZ128", VR128X, v8f64, v2f64, v16i32, sub_xmm>;
287  defm : subvec_zero_lowering<"APSZ128", VR128X, v16f32, v4f32, v16i32, sub_xmm>;
288  defm : subvec_zero_lowering<"DQA64Z128", VR128X, v8i64, v2i64, v16i32, sub_xmm>;
289  defm : subvec_zero_lowering<"DQA64Z128", VR128X, v16i32, v4i32, v16i32, sub_xmm>;
290  defm : subvec_zero_lowering<"DQA64Z128", VR128X, v32i16, v8i16, v16i32, sub_xmm>;
291  defm : subvec_zero_lowering<"DQA64Z128", VR128X, v64i8, v16i8, v16i32, sub_xmm>;
292
  // 256-bit subvector into an all-zeros 512-bit vector (zero matched as v16i32).
293  defm : subvec_zero_lowering<"APDZ256", VR256X, v8f64, v4f64, v16i32, sub_ymm>;
294  defm : subvec_zero_lowering<"APSZ256", VR256X, v16f32, v8f32, v16i32, sub_ymm>;
295  defm : subvec_zero_lowering<"DQA64Z256", VR256X, v8i64, v4i64, v16i32, sub_ymm>;
296  defm : subvec_zero_lowering<"DQA64Z256", VR256X, v16i32, v8i32, v16i32, sub_ymm>;
297  defm : subvec_zero_lowering<"DQA64Z256", VR256X, v32i16, v16i16, v16i32, sub_ymm>;
298  defm : subvec_zero_lowering<"DQA64Z256", VR256X, v64i8, v32i8, v16i32, sub_ymm>;
299}
300
// Zero-insert lowering into 512-bit vectors when AVX512 is available and VLX
// is not; the moves used are the non-Z VMOV*rr / VMOV*Yrr forms on VR128/VR256.
301let Predicates = [HasAVX512, NoVLX] in {
  // 128-bit subvector into an all-zeros 512-bit vector.
302  defm : subvec_zero_lowering<"APD", VR128, v8f64, v2f64, v16i32, sub_xmm>;
303  defm : subvec_zero_lowering<"APS", VR128, v16f32, v4f32, v16i32, sub_xmm>;
304  defm : subvec_zero_lowering<"DQA", VR128, v8i64, v2i64, v16i32, sub_xmm>;
305  defm : subvec_zero_lowering<"DQA", VR128, v16i32, v4i32, v16i32, sub_xmm>;
306  defm : subvec_zero_lowering<"DQA", VR128, v32i16, v8i16, v16i32, sub_xmm>;
307  defm : subvec_zero_lowering<"DQA", VR128, v64i8, v16i8, v16i32, sub_xmm>;
308
  // 256-bit subvector into an all-zeros 512-bit vector.
309  defm : subvec_zero_lowering<"APDY", VR256, v8f64, v4f64, v16i32, sub_ymm>;
310  defm : subvec_zero_lowering<"APSY", VR256, v16f32, v8f32, v16i32, sub_ymm>;
311  defm : subvec_zero_lowering<"DQAY", VR256, v8i64, v4i64, v16i32, sub_ymm>;
312  defm : subvec_zero_lowering<"DQAY", VR256, v16i32, v8i32, v16i32, sub_ymm>;
313  defm : subvec_zero_lowering<"DQAY", VR256, v32i16, v16i16, v16i32, sub_ymm>;
314  defm : subvec_zero_lowering<"DQAY", VR256, v64i8, v32i8, v16i32, sub_ymm>;
315}
316
// PatLeaf that matches a mask operand of value type `vt` in register class
// `RC` only when isMaskZeroExtended(N) returns true for the matched node.
// isMaskZeroExtended is not defined in this part of the file.
317class maskzeroupper<ValueType vt, RegisterClass RC> :
318  PatLeaf<(vt RC:$src), [{
319    return isMaskZeroExtended(N);
320  }]>;
321
// One maskzeroupper leaf per mask width from 1 to 32 bits, each paired with
// the VK register class of the same width.
322def maskzeroupperv1i1  : maskzeroupper<v1i1,  VK1>;
323def maskzeroupperv2i1  : maskzeroupper<v2i1,  VK2>;
324def maskzeroupperv4i1  : maskzeroupper<v4i1,  VK4>;
325def maskzeroupperv8i1  : maskzeroupper<v8i1,  VK8>;
326def maskzeroupperv16i1 : maskzeroupper<v16i1, VK16>;
327def maskzeroupperv32i1 : maskzeroupper<v32i1, VK32>;
328
329// The patterns determine if we can depend on the upper bits of a mask register
330// being zeroed by the previous operation so that we can skip explicit
331// zeroing. When the maskzeroupper leaf matches, the insert into an all-zeros
// mask is selected as a plain COPY_TO_REGCLASS to the wider mask class.
332let Predicates = [HasBWI] in {
  // v1i1/v8i1/v16i1 into an all-zeros v32i1.
333  def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
334                                     maskzeroupperv1i1:$src, (iPTR 0))),
335            (COPY_TO_REGCLASS VK1:$src, VK32)>;
336  def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
337                                     maskzeroupperv8i1:$src, (iPTR 0))),
338            (COPY_TO_REGCLASS VK8:$src, VK32)>;
339  def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
340                                     maskzeroupperv16i1:$src, (iPTR 0))),
341            (COPY_TO_REGCLASS VK16:$src, VK32)>;
342
  // v1i1/v8i1/v16i1/v32i1 into an all-zeros v64i1.
343  def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
344                                     maskzeroupperv1i1:$src, (iPTR 0))),
345            (COPY_TO_REGCLASS VK1:$src, VK64)>;
346  def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
347                                     maskzeroupperv8i1:$src, (iPTR 0))),
348            (COPY_TO_REGCLASS VK8:$src, VK64)>;
349  def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
350                                     maskzeroupperv16i1:$src, (iPTR 0))),
351            (COPY_TO_REGCLASS VK16:$src, VK64)>;
352  def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
353                                     maskzeroupperv32i1:$src, (iPTR 0))),
354            (COPY_TO_REGCLASS VK32:$src, VK64)>;
355}
356
// v1i1/v8i1 masks matched by a maskzeroupper leaf, inserted into an all-zeros
// v16i1: selected as a plain COPY_TO_REGCLASS to VK16.
357let Predicates = [HasAVX512] in {
358  def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
359                                     maskzeroupperv1i1:$src, (iPTR 0))),
360            (COPY_TO_REGCLASS VK1:$src, VK16)>;
361  def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
362                                     maskzeroupperv8i1:$src, (iPTR 0))),
363            (COPY_TO_REGCLASS VK8:$src, VK16)>;
364}
365
// v1i1 mask matched by a maskzeroupper leaf, inserted into an all-zeros v8i1:
// selected as a plain COPY_TO_REGCLASS to VK8.
366let Predicates = [HasDQI] in {
367  def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV),
368                                    maskzeroupperv1i1:$src, (iPTR 0))),
369            (COPY_TO_REGCLASS VK1:$src, VK8)>;
370}
371
// v2i1/v4i1 masks matched by a maskzeroupper leaf, inserted into an all-zeros
// v8i1: selected as a plain COPY_TO_REGCLASS to VK8.
372let Predicates = [HasVLX, HasDQI] in {
373  def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV),
374                                    maskzeroupperv2i1:$src, (iPTR 0))),
375            (COPY_TO_REGCLASS VK2:$src, VK8)>;
376  def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV),
377                                    maskzeroupperv4i1:$src, (iPTR 0))),
378            (COPY_TO_REGCLASS VK4:$src, VK8)>;
379}
380
// v2i1/v4i1 masks matched by a maskzeroupper leaf, inserted into an all-zeros
// v16i1: selected as a plain COPY_TO_REGCLASS to VK16.
381let Predicates = [HasVLX] in {
382  def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
383                                     maskzeroupperv2i1:$src, (iPTR 0))),
384            (COPY_TO_REGCLASS VK2:$src, VK16)>;
385  def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
386                                     maskzeroupperv4i1:$src, (iPTR 0))),
387            (COPY_TO_REGCLASS VK4:$src, VK16)>;
388}
389
// v2i1/v4i1 masks matched by a maskzeroupper leaf, inserted into an all-zeros
// v32i1 or v64i1: selected as a plain COPY_TO_REGCLASS to VK32/VK64.
390let Predicates = [HasBWI, HasVLX] in {
391  def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
392                                     maskzeroupperv2i1:$src, (iPTR 0))),
393            (COPY_TO_REGCLASS VK2:$src, VK32)>;
394  def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
395                                     maskzeroupperv4i1:$src, (iPTR 0))),
396            (COPY_TO_REGCLASS VK4:$src, VK32)>;
397  def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
398                                     maskzeroupperv2i1:$src, (iPTR 0))),
399            (COPY_TO_REGCLASS VK2:$src, VK64)>;
400  def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
401                                     maskzeroupperv4i1:$src, (iPTR 0))),
402            (COPY_TO_REGCLASS VK4:$src, VK64)>;
403}
404
405// If the upper bits are not known to be zero we have to fall back to
406// explicitly zeroing them by using shifts: the mask is copied to VK16, shifted
// left and then right by (16 - source width) with KSHIFTLWri/KSHIFTRWri.
407let Predicates = [HasAVX512] in {
  // v1i1 -> v16i1: shift by 15.
408  def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
409                                     (v1i1 VK1:$mask), (iPTR 0))),
410            (KSHIFTRWri (KSHIFTLWri (COPY_TO_REGCLASS VK1:$mask, VK16),
411                                    (i8 15)), (i8 15))>;
412
  // v2i1 -> v16i1: shift by 14.
413  def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
414                                     (v2i1 VK2:$mask), (iPTR 0))),
415            (KSHIFTRWri (KSHIFTLWri (COPY_TO_REGCLASS VK2:$mask, VK16),
416                                    (i8 14)), (i8 14))>;
417
  // v4i1 -> v16i1: shift by 12.
418  def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
419                                     (v4i1 VK4:$mask), (iPTR 0))),
420            (KSHIFTRWri (KSHIFTLWri (COPY_TO_REGCLASS VK4:$mask, VK16),
421                                    (i8 12)), (i8 12))>;
422}
423
// Without DQI, a v8i1 inserted into an all-zeros v16i1 is zero-extended with a
// 16-bit shift pair: copy to VK16, shift left by 8, then right by 8.
424let Predicates = [HasAVX512, NoDQI] in {
425  def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
426                                     (v8i1 VK8:$mask), (iPTR 0))),
427            (KSHIFTRWri (KSHIFTLWri (COPY_TO_REGCLASS VK8:$mask, VK16),
428                                    (i8 8)), (i8 8))>;
429}
430
// Mask inserts into an all-zeros vector that use DQI byte-sized mask
// instructions.
431let Predicates = [HasDQI] in {
  // v8i1 -> v16i1: a KMOVBkk followed by a copy to VK16.
  // NOTE(review): presumably relies on KMOVB zeroing the bits above bit 7 --
  // confirm against the ISA reference.
432  def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
433                                     (v8i1 VK8:$mask), (iPTR 0))),
434            (COPY_TO_REGCLASS (KMOVBkk VK8:$mask), VK16)>;
435
  // v1i1/v2i1/v4i1 -> v8i1: copy to VK8, then shift left and right by
  // (8 - source width) with KSHIFTLBri/KSHIFTRBri.
436  def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV),
437                                    (v1i1 VK1:$mask), (iPTR 0))),
438            (KSHIFTRBri (KSHIFTLBri (COPY_TO_REGCLASS VK1:$mask, VK8),
439                                    (i8 7)), (i8 7))>;
440  def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV),
441                                    (v2i1 VK2:$mask), (iPTR 0))),
442            (KSHIFTRBri (KSHIFTLBri (COPY_TO_REGCLASS VK2:$mask, VK8),
443                                    (i8 6)), (i8 6))>;
444  def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV),
445                                    (v4i1 VK4:$mask), (iPTR 0))),
446            (KSHIFTRBri (KSHIFTLBri (COPY_TO_REGCLASS VK4:$mask, VK8),
447                                    (i8 4)), (i8 4))>;
448}
449
// v16i1/v32i1 inserted into an all-zeros v32i1/v64i1: a KMOV of the source
// width (KMOVWkk / KMOVDkk) followed by a copy to the wider mask class.
// NOTE(review): presumably relies on KMOVW/KMOVD zeroing the bits above the
// source width -- confirm against the ISA reference.
450let Predicates = [HasBWI] in {
451  def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
452                                     (v16i1 VK16:$mask), (iPTR 0))),
453            (COPY_TO_REGCLASS (KMOVWkk VK16:$mask), VK32)>;
454
455  def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
456                                     (v16i1 VK16:$mask), (iPTR 0))),
457            (COPY_TO_REGCLASS (KMOVWkk VK16:$mask), VK64)>;
458  def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
459                                     (v32i1 VK32:$mask), (iPTR 0))),
460            (COPY_TO_REGCLASS (KMOVDkk VK32:$mask), VK64)>;
461}
462
// With BWI but without DQI, a v8i1 inserted into an all-zeros v32i1/v64i1 is
// zero-extended with a shift pair at the destination width: shift left and
// then right by (destination width - 8).
463let Predicates = [HasBWI, NoDQI] in {
  // v8i1 -> v32i1: shift by 24 with KSHIFTLDri/KSHIFTRDri.
464  def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
465                                     (v8i1 VK8:$mask), (iPTR 0))),
466            (KSHIFTRDri (KSHIFTLDri (COPY_TO_REGCLASS VK8:$mask, VK32),
467                                    (i8 24)), (i8 24))>;
468
  // v8i1 -> v64i1: shift by 56 with KSHIFTLQri/KSHIFTRQri.
469  def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
470                                     (v8i1 VK8:$mask), (iPTR 0))),
471            (KSHIFTRQri (KSHIFTLQri (COPY_TO_REGCLASS VK8:$mask, VK64),
472                                    (i8 56)), (i8 56))>;
473}
474
// With both BWI and DQI, a v8i1 inserted into an all-zeros v32i1/v64i1 uses
// KMOVBkk followed by a copy to VK32/VK64 instead of a shift pair.
// NOTE(review): presumably relies on KMOVB zeroing the bits above bit 7 --
// confirm against the ISA reference.
475let Predicates = [HasBWI, HasDQI] in {
476  def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
477                                     (v8i1 VK8:$mask), (iPTR 0))),
478            (COPY_TO_REGCLASS (KMOVBkk VK8:$mask), VK32)>;
479
480  def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
481                                     (v8i1 VK8:$mask), (iPTR 0))),
482            (COPY_TO_REGCLASS (KMOVBkk VK8:$mask), VK64)>;
483}
484
// v1i1/v2i1/v4i1 inserted into an all-zeros v32i1/v64i1 when the upper bits
// are not known to be zero: copy to the destination mask class, then shift
// left and right by (destination width - source width).
// NOTE(review): the v1i1 cases here are gated on HasVLX, while the
// zero-extended v1i1 -> v32i1/v64i1 patterns in this file require only
// HasBWI -- confirm the stricter predicate is intended.
485let Predicates = [HasBWI, HasVLX] in {
  // Into v32i1, using KSHIFTLDri/KSHIFTRDri.
486  def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
487                                     (v1i1 VK1:$mask), (iPTR 0))),
488            (KSHIFTRDri (KSHIFTLDri (COPY_TO_REGCLASS VK1:$mask, VK32),
489                                    (i8 31)), (i8 31))>;
490  def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
491                                     (v2i1 VK2:$mask), (iPTR 0))),
492            (KSHIFTRDri (KSHIFTLDri (COPY_TO_REGCLASS VK2:$mask, VK32),
493                                    (i8 30)), (i8 30))>;
494  def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
495                                     (v4i1 VK4:$mask), (iPTR 0))),
496            (KSHIFTRDri (KSHIFTLDri (COPY_TO_REGCLASS VK4:$mask, VK32),
497                                    (i8 28)), (i8 28))>;
498
  // Into v64i1, using KSHIFTLQri/KSHIFTRQri.
499  def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
500                                     (v1i1 VK1:$mask), (iPTR 0))),
501            (KSHIFTRQri (KSHIFTLQri (COPY_TO_REGCLASS VK1:$mask, VK64),
502                                    (i8 63)), (i8 63))>;
503  def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
504                                     (v2i1 VK2:$mask), (iPTR 0))),
505            (KSHIFTRQri (KSHIFTLQri (COPY_TO_REGCLASS VK2:$mask, VK64),
506                                    (i8 62)), (i8 62))>;
507  def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
508                                     (v4i1 VK4:$mask), (iPTR 0))),
509            (KSHIFTRQri (KSHIFTLQri (COPY_TO_REGCLASS VK4:$mask, VK64),
510                                    (i8 60)), (i8 60))>;
511}
512