// Copyright 2019, VIXL authors
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
//   * Redistributions of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//   * Redistributions in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//   * Neither the name of ARM Limited nor the names of its contributors may be
//     used to endorse or promote products derived from this software without
//     specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "macro-assembler-aarch64.h"

namespace vixl {
namespace aarch64 {

void MacroAssembler::AddSubHelper(AddSubHelperOption option,
                                  const ZRegister& zd,
                                  const ZRegister& zn,
                                  IntegerOperand imm) {
  VIXL_ASSERT(imm.FitsInLane(zd));

  // Simple, encodable cases.
  if (TrySingleAddSub(option, zd, zn, imm)) return;

  VIXL_ASSERT((option == kAddImmediate) || (option == kSubImmediate));
  bool add_imm = (option == kAddImmediate);

  // Try to translate Add(..., -imm) to Sub(..., imm) if we can encode it in one
  // instruction. Also interpret the immediate as signed, so we can convert
  // Add(zd.VnH(), zn.VnH(), 0xffff...) to Sub(..., 1), etc.
  IntegerOperand signed_imm(imm.AsIntN(zd.GetLaneSizeInBits()));
  if (signed_imm.IsNegative()) {
    AddSubHelperOption n_option = add_imm ? kSubImmediate : kAddImmediate;
    IntegerOperand n_imm(signed_imm.GetMagnitude());
    // IntegerOperand can represent -INT64_MIN, so this is always safe.
    VIXL_ASSERT(n_imm.IsPositiveOrZero());
    if (TrySingleAddSub(n_option, zd, zn, n_imm)) return;
  }

  // Otherwise, fall back to dup + ADD_z_z/SUB_z_z.
  UseScratchRegisterScope temps(this);
  ZRegister scratch = temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits());
  Dup(scratch, imm);

  SingleEmissionCheckScope guard(this);
  if (add_imm) {
    add(zd, zn, scratch);
  } else {
    sub(zd, zn, scratch);
  }
}
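
// Illustrative sketch: the registers and immediates below are arbitrary
// examples; the expansions are roughly what the helper above would emit.
//
//   Add(z0.VnH(), z0.VnH(), 0xffff);      // -> sub z0.h, z0.h, #1
//   Add(z0.VnS(), z1.VnS(), 0x12345678);  // -> Dup into a scratch Z register,
//                                         //    then an unpredicated add.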

bool MacroAssembler::TrySingleAddSub(AddSubHelperOption option,
                                     const ZRegister& zd,
                                     const ZRegister& zn,
                                     IntegerOperand imm) {
  VIXL_ASSERT(imm.FitsInLane(zd));

  int imm8;
  int shift = -1;
  if (imm.TryEncodeAsShiftedUintNForLane<8, 0>(zd, &imm8, &shift) ||
      imm.TryEncodeAsShiftedUintNForLane<8, 8>(zd, &imm8, &shift)) {
    MovprfxHelperScope guard(this, zd, zn);
    switch (option) {
      case kAddImmediate:
        add(zd, zd, imm8, shift);
        return true;
      case kSubImmediate:
        sub(zd, zd, imm8, shift);
        return true;
    }
  }
  return false;
}

void MacroAssembler::IntWideImmHelper(IntWideImmFn imm_fn,
                                      SVEArithPredicatedFn reg_macro,
                                      const ZRegister& zd,
                                      const ZRegister& zn,
                                      IntegerOperand imm,
                                      bool is_signed) {
  if (is_signed) {
    // E.g. MUL_z_zi, SMIN_z_zi, SMAX_z_zi
    if (imm.IsInt8()) {
      MovprfxHelperScope guard(this, zd, zn);
      (this->*imm_fn)(zd, zd, imm.AsInt8());
      return;
    }
  } else {
    // E.g. UMIN_z_zi, UMAX_z_zi
    if (imm.IsUint8()) {
      MovprfxHelperScope guard(this, zd, zn);
      (this->*imm_fn)(zd, zd, imm.AsUint8());
      return;
    }
  }

  UseScratchRegisterScope temps(this);
  PRegister pg = temps.AcquireGoverningP();
  Ptrue(pg.WithSameLaneSizeAs(zd));

  // Try to re-use zd if we can, so we can avoid a movprfx.
  ZRegister scratch =
      zd.Aliases(zn) ? temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits())
                     : zd;
  Dup(scratch, imm);

  // The vector-form macro for commutative operations will swap the arguments to
  // avoid movprfx, if necessary.
  (this->*reg_macro)(zd, pg.Merging(), zn, scratch);
}
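
// Illustrative sketch (arbitrary example registers and immediates): a signed
// immediate in the int8 range is emitted directly, while anything wider takes
// the ptrue + Dup + predicated-register fallback above.
//
//   Mul(z0.VnS(), z1.VnS(), -3);    // -> movprfx z0, z1; mul z0.s, z0.s, #-3
//   Mul(z0.VnS(), z1.VnS(), 1000);  // -> Ptrue a governing predicate, Dup the
//                                   //    immediate, then the predicated Mul.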

void MacroAssembler::Mul(const ZRegister& zd,
                         const ZRegister& zn,
                         IntegerOperand imm) {
  VIXL_ASSERT(allow_macro_instructions_);
  IntWideImmFn imm_fn = &Assembler::mul;
  SVEArithPredicatedFn reg_fn = &MacroAssembler::Mul;
  IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, true);
}

void MacroAssembler::Smin(const ZRegister& zd,
                          const ZRegister& zn,
                          IntegerOperand imm) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(imm.FitsInSignedLane(zd));
  IntWideImmFn imm_fn = &Assembler::smin;
  SVEArithPredicatedFn reg_fn = &MacroAssembler::Smin;
  IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, true);
}

void MacroAssembler::Smax(const ZRegister& zd,
                          const ZRegister& zn,
                          IntegerOperand imm) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(imm.FitsInSignedLane(zd));
  IntWideImmFn imm_fn = &Assembler::smax;
  SVEArithPredicatedFn reg_fn = &MacroAssembler::Smax;
  IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, true);
}

void MacroAssembler::Umax(const ZRegister& zd,
                          const ZRegister& zn,
                          IntegerOperand imm) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(imm.FitsInUnsignedLane(zd));
  IntWideImmFn imm_fn = &Assembler::umax;
  SVEArithPredicatedFn reg_fn = &MacroAssembler::Umax;
  IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, false);
}

void MacroAssembler::Umin(const ZRegister& zd,
                          const ZRegister& zn,
                          IntegerOperand imm) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(imm.FitsInUnsignedLane(zd));
  IntWideImmFn imm_fn = &Assembler::umin;
  SVEArithPredicatedFn reg_fn = &MacroAssembler::Umin;
  IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, false);
}

void MacroAssembler::Addpl(const Register& xd,
                           const Register& xn,
                           int64_t multiplier) {
  VIXL_ASSERT(allow_macro_instructions_);

  // This macro relies on `Rdvl` to handle some out-of-range cases. Check that
  // `VL * multiplier` cannot overflow, for any possible value of VL.
  VIXL_ASSERT(multiplier <= (INT64_MAX / kZRegMaxSizeInBytes));
  VIXL_ASSERT(multiplier >= (INT64_MIN / kZRegMaxSizeInBytes));

  if (xd.IsZero()) return;
  if (xn.IsZero() && xd.IsSP()) {
    // TODO: This operation doesn't make much sense, but we could support it
    // with a scratch register if necessary.
    VIXL_UNIMPLEMENTED();
  }

  // Handling xzr requires an extra move, so defer it until later so we can try
  // to use `rdvl` instead (via `Addvl`).
  if (IsInt6(multiplier) && !xn.IsZero()) {
    SingleEmissionCheckScope guard(this);
    addpl(xd, xn, static_cast<int>(multiplier));
    return;
  }

  // If `multiplier` is a multiple of 8, we can use `Addvl` instead.
  if ((multiplier % kZRegBitsPerPRegBit) == 0) {
    Addvl(xd, xn, multiplier / kZRegBitsPerPRegBit);
    return;
  }

  if (IsInt6(multiplier)) {
    VIXL_ASSERT(xn.IsZero());  // Other cases were handled with `addpl`.
    // There is no simple `rdpl` instruction, and `addpl` cannot accept xzr, so
    // materialise a zero.
    MacroEmissionCheckScope guard(this);
    movz(xd, 0);
    addpl(xd, xd, static_cast<int>(multiplier));
    return;
  }

  // TODO: Some probable cases result in rather long sequences. For example,
  // `Addpl(sp, sp, 33)` requires five instructions, even though it's only just
  // outside the encodable range. We should look for ways to cover such cases
  // without drastically increasing the complexity of this logic.

  // For other cases, calculate xn + (PL * multiplier) using discrete
  // instructions. This requires two scratch registers in the general case, so
  // try to re-use the destination as a scratch register.
  UseScratchRegisterScope temps(this);
  temps.Include(xd);
  temps.Exclude(xn);

  Register scratch = temps.AcquireX();
  // There is no `rdpl`, so we have to calculate PL from VL. We can't scale the
  // multiplier because (we already know) it isn't a multiple of 8.
  Rdvl(scratch, multiplier);

  MacroEmissionCheckScope guard(this);
  if (xn.IsZero()) {
    asr(xd, scratch, kZRegBitsPerPRegBitLog2);
  } else if (xd.IsSP() || xn.IsSP()) {
    // TODO: MacroAssembler::Add should be able to handle this.
    asr(scratch, scratch, kZRegBitsPerPRegBitLog2);
    add(xd, xn, scratch);
  } else {
    add(xd, xn, Operand(scratch, ASR, kZRegBitsPerPRegBitLog2));
  }
}
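
// Illustrative sketch (arbitrary example operands):
//
//   Addpl(x0, x1, 7);    // -> addpl x0, x1, #7
//   Addpl(x0, x1, 40);   // -> Addvl(x0, x1, 5), i.e. addvl x0, x1, #5
//   Addpl(x0, xzr, 7);   // -> movz x0, #0; addpl x0, x0, #7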

void MacroAssembler::Addvl(const Register& xd,
                           const Register& xn,
                           int64_t multiplier) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(xd.IsX());
  VIXL_ASSERT(xn.IsX());

  // Check that `VL * multiplier` cannot overflow, for any possible value of VL.
  VIXL_ASSERT(multiplier <= (INT64_MAX / kZRegMaxSizeInBytes));
  VIXL_ASSERT(multiplier >= (INT64_MIN / kZRegMaxSizeInBytes));

  if (xd.IsZero()) return;
  if (xn.IsZero() && xd.IsSP()) {
    // TODO: This operation doesn't make much sense, but we could support it
    // with a scratch register if necessary. `rdvl` cannot write into `sp`.
    VIXL_UNIMPLEMENTED();
  }

  if (IsInt6(multiplier)) {
    SingleEmissionCheckScope guard(this);
    if (xn.IsZero()) {
      rdvl(xd, static_cast<int>(multiplier));
    } else {
      addvl(xd, xn, static_cast<int>(multiplier));
    }
    return;
  }

  // TODO: Some probable cases result in rather long sequences. For example,
  // `Addvl(sp, sp, 42)` requires four instructions, even though it's only just
  // outside the encodable range. We should look for ways to cover such cases
  // without drastically increasing the complexity of this logic.

  // For other cases, calculate xn + (VL * multiplier) using discrete
  // instructions. This requires two scratch registers in the general case, so
  // we try to re-use the destination as a scratch register.
  UseScratchRegisterScope temps(this);
  temps.Include(xd);
  temps.Exclude(xn);

  Register a = temps.AcquireX();
  Mov(a, multiplier);

  MacroEmissionCheckScope guard(this);
  Register b = temps.AcquireX();
  rdvl(b, 1);
  if (xn.IsZero()) {
    mul(xd, a, b);
  } else if (xd.IsSP() || xn.IsSP()) {
    mul(a, a, b);
    add(xd, xn, a);
  } else {
    madd(xd, a, b, xn);
  }
}
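
// Illustrative sketch (arbitrary example operands):
//
//   Addvl(x0, x1, 31);   // -> addvl x0, x1, #31
//   Addvl(x0, xzr, 4);   // -> rdvl x0, #4
//   Addvl(x0, x1, 100);  // -> mov/rdvl/madd via scratch registers (xd may be
//                        //    re-used as one of them).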

void MacroAssembler::CalculateSVEAddress(const Register& xd,
                                         const SVEMemOperand& addr,
                                         int vl_divisor_log2) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(!addr.IsScatterGather());
  VIXL_ASSERT(xd.IsX());

  // The lower bound is where a whole Z register is accessed.
  VIXL_ASSERT(!addr.IsMulVl() || (vl_divisor_log2 >= 0));
  // The upper bound is for P register accesses, and for instructions like
  // "st1b { z0.d } [...]", where one byte is accessed for every D-sized lane.
  VIXL_ASSERT(vl_divisor_log2 <= static_cast<int>(kZRegBitsPerPRegBitLog2));

  SVEOffsetModifier mod = addr.GetOffsetModifier();
  Register base = addr.GetScalarBase();

  if (addr.IsEquivalentToScalar()) {
    // For example:
    //   [x0]
    //   [x0, #0]
    //   [x0, xzr, LSL 2]
    Mov(xd, base);
  } else if (addr.IsScalarPlusImmediate()) {
    // For example:
    //   [x0, #42]
    //   [x0, #42, MUL VL]
    int64_t offset = addr.GetImmediateOffset();
    VIXL_ASSERT(offset != 0);  // Handled by IsEquivalentToScalar.
    if (addr.IsMulVl()) {
      int vl_divisor = 1 << vl_divisor_log2;
      // For all possible values of vl_divisor, we can simply use `Addpl`. This
      // will select `addvl` if necessary.
      VIXL_ASSERT((kZRegBitsPerPRegBit % vl_divisor) == 0);
      Addpl(xd, base, offset * (kZRegBitsPerPRegBit / vl_divisor));
    } else {
      // IsScalarPlusImmediate() ensures that no other modifiers can occur.
      VIXL_ASSERT(mod == NO_SVE_OFFSET_MODIFIER);
      Add(xd, base, offset);
    }
  } else if (addr.IsScalarPlusScalar()) {
    // For example:
    //   [x0, x1]
    //   [x0, x1, LSL #4]
    Register offset = addr.GetScalarOffset();
    VIXL_ASSERT(!offset.IsZero());  // Handled by IsEquivalentToScalar.
    if (mod == SVE_LSL) {
      Add(xd, base, Operand(offset, LSL, addr.GetShiftAmount()));
    } else {
      // IsScalarPlusScalar() ensures that no other modifiers can occur.
      VIXL_ASSERT(mod == NO_SVE_OFFSET_MODIFIER);
      Add(xd, base, offset);
    }
  } else {
    // All other forms are scatter-gather addresses, which cannot be evaluated
    // into an X register.
    VIXL_UNREACHABLE();
  }
}
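
// Illustrative sketch (arbitrary example operands): for a whole-vector access
// (vl_divisor_log2 == 0), "[x1, #5, MUL VL]" is evaluated as
// Addpl(xd, x1, 5 * 8), which in turn emits `addvl xd, x1, #5`, while a plain
// "[x1, #42]" becomes Add(xd, x1, 42).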

void MacroAssembler::Cpy(const ZRegister& zd,
                         const PRegister& pg,
                         IntegerOperand imm) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(imm.FitsInLane(zd));
  int imm8;
  int shift;
  if (imm.TryEncodeAsShiftedIntNForLane<8, 0>(zd, &imm8, &shift) ||
      imm.TryEncodeAsShiftedIntNForLane<8, 8>(zd, &imm8, &shift)) {
    SingleEmissionCheckScope guard(this);
    cpy(zd, pg, imm8, shift);
    return;
  }

  // The fallbacks rely on `cpy` variants that only support merging predication.
  // If zeroing predication was requested, zero the destination first.
  if (pg.IsZeroing()) {
    SingleEmissionCheckScope guard(this);
    dup(zd, 0);
  }
  PRegisterM pg_m = pg.Merging();

  // Try to encode the immediate using fcpy.
  VIXL_ASSERT(imm.FitsInLane(zd));
  if (zd.GetLaneSizeInBits() >= kHRegSize) {
    double fp_imm = 0.0;
    switch (zd.GetLaneSizeInBits()) {
      case kHRegSize:
        fp_imm =
            FPToDouble(RawbitsToFloat16(imm.AsUint16()), kIgnoreDefaultNaN);
        break;
      case kSRegSize:
        fp_imm = RawbitsToFloat(imm.AsUint32());
        break;
      case kDRegSize:
        fp_imm = RawbitsToDouble(imm.AsUint64());
        break;
      default:
        VIXL_UNREACHABLE();
        break;
    }
    // IsImmFP64 is equivalent to IsImmFP<n> for the same arithmetic value, so
    // we can use IsImmFP64 for all lane sizes.
    if (IsImmFP64(fp_imm)) {
      SingleEmissionCheckScope guard(this);
      fcpy(zd, pg_m, fp_imm);
      return;
    }
  }

  // Fall back to using a scratch register.
  UseScratchRegisterScope temps(this);
  Register scratch = temps.AcquireRegisterToHoldLane(zd);
  Mov(scratch, imm);

  SingleEmissionCheckScope guard(this);
  cpy(zd, pg_m, scratch);
}
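
// Illustrative sketch (arbitrary example operands):
//
//   Cpy(z0.VnS(), p0.Merging(), 42);          // -> cpy z0.s, p0/m, #42
//   Cpy(z0.VnS(), p0.Merging(), 0x3f800000);  // Not a shifted int8, but the
//                                             // bits encode 1.0f, so this
//                                             // emits fcpy z0.s, p0/m, #1.0.
// Anything else goes through a general-purpose scratch register and the
// register form of `cpy`.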

// TODO: We implement Fcpy (amongst other things) for all FP types because it
// allows us to preserve user-specified NaNs. We should come up with some
// FPImmediate type to abstract this, and avoid all the duplication below (and
// elsewhere).

void MacroAssembler::Fcpy(const ZRegister& zd,
                          const PRegisterM& pg,
                          double imm) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(pg.IsMerging());

  if (IsImmFP64(imm)) {
    SingleEmissionCheckScope guard(this);
    fcpy(zd, pg, imm);
    return;
  }

  // As a fall-back, cast the immediate to the required lane size, and try to
  // encode the bit pattern using `Cpy`.
  Cpy(zd, pg, FPToRawbitsWithSize(zd.GetLaneSizeInBits(), imm));
}

void MacroAssembler::Fcpy(const ZRegister& zd,
                          const PRegisterM& pg,
                          float imm) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(pg.IsMerging());

  if (IsImmFP32(imm)) {
    SingleEmissionCheckScope guard(this);
    fcpy(zd, pg, imm);
    return;
  }

  // As a fall-back, cast the immediate to the required lane size, and try to
  // encode the bit pattern using `Cpy`.
  Cpy(zd, pg, FPToRawbitsWithSize(zd.GetLaneSizeInBits(), imm));
}

void MacroAssembler::Fcpy(const ZRegister& zd,
                          const PRegisterM& pg,
                          Float16 imm) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(pg.IsMerging());

  if (IsImmFP16(imm)) {
    SingleEmissionCheckScope guard(this);
    fcpy(zd, pg, imm);
    return;
  }

  // As a fall-back, cast the immediate to the required lane size, and try to
  // encode the bit pattern using `Cpy`.
  Cpy(zd, pg, FPToRawbitsWithSize(zd.GetLaneSizeInBits(), imm));
}

void MacroAssembler::Dup(const ZRegister& zd, IntegerOperand imm) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(imm.FitsInLane(zd));
  unsigned lane_size = zd.GetLaneSizeInBits();
  int imm8;
  int shift;
  if (imm.TryEncodeAsShiftedIntNForLane<8, 0>(zd, &imm8, &shift) ||
      imm.TryEncodeAsShiftedIntNForLane<8, 8>(zd, &imm8, &shift)) {
    SingleEmissionCheckScope guard(this);
    dup(zd, imm8, shift);
  } else if (IsImmLogical(imm.AsUintN(lane_size), lane_size)) {
    SingleEmissionCheckScope guard(this);
    dupm(zd, imm.AsUintN(lane_size));
  } else {
    UseScratchRegisterScope temps(this);
    Register scratch = temps.AcquireRegisterToHoldLane(zd);
    Mov(scratch, imm);

    SingleEmissionCheckScope guard(this);
    dup(zd, scratch);
  }
}
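
// Illustrative sketch (arbitrary example operands):
//
//   Dup(z0.VnH(), 0x7f00);      // -> dup z0.h, #0x7f, lsl #8
//   Dup(z0.VnS(), 0xff00ff00);  // -> dupm (a valid logical immediate)
// Other immediates are moved into a general-purpose scratch register and
// broadcast with the register form of `dup`.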

void MacroAssembler::NoncommutativeArithmeticHelper(
    const ZRegister& zd,
    const PRegisterM& pg,
    const ZRegister& zn,
    const ZRegister& zm,
    SVEArithPredicatedFn fn,
    SVEArithPredicatedFn rev_fn) {
  if (zd.Aliases(zn)) {
    // E.g. zd = zd / zm
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zd, pg, zn, zm);
  } else if (zd.Aliases(zm)) {
    // E.g. zd = zn / zd
    SingleEmissionCheckScope guard(this);
    (this->*rev_fn)(zd, pg, zm, zn);
  } else {
    // E.g. zd = zn / zm
    MovprfxHelperScope guard(this, zd, pg, zn);
    (this->*fn)(zd, pg, zd, zm);
  }
}
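
// Illustrative sketch (arbitrary example operands, lane-size suffixes
// omitted), taking Fdiv as a user of this helper:
//
//   Fdiv(z0, p0.Merging(), z0, z1);  // -> fdiv (destination already in place)
//   Fdiv(z0, p0.Merging(), z1, z0);  // -> fdivr (reversed form, same result)
//   Fdiv(z0, p0.Merging(), z1, z2);  // -> movprfx z0, p0/m, z1; then fdiv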

void MacroAssembler::FPCommutativeArithmeticHelper(
    const ZRegister& zd,
    const PRegisterM& pg,
    const ZRegister& zn,
    const ZRegister& zm,
    SVEArithPredicatedFn fn,
    FPMacroNaNPropagationOption nan_option) {
  ResolveFPNaNPropagationOption(&nan_option);

  if (zd.Aliases(zn)) {
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zd, pg, zd, zm);
  } else if (zd.Aliases(zm)) {
    switch (nan_option) {
      case FastNaNPropagation: {
        // Swap the arguments.
        SingleEmissionCheckScope guard(this);
        (this->*fn)(zd, pg, zd, zn);
        return;
      }
      case StrictNaNPropagation: {
        UseScratchRegisterScope temps(this);
        // Use a scratch register to keep the argument order exactly as
        // specified.
        ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zn);
        {
          MovprfxHelperScope guard(this, scratch, pg, zn);
          (this->*fn)(scratch, pg, scratch, zm);
        }
        Mov(zd, scratch);
        return;
      }
      case NoFPMacroNaNPropagationSelected:
        VIXL_UNREACHABLE();
        return;
    }
  } else {
    MovprfxHelperScope guard(this, zd, pg, zn);
    (this->*fn)(zd, pg, zd, zm);
  }
}
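
// Illustrative sketch (arbitrary example operands, lane-size suffixes
// omitted): when zd aliases zm, FastNaNPropagation simply swaps the operands,
// e.g. Fadd(z0, p0.Merging(), z1, z0) -> fadd z0, p0/m, z0, z1, whereas
// StrictNaNPropagation keeps the operand order by building the result in a
// scratch Z register (movprfx + fadd + mov).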

void MacroAssembler::Asr(const ZRegister& zd,
                         const PRegisterM& pg,
                         const ZRegister& zn,
                         const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  NoncommutativeArithmeticHelper(zd,
                                 pg,
                                 zn,
                                 zm,
                                 static_cast<SVEArithPredicatedFn>(
                                     &Assembler::asr),
                                 static_cast<SVEArithPredicatedFn>(
                                     &Assembler::asrr));
}

void MacroAssembler::Lsl(const ZRegister& zd,
                         const PRegisterM& pg,
                         const ZRegister& zn,
                         const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  NoncommutativeArithmeticHelper(zd,
                                 pg,
                                 zn,
                                 zm,
                                 static_cast<SVEArithPredicatedFn>(
                                     &Assembler::lsl),
                                 static_cast<SVEArithPredicatedFn>(
                                     &Assembler::lslr));
}

void MacroAssembler::Lsr(const ZRegister& zd,
                         const PRegisterM& pg,
                         const ZRegister& zn,
                         const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  NoncommutativeArithmeticHelper(zd,
                                 pg,
                                 zn,
                                 zm,
                                 static_cast<SVEArithPredicatedFn>(
                                     &Assembler::lsr),
                                 static_cast<SVEArithPredicatedFn>(
                                     &Assembler::lsrr));
}

void MacroAssembler::Fdiv(const ZRegister& zd,
                          const PRegisterM& pg,
                          const ZRegister& zn,
                          const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  NoncommutativeArithmeticHelper(zd,
                                 pg,
                                 zn,
                                 zm,
                                 static_cast<SVEArithPredicatedFn>(
                                     &Assembler::fdiv),
                                 static_cast<SVEArithPredicatedFn>(
                                     &Assembler::fdivr));
}

void MacroAssembler::Fsub(const ZRegister& zd,
                          const PRegisterM& pg,
                          const ZRegister& zn,
                          const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  NoncommutativeArithmeticHelper(zd,
                                 pg,
                                 zn,
                                 zm,
                                 static_cast<SVEArithPredicatedFn>(
                                     &Assembler::fsub),
                                 static_cast<SVEArithPredicatedFn>(
                                     &Assembler::fsubr));
}

void MacroAssembler::Fadd(const ZRegister& zd,
                          const PRegisterM& pg,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPCommutativeArithmeticHelper(zd,
                                pg,
                                zn,
                                zm,
                                static_cast<SVEArithPredicatedFn>(
                                    &Assembler::fadd),
                                nan_option);
}

void MacroAssembler::Fabd(const ZRegister& zd,
                          const PRegisterM& pg,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPCommutativeArithmeticHelper(zd,
                                pg,
                                zn,
                                zm,
                                static_cast<SVEArithPredicatedFn>(
                                    &Assembler::fabd),
                                nan_option);
}

void MacroAssembler::Fmul(const ZRegister& zd,
                          const PRegisterM& pg,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPCommutativeArithmeticHelper(zd,
                                pg,
                                zn,
                                zm,
                                static_cast<SVEArithPredicatedFn>(
                                    &Assembler::fmul),
                                nan_option);
}

void MacroAssembler::Fmulx(const ZRegister& zd,
                           const PRegisterM& pg,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPCommutativeArithmeticHelper(zd,
                                pg,
                                zn,
                                zm,
                                static_cast<SVEArithPredicatedFn>(
                                    &Assembler::fmulx),
                                nan_option);
}

void MacroAssembler::Fmax(const ZRegister& zd,
                          const PRegisterM& pg,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPCommutativeArithmeticHelper(zd,
                                pg,
                                zn,
                                zm,
                                static_cast<SVEArithPredicatedFn>(
                                    &Assembler::fmax),
                                nan_option);
}

void MacroAssembler::Fmin(const ZRegister& zd,
                          const PRegisterM& pg,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPCommutativeArithmeticHelper(zd,
                                pg,
                                zn,
                                zm,
                                static_cast<SVEArithPredicatedFn>(
                                    &Assembler::fmin),
                                nan_option);
}

void MacroAssembler::Fmaxnm(const ZRegister& zd,
                            const PRegisterM& pg,
                            const ZRegister& zn,
                            const ZRegister& zm,
                            FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPCommutativeArithmeticHelper(zd,
                                pg,
                                zn,
                                zm,
                                static_cast<SVEArithPredicatedFn>(
                                    &Assembler::fmaxnm),
                                nan_option);
}

void MacroAssembler::Fminnm(const ZRegister& zd,
                            const PRegisterM& pg,
                            const ZRegister& zn,
                            const ZRegister& zm,
                            FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPCommutativeArithmeticHelper(zd,
                                pg,
                                zn,
                                zm,
                                static_cast<SVEArithPredicatedFn>(
                                    &Assembler::fminnm),
                                nan_option);
}

void MacroAssembler::Fdup(const ZRegister& zd, double imm) {
  VIXL_ASSERT(allow_macro_instructions_);

  switch (zd.GetLaneSizeInBits()) {
    case kHRegSize:
      Fdup(zd, Float16(imm));
      break;
    case kSRegSize:
      Fdup(zd, static_cast<float>(imm));
      break;
    case kDRegSize:
      if (IsImmFP64(imm)) {
        SingleEmissionCheckScope guard(this);
        fdup(zd, imm);
      } else {
        Dup(zd, DoubleToRawbits(imm));
      }
      break;
  }
}

void MacroAssembler::Fdup(const ZRegister& zd, float imm) {
  VIXL_ASSERT(allow_macro_instructions_);

  switch (zd.GetLaneSizeInBits()) {
    case kHRegSize:
      Fdup(zd, Float16(imm));
      break;
    case kSRegSize:
      if (IsImmFP32(imm)) {
        SingleEmissionCheckScope guard(this);
        fdup(zd, imm);
      } else {
        Dup(zd, FloatToRawbits(imm));
      }
      break;
    case kDRegSize:
      Fdup(zd, static_cast<double>(imm));
      break;
  }
}

void MacroAssembler::Fdup(const ZRegister& zd, Float16 imm) {
  VIXL_ASSERT(allow_macro_instructions_);

  switch (zd.GetLaneSizeInBits()) {
    case kHRegSize:
      if (IsImmFP16(imm)) {
        SingleEmissionCheckScope guard(this);
        fdup(zd, imm);
      } else {
        Dup(zd, Float16ToRawbits(imm));
      }
      break;
    case kSRegSize:
      Fdup(zd, FPToFloat(imm, kIgnoreDefaultNaN));
      break;
    case kDRegSize:
      Fdup(zd, FPToDouble(imm, kIgnoreDefaultNaN));
      break;
  }
}

void MacroAssembler::Index(const ZRegister& zd,
                           const Operand& start,
                           const Operand& step) {
  class IndexOperand : public Operand {
   public:
    static IndexOperand Prepare(MacroAssembler* masm,
                                UseScratchRegisterScope* temps,
                                const Operand& op,
                                const ZRegister& zd) {
      // Look for encodable immediates.
      int imm;
      if (op.IsImmediate()) {
        if (IntegerOperand(op).TryEncodeAsIntNForLane<5>(zd, &imm)) {
          return IndexOperand(imm);
        }
        Register scratch = temps->AcquireRegisterToHoldLane(zd);
        masm->Mov(scratch, op);
        return IndexOperand(scratch);
      } else {
        // Plain registers can be encoded directly.
        VIXL_ASSERT(op.IsPlainRegister());
        return IndexOperand(op.GetRegister());
      }
    }

    int GetImm5() const {
      int64_t imm = GetImmediate();
      VIXL_ASSERT(IsInt5(imm));
      return static_cast<int>(imm);
    }

   private:
    explicit IndexOperand(const Register& reg) : Operand(reg) {}
    explicit IndexOperand(int64_t imm) : Operand(imm) {}
  };

  UseScratchRegisterScope temps(this);
  IndexOperand start_enc = IndexOperand::Prepare(this, &temps, start, zd);
  IndexOperand step_enc = IndexOperand::Prepare(this, &temps, step, zd);

  SingleEmissionCheckScope guard(this);
  if (start_enc.IsImmediate()) {
    if (step_enc.IsImmediate()) {
      index(zd, start_enc.GetImm5(), step_enc.GetImm5());
    } else {
      index(zd, start_enc.GetImm5(), step_enc.GetRegister());
    }
  } else {
    if (step_enc.IsImmediate()) {
      index(zd, start_enc.GetRegister(), step_enc.GetImm5());
    } else {
      index(zd, start_enc.GetRegister(), step_enc.GetRegister());
    }
  }
}
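
// Illustrative sketch (arbitrary example operands):
//
//   Index(z0.VnD(), 0, 1);     // -> index z0.d, #0, #1
//   Index(z0.VnS(), w1, 100);  // 100 does not fit in 5 signed bits, so it is
//                              // moved into a scratch W register first.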

void MacroAssembler::Insr(const ZRegister& zdn, IntegerOperand imm) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(imm.FitsInLane(zdn));

  if (imm.IsZero()) {
    SingleEmissionCheckScope guard(this);
    insr(zdn, xzr);
    return;
  }

  UseScratchRegisterScope temps(this);
  Register scratch = temps.AcquireRegisterToHoldLane(zdn);

  // TODO: There are many cases where we could optimise immediates, such as by
  // detecting repeating patterns or FP immediates. We should optimise and
  // abstract this for use in other SVE mov-immediate-like macros.
  Mov(scratch, imm);

  SingleEmissionCheckScope guard(this);
  insr(zdn, scratch);
}

void MacroAssembler::Mla(const ZRegister& zd,
                         const PRegisterM& pg,
                         const ZRegister& za,
                         const ZRegister& zn,
                         const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (zd.Aliases(za)) {
    // zda = zda + (zn * zm)
    SingleEmissionCheckScope guard(this);
    mla(zd, pg, zn, zm);
  } else if (zd.Aliases(zn)) {
    // zdn = za + (zdn * zm)
    SingleEmissionCheckScope guard(this);
    mad(zd, pg, zm, za);
  } else if (zd.Aliases(zm)) {
    // Multiplication is commutative, so we can swap zn and zm.
    // zdm = za + (zdm * zn)
    SingleEmissionCheckScope guard(this);
    mad(zd, pg, zn, za);
  } else {
    // zd = za + (zn * zm)
    ExactAssemblyScope guard(this, 2 * kInstructionSize);
    movprfx(zd, pg, za);
    mla(zd, pg, zn, zm);
  }
}
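
// Illustrative sketch (arbitrary example operands, lane-size suffixes
// omitted):
//
//   Mla(z0, p0.Merging(), z0, z1, z2);  // -> mla (accumulate in place)
//   Mla(z0, p0.Merging(), z1, z0, z2);  // -> mad (destination is a multiplicand)
//   Mla(z0, p0.Merging(), z1, z2, z3);  // -> movprfx z0, p0/m, z1; then mla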

void MacroAssembler::Mls(const ZRegister& zd,
                         const PRegisterM& pg,
                         const ZRegister& za,
                         const ZRegister& zn,
                         const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (zd.Aliases(za)) {
    // zda = zda - (zn * zm)
    SingleEmissionCheckScope guard(this);
    mls(zd, pg, zn, zm);
  } else if (zd.Aliases(zn)) {
    // zdn = za - (zdn * zm)
    SingleEmissionCheckScope guard(this);
    msb(zd, pg, zm, za);
  } else if (zd.Aliases(zm)) {
    // Multiplication is commutative, so we can swap zn and zm.
    // zdm = za - (zdm * zn)
    SingleEmissionCheckScope guard(this);
    msb(zd, pg, zn, za);
  } else {
    // zd = za - (zn * zm)
    ExactAssemblyScope guard(this, 2 * kInstructionSize);
    movprfx(zd, pg, za);
    mls(zd, pg, zn, zm);
  }
}

void MacroAssembler::CompareHelper(Condition cond,
                                   const PRegisterWithLaneSize& pd,
                                   const PRegisterZ& pg,
                                   const ZRegister& zn,
                                   IntegerOperand imm) {
  UseScratchRegisterScope temps(this);
  ZRegister zm = temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits());
  Dup(zm, imm);
  SingleEmissionCheckScope guard(this);
  cmp(cond, pd, pg, zn, zm);
}

void MacroAssembler::Pfirst(const PRegisterWithLaneSize& pd,
                            const PRegister& pg,
                            const PRegisterWithLaneSize& pn) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(pd.IsLaneSizeB());
  VIXL_ASSERT(pn.IsLaneSizeB());
  if (pd.Is(pn)) {
    SingleEmissionCheckScope guard(this);
    pfirst(pd, pg, pn);
  } else {
    UseScratchRegisterScope temps(this);
    PRegister temp_pg = pg;
    if (pd.Aliases(pg)) {
      temp_pg = temps.AcquireP();
      Mov(temp_pg.VnB(), pg.VnB());
    }
    Mov(pd, pn);
    SingleEmissionCheckScope guard(this);
    pfirst(pd, temp_pg, pd);
  }
}

void MacroAssembler::Pnext(const PRegisterWithLaneSize& pd,
                           const PRegister& pg,
                           const PRegisterWithLaneSize& pn) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(AreSameFormat(pd, pn));
  if (pd.Is(pn)) {
    SingleEmissionCheckScope guard(this);
    pnext(pd, pg, pn);
  } else {
    UseScratchRegisterScope temps(this);
    PRegister temp_pg = pg;
    if (pd.Aliases(pg)) {
      temp_pg = temps.AcquireP();
      Mov(temp_pg.VnB(), pg.VnB());
    }
    Mov(pd.VnB(), pn.VnB());
    SingleEmissionCheckScope guard(this);
    pnext(pd, temp_pg, pd);
  }
}
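
// Illustrative sketch for Pfirst and Pnext (arbitrary example operands):
// Pnext(p0.VnB(), p1, p0.VnB()) needs only the single instruction because pd
// and pn alias. Otherwise pn is copied into pd first, and if pd also aliases
// the governing predicate, pg is preserved in a scratch P register before the
// copy.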

void MacroAssembler::Ptrue(const PRegisterWithLaneSize& pd,
                           SVEPredicateConstraint pattern,
                           FlagsUpdate s) {
  VIXL_ASSERT(allow_macro_instructions_);
  switch (s) {
    case LeaveFlags:
      Ptrue(pd, pattern);
      return;
    case SetFlags:
      Ptrues(pd, pattern);
      return;
  }
  VIXL_UNREACHABLE();
}

void MacroAssembler::Sdiv(const ZRegister& zd,
                          const PRegisterM& pg,
                          const ZRegister& zn,
                          const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  NoncommutativeArithmeticHelper(zd,
                                 pg,
                                 zn,
                                 zm,
                                 static_cast<SVEArithPredicatedFn>(
                                     &Assembler::sdiv),
                                 static_cast<SVEArithPredicatedFn>(
                                     &Assembler::sdivr));
}

void MacroAssembler::Sub(const ZRegister& zd,
                         IntegerOperand imm,
                         const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);

  int imm8;
  int shift = -1;
  if (imm.TryEncodeAsShiftedUintNForLane<8, 0>(zd, &imm8, &shift) ||
      imm.TryEncodeAsShiftedUintNForLane<8, 8>(zd, &imm8, &shift)) {
    MovprfxHelperScope guard(this, zd, zm);
    subr(zd, zd, imm8, shift);
  } else {
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithLaneSize(zm.GetLaneSizeInBits());
    Dup(scratch, imm);

    SingleEmissionCheckScope guard(this);
    sub(zd, scratch, zm);
  }
}
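
// Illustrative sketch (arbitrary example operands) for the reversed form
// above, which computes imm - zm:
//
//   Sub(z0.VnB(), 100, z1.VnB());  // -> movprfx z0, z1; subr z0.b, z0.b, #100
// A non-encodable immediate is broadcast into a scratch Z register and
// subtracted with the unpredicated vector `sub`.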

void MacroAssembler::Sub(const ZRegister& zd,
                         const PRegisterM& pg,
                         const ZRegister& zn,
                         const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  NoncommutativeArithmeticHelper(zd,
                                 pg,
                                 zn,
                                 zm,
                                 static_cast<SVEArithPredicatedFn>(
                                     &Assembler::sub),
                                 static_cast<SVEArithPredicatedFn>(
                                     &Assembler::subr));
}

void MacroAssembler::Udiv(const ZRegister& zd,
                          const PRegisterM& pg,
                          const ZRegister& zn,
                          const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  NoncommutativeArithmeticHelper(zd,
                                 pg,
                                 zn,
                                 zm,
                                 static_cast<SVEArithPredicatedFn>(
                                     &Assembler::udiv),
                                 static_cast<SVEArithPredicatedFn>(
                                     &Assembler::udivr));
}

void MacroAssembler::SVELoadBroadcastImmHelper(const ZRegister& zt,
                                               const PRegisterZ& pg,
                                               const SVEMemOperand& addr,
                                               SVELoadBroadcastFn fn,
                                               int divisor) {
  VIXL_ASSERT(addr.IsScalarPlusImmediate());
  int64_t imm = addr.GetImmediateOffset();
  if ((imm % divisor == 0) && IsUint6(imm / divisor)) {
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zt, pg, addr);
  } else {
    UseScratchRegisterScope temps(this);
    Register scratch = temps.AcquireX();
    CalculateSVEAddress(scratch, addr, zt);
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zt, pg, SVEMemOperand(scratch));
  }
}

void MacroAssembler::SVELoadStoreScalarImmHelper(const CPURegister& rt,
                                                 const SVEMemOperand& addr,
                                                 SVELoadStoreFn fn) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(rt.IsZRegister() || rt.IsPRegister());

  if (addr.IsPlainScalar() ||
      (addr.IsScalarPlusImmediate() && IsInt9(addr.GetImmediateOffset()) &&
       addr.IsMulVl())) {
    SingleEmissionCheckScope guard(this);
    (this->*fn)(rt, addr);
    return;
  }

  if (addr.IsEquivalentToScalar()) {
    SingleEmissionCheckScope guard(this);
    (this->*fn)(rt, SVEMemOperand(addr.GetScalarBase()));
    return;
  }

  UseScratchRegisterScope temps(this);
  Register scratch = temps.AcquireX();
  CalculateSVEAddress(scratch, addr, rt);
  SingleEmissionCheckScope guard(this);
  (this->*fn)(rt, SVEMemOperand(scratch));
}

template <typename Tg, typename Tf>
void MacroAssembler::SVELoadStoreScalarImmHelper(
    const ZRegister& zt,
    const Tg& pg,
    const SVEMemOperand& addr,
    Tf fn,
    int imm_bits,
    int shift_amount,
    SVEOffsetModifier supported_modifier,
    int vl_divisor_log2) {
  VIXL_ASSERT(allow_macro_instructions_);
  int imm_divisor = 1 << shift_amount;

  if (addr.IsPlainScalar() ||
      (addr.IsScalarPlusImmediate() &&
       IsIntN(imm_bits, addr.GetImmediateOffset() / imm_divisor) &&
       ((addr.GetImmediateOffset() % imm_divisor) == 0) &&
       (addr.GetOffsetModifier() == supported_modifier))) {
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zt, pg, addr);
    return;
  }

  if (addr.IsEquivalentToScalar()) {
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zt, pg, SVEMemOperand(addr.GetScalarBase()));
    return;
  }

  if (addr.IsMulVl() && (supported_modifier != SVE_MUL_VL) &&
      (vl_divisor_log2 == -1)) {
    // We don't handle [x0, #imm, MUL VL] if the in-memory access size is not VL
    // dependent.
    VIXL_UNIMPLEMENTED();
  }

  UseScratchRegisterScope temps(this);
  Register scratch = temps.AcquireX();
  CalculateSVEAddress(scratch, addr, vl_divisor_log2);
  SingleEmissionCheckScope guard(this);
  (this->*fn)(zt, pg, SVEMemOperand(scratch));
}

template <typename Tg, typename Tf>
void MacroAssembler::SVELoadStore1Helper(int msize_in_bytes_log2,
                                         const ZRegister& zt,
                                         const Tg& pg,
                                         const SVEMemOperand& addr,
                                         Tf fn) {
  if (addr.IsPlainScalar() ||
      (addr.IsScalarPlusScalar() && !addr.GetScalarOffset().IsZero() &&
       addr.IsEquivalentToLSL(msize_in_bytes_log2)) ||
      (addr.IsScalarPlusImmediate() && IsInt4(addr.GetImmediateOffset()) &&
       addr.IsMulVl())) {
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zt, pg, addr);
    return;
  }

  if (addr.IsEquivalentToScalar()) {
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zt, pg, SVEMemOperand(addr.GetScalarBase()));
    return;
  }

  if (addr.IsVectorPlusImmediate()) {
    uint64_t offset = addr.GetImmediateOffset();
    if (IsMultiple(offset, (1 << msize_in_bytes_log2)) &&
        IsUint5(offset >> msize_in_bytes_log2)) {
      SingleEmissionCheckScope guard(this);
      (this->*fn)(zt, pg, addr);
      return;
    }
  }

  if (addr.IsScalarPlusVector()) {
    VIXL_ASSERT(addr.IsScatterGather());
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zt, pg, addr);
    return;
  }

  UseScratchRegisterScope temps(this);
  if (addr.IsScatterGather()) {
    // In scatter-gather modes, zt and zn/zm have the same lane size. However,
    // for 32-bit accesses, the result of each lane's address calculation still
    // requires 64 bits; we can't naively use `Adr` for the address calculation
    // because it would truncate each address to 32 bits.

    if (addr.IsVectorPlusImmediate()) {
      // Synthesise the immediate in an X register, then use a
      // scalar-plus-vector access with the original vector.
      Register scratch = temps.AcquireX();
      Mov(scratch, addr.GetImmediateOffset());
      SingleEmissionCheckScope guard(this);
      SVEOffsetModifier om =
          zt.IsLaneSizeS() ? SVE_UXTW : NO_SVE_OFFSET_MODIFIER;
      (this->*fn)(zt, pg, SVEMemOperand(scratch, addr.GetVectorBase(), om));
      return;
    }

    VIXL_UNIMPLEMENTED();
  } else {
    Register scratch = temps.AcquireX();
    // TODO: If we have an immediate offset that is a multiple of
    // msize_in_bytes, we can use Rdvl/Rdpl and a scalar-plus-scalar form to
    // save an instruction.
    int vl_divisor_log2 = zt.GetLaneSizeInBytesLog2() - msize_in_bytes_log2;
    CalculateSVEAddress(scratch, addr, vl_divisor_log2);
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zt, pg, SVEMemOperand(scratch));
  }
}
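
// Illustrative sketch (arbitrary example operands):
//
//   Ld1w(z0.VnS(), p0.Zeroing(), SVEMemOperand(x0, 3, SVE_MUL_VL));
//       // Encodable as-is: ld1w { z0.s }, p0/z, [x0, #3, mul vl]
//   Ld1w(z0.VnS(), p0.Zeroing(), SVEMemOperand(x0, 42));
//       // The plain byte offset is not encodable, so the address is computed
//       // into a scratch X register first and the load uses [scratch].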
1250 
1251 template <typename Tf>
SVELoadFFHelper(int msize_in_bytes_log2,const ZRegister & zt,const PRegisterZ & pg,const SVEMemOperand & addr,Tf fn)1252 void MacroAssembler::SVELoadFFHelper(int msize_in_bytes_log2,
1253                                      const ZRegister& zt,
1254                                      const PRegisterZ& pg,
1255                                      const SVEMemOperand& addr,
1256                                      Tf fn) {
1257   if (addr.IsScatterGather()) {
1258     // Scatter-gather first-fault loads share encodings with normal loads.
1259     SVELoadStore1Helper(msize_in_bytes_log2, zt, pg, addr, fn);
1260     return;
1261   }
1262 
1263   // Contiguous first-faulting loads have no scalar-plus-immediate form at all,
1264   // so we don't do immediate synthesis.
1265 
1266   // We cannot currently distinguish "[x0]" from "[x0, #0]", and this
1267   // is not "scalar-plus-scalar", so we have to permit `IsPlainScalar()` here.
1268   if (addr.IsPlainScalar() || (addr.IsScalarPlusScalar() &&
1269                                addr.IsEquivalentToLSL(msize_in_bytes_log2))) {
1270     SingleEmissionCheckScope guard(this);
1271     (this->*fn)(zt, pg, addr);
1272     return;
1273   }
1274 
1275   VIXL_UNIMPLEMENTED();
1276 }

void MacroAssembler::Ld1b(const ZRegister& zt,
                          const PRegisterZ& pg,
                          const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStore1Helper(kBRegSizeInBytesLog2,
                      zt,
                      pg,
                      addr,
                      static_cast<SVELoad1Fn>(&Assembler::ld1b));
}

void MacroAssembler::Ld1h(const ZRegister& zt,
                          const PRegisterZ& pg,
                          const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStore1Helper(kHRegSizeInBytesLog2,
                      zt,
                      pg,
                      addr,
                      static_cast<SVELoad1Fn>(&Assembler::ld1h));
}

void MacroAssembler::Ld1w(const ZRegister& zt,
                          const PRegisterZ& pg,
                          const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStore1Helper(kWRegSizeInBytesLog2,
                      zt,
                      pg,
                      addr,
                      static_cast<SVELoad1Fn>(&Assembler::ld1w));
}

void MacroAssembler::Ld1d(const ZRegister& zt,
                          const PRegisterZ& pg,
                          const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStore1Helper(kDRegSizeInBytesLog2,
                      zt,
                      pg,
                      addr,
                      static_cast<SVELoad1Fn>(&Assembler::ld1d));
}

void MacroAssembler::Ld1sb(const ZRegister& zt,
                           const PRegisterZ& pg,
                           const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStore1Helper(kBRegSizeInBytesLog2,
                      zt,
                      pg,
                      addr,
                      static_cast<SVELoad1Fn>(&Assembler::ld1sb));
}

void MacroAssembler::Ld1sh(const ZRegister& zt,
                           const PRegisterZ& pg,
                           const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStore1Helper(kHRegSizeInBytesLog2,
                      zt,
                      pg,
                      addr,
                      static_cast<SVELoad1Fn>(&Assembler::ld1sh));
}

void MacroAssembler::Ld1sw(const ZRegister& zt,
                           const PRegisterZ& pg,
                           const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStore1Helper(kSRegSizeInBytesLog2,
                      zt,
                      pg,
                      addr,
                      static_cast<SVELoad1Fn>(&Assembler::ld1sw));
}

void MacroAssembler::St1b(const ZRegister& zt,
                          const PRegister& pg,
                          const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStore1Helper(kBRegSizeInBytesLog2,
                      zt,
                      pg,
                      addr,
                      static_cast<SVEStore1Fn>(&Assembler::st1b));
}

void MacroAssembler::St1h(const ZRegister& zt,
                          const PRegister& pg,
                          const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStore1Helper(kHRegSizeInBytesLog2,
                      zt,
                      pg,
                      addr,
                      static_cast<SVEStore1Fn>(&Assembler::st1h));
}

void MacroAssembler::St1w(const ZRegister& zt,
                          const PRegister& pg,
                          const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStore1Helper(kSRegSizeInBytesLog2,
                      zt,
                      pg,
                      addr,
                      static_cast<SVEStore1Fn>(&Assembler::st1w));
}

void MacroAssembler::St1d(const ZRegister& zt,
                          const PRegister& pg,
                          const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStore1Helper(kDRegSizeInBytesLog2,
                      zt,
                      pg,
                      addr,
                      static_cast<SVEStore1Fn>(&Assembler::st1d));
}

void MacroAssembler::Ldff1b(const ZRegister& zt,
                            const PRegisterZ& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadFFHelper(kBRegSizeInBytesLog2,
                  zt,
                  pg,
                  addr,
                  static_cast<SVELoad1Fn>(&Assembler::ldff1b));
}

void MacroAssembler::Ldff1h(const ZRegister& zt,
                            const PRegisterZ& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadFFHelper(kHRegSizeInBytesLog2,
                  zt,
                  pg,
                  addr,
                  static_cast<SVELoad1Fn>(&Assembler::ldff1h));
}

void MacroAssembler::Ldff1w(const ZRegister& zt,
                            const PRegisterZ& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadFFHelper(kSRegSizeInBytesLog2,
                  zt,
                  pg,
                  addr,
                  static_cast<SVELoad1Fn>(&Assembler::ldff1w));
}

void MacroAssembler::Ldff1d(const ZRegister& zt,
                            const PRegisterZ& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadFFHelper(kDRegSizeInBytesLog2,
                  zt,
                  pg,
                  addr,
                  static_cast<SVELoad1Fn>(&Assembler::ldff1d));
}

void MacroAssembler::Ldff1sb(const ZRegister& zt,
                             const PRegisterZ& pg,
                             const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadFFHelper(kBRegSizeInBytesLog2,
                  zt,
                  pg,
                  addr,
                  static_cast<SVELoad1Fn>(&Assembler::ldff1sb));
}

void MacroAssembler::Ldff1sh(const ZRegister& zt,
                             const PRegisterZ& pg,
                             const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadFFHelper(kHRegSizeInBytesLog2,
                  zt,
                  pg,
                  addr,
                  static_cast<SVELoad1Fn>(&Assembler::ldff1sh));
}

void MacroAssembler::Ldff1sw(const ZRegister& zt,
                             const PRegisterZ& pg,
                             const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadFFHelper(kSRegSizeInBytesLog2,
                  zt,
                  pg,
                  addr,
                  static_cast<SVELoad1Fn>(&Assembler::ldff1sw));
}

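// Added commentary (not part of the original source): LD1RQ<T> loads and
// replicates a 16-byte quadword, and its scalar-plus-immediate form takes a
// signed multiple of 16 bytes. The trailing arguments passed to
// SVELoadStoreScalarImmHelper below (4, 4, NO_SVE_OFFSET_MODIFIER, -1) are
// assumed to describe that encoding: a four-bit immediate scaled by 16, with
// no vector-length scaling of the offset. An illustrative use:
//   __ Ld1rqb(z0.VnB(), p0.Zeroing(), SVEMemOperand(x0, 16));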
void MacroAssembler::Ld1rqb(const ZRegister& zt,
                            const PRegisterZ& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStoreScalarImmHelper(zt,
                              pg,
                              addr,
                              &MacroAssembler::ld1rqb,
                              4,
                              4,
                              NO_SVE_OFFSET_MODIFIER,
                              -1);
}

void MacroAssembler::Ld1rqd(const ZRegister& zt,
                            const PRegisterZ& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStoreScalarImmHelper(zt,
                              pg,
                              addr,
                              &MacroAssembler::ld1rqd,
                              4,
                              4,
                              NO_SVE_OFFSET_MODIFIER,
                              -1);
}

void MacroAssembler::Ld1rqh(const ZRegister& zt,
                            const PRegisterZ& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStoreScalarImmHelper(zt,
                              pg,
                              addr,
                              &MacroAssembler::ld1rqh,
                              4,
                              4,
                              NO_SVE_OFFSET_MODIFIER,
                              -1);
}

void MacroAssembler::Ld1rqw(const ZRegister& zt,
                            const PRegisterZ& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStoreScalarImmHelper(zt,
                              pg,
                              addr,
                              &MacroAssembler::ld1rqw,
                              4,
                              4,
                              NO_SVE_OFFSET_MODIFIER,
                              -1);
}

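// Added commentary (not part of the original source): the scalar-plus-
// immediate form of the non-temporal loads and stores below takes a signed
// offset in multiples of the vector length; the (4, 0, SVE_MUL_VL) arguments
// are assumed to describe that encoding (a four-bit immediate, unscaled, with
// a "mul vl" modifier). For example:
//   __ Ldnt1w(z0.VnS(), p0.Zeroing(), SVEMemOperand(x0, 2, SVE_MUL_VL));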
void MacroAssembler::Ldnt1b(const ZRegister& zt,
                            const PRegisterZ& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStoreScalarImmHelper(zt,
                              pg,
                              addr,
                              &MacroAssembler::ldnt1b,
                              4,
                              0,
                              SVE_MUL_VL);
}

void MacroAssembler::Ldnt1d(const ZRegister& zt,
                            const PRegisterZ& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStoreScalarImmHelper(zt,
                              pg,
                              addr,
                              &MacroAssembler::ldnt1d,
                              4,
                              0,
                              SVE_MUL_VL);
}

void MacroAssembler::Ldnt1h(const ZRegister& zt,
                            const PRegisterZ& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStoreScalarImmHelper(zt,
                              pg,
                              addr,
                              &MacroAssembler::ldnt1h,
                              4,
                              0,
                              SVE_MUL_VL);
}

void MacroAssembler::Ldnt1w(const ZRegister& zt,
                            const PRegisterZ& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStoreScalarImmHelper(zt,
                              pg,
                              addr,
                              &MacroAssembler::ldnt1w,
                              4,
                              0,
                              SVE_MUL_VL);
}

void MacroAssembler::Stnt1b(const ZRegister& zt,
                            const PRegister& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStoreScalarImmHelper(zt,
                              pg,
                              addr,
                              &MacroAssembler::stnt1b,
                              4,
                              0,
                              SVE_MUL_VL);
}

void MacroAssembler::Stnt1d(const ZRegister& zt,
                            const PRegister& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStoreScalarImmHelper(zt,
                              pg,
                              addr,
                              &MacroAssembler::stnt1d,
                              4,
                              0,
                              SVE_MUL_VL);
}

void MacroAssembler::Stnt1h(const ZRegister& zt,
                            const PRegister& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStoreScalarImmHelper(zt,
                              pg,
                              addr,
                              &MacroAssembler::stnt1h,
                              4,
                              0,
                              SVE_MUL_VL);
}

void MacroAssembler::Stnt1w(const ZRegister& zt,
                            const PRegister& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStoreScalarImmHelper(zt,
                              pg,
                              addr,
                              &MacroAssembler::stnt1w,
                              4,
                              0,
                              SVE_MUL_VL);
}

void MacroAssembler::SVESdotUdotIndexHelper(IntArithIndexFn fn,
                                            const ZRegister& zd,
                                            const ZRegister& za,
                                            const ZRegister& zn,
                                            const ZRegister& zm,
                                            int index) {
  if (zd.Aliases(za)) {
    // zda = zda + (zn . zm[index])
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zd, zn, zm, index);

  } else if (zd.Aliases(zn) || zd.Aliases(zm)) {
    // zdn = za + (zdn . zm[index])
    // zdm = za + (zn . zdm[index])
    // zdnm = za + (zdnm . zdnm[index])
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, za);
      (this->*fn)(scratch, zn, zm, index);
    }

    Mov(zd, scratch);
  } else {
    // zd = za + (zn . zm[index])
    MovprfxHelperScope guard(this, zd, za);
    (this->*fn)(zd, zn, zm, index);
  }
}

void MacroAssembler::SVESdotUdotHelper(IntArithFn fn,
                                       const ZRegister& zd,
                                       const ZRegister& za,
                                       const ZRegister& zn,
                                       const ZRegister& zm) {
  if (zd.Aliases(za)) {
    // zda = zda + (zn . zm)
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zd, zn, zm);

  } else if (zd.Aliases(zn) || zd.Aliases(zm)) {
    // zdn = za + (zdn . zm)
    // zdm = za + (zn . zdm)
    // zdnm = za + (zdnm . zdnm)
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, za);
      (this->*fn)(scratch, zn, zm);
    }

    Mov(zd, scratch);
  } else {
    // zd = za + (zn . zm)
    MovprfxHelperScope guard(this, zd, za);
    (this->*fn)(zd, zn, zm);
  }
}
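
// Added commentary (not part of the original source): a sketch of what the
// helpers above emit when the destination aliases a multiplicand. For
// example, with illustrative registers,
//   __ Sdot(z0.VnS(), z1.VnS(), z0.VnB(), z2.VnB());
// cannot be encoded directly because sdot accumulates destructively into its
// first operand, so the helper builds the result in a scratch register
// (movprfx from za, then sdot) and moves it into z0 afterwards.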

void MacroAssembler::Fscale(const ZRegister& zd,
                            const PRegisterM& pg,
                            const ZRegister& zn,
                            const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (zd.Aliases(zm) && !zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zm);
    Mov(scratch, zm);
    MovprfxHelperScope guard(this, zd, pg, zn);
    fscale(zd, pg, zd, scratch);
  } else {
    MovprfxHelperScope guard(this, zd, pg, zn);
    fscale(zd, pg, zd, zm);
  }
}
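
// Added commentary (not part of the original source): fscale is destructive
// in its first source operand, so when zd aliases only zm the wrapper above
// first copies zm to a scratch register and then seeds zd from zn with
// movprfx, keeping a call such as
//   __ Fscale(z0.VnD(), p0.Merging(), z1.VnD(), z0.VnD());
// correct. Ftmad, further below, uses the same approach.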

void MacroAssembler::Sdot(const ZRegister& zd,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVESdotUdotHelper(&Assembler::sdot, zd, za, zn, zm);
}

void MacroAssembler::Sdot(const ZRegister& zd,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          int index) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVESdotUdotIndexHelper(&Assembler::sdot, zd, za, zn, zm, index);
}

void MacroAssembler::Udot(const ZRegister& zd,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVESdotUdotHelper(&Assembler::udot, zd, za, zn, zm);
}

void MacroAssembler::Udot(const ZRegister& zd,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          int index) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVESdotUdotIndexHelper(&Assembler::udot, zd, za, zn, zm, index);
}

void MacroAssembler::FPMulAddHelper(const ZRegister& zd,
                                    const PRegisterM& pg,
                                    const ZRegister& za,
                                    const ZRegister& zn,
                                    const ZRegister& zm,
                                    SVEMulAddPredicatedZdaFn fn_zda,
                                    SVEMulAddPredicatedZdnFn fn_zdn,
                                    FPMacroNaNPropagationOption nan_option) {
  ResolveFPNaNPropagationOption(&nan_option);

  if (zd.Aliases(za)) {
    // zda = (-)zda + ((-)zn * zm) for fmla, fmls, fnmla and fnmls.
    SingleEmissionCheckScope guard(this);
    (this->*fn_zda)(zd, pg, zn, zm);
  } else if (zd.Aliases(zn)) {
    // zdn = (-)za + ((-)zdn * zm) for fmad, fmsb, fnmad and fnmsb.
    SingleEmissionCheckScope guard(this);
    (this->*fn_zdn)(zd, pg, zm, za);
  } else if (zd.Aliases(zm)) {
    switch (nan_option) {
      case FastNaNPropagation: {
        // In fast mode, we treat multiplication as commutative, so we can
        // swap zn and zm.
        // zdm = (-)za + ((-)zdm * zn) for fmad, fmsb, fnmad and fnmsb.
        SingleEmissionCheckScope guard(this);
        (this->*fn_zdn)(zd, pg, zn, za);
        return;
      }
      case StrictNaNPropagation: {
        UseScratchRegisterScope temps(this);
        // Use a scratch register to keep the argument order exactly as
        // specified.
        ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zn);
        {
          MovprfxHelperScope guard(this, scratch, pg, za);
          // scratch = (-)za + ((-)zn * zm)
          (this->*fn_zda)(scratch, pg, zn, zm);
        }
        Mov(zd, scratch);
        return;
      }
      case NoFPMacroNaNPropagationSelected:
        VIXL_UNREACHABLE();
        return;
    }
  } else {
    // zd = (-)za + ((-)zn * zm) for fmla, fmls, fnmla and fnmls.
    MovprfxHelperScope guard(this, zd, pg, za);
    (this->*fn_zda)(zd, pg, zn, zm);
  }
}
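
// Added commentary (not part of the original source): an illustration of the
// nan_option handling above. When zd aliases zm, a call such as
//   __ Fmla(z0.VnS(), p0.Merging(), z1.VnS(), z2.VnS(), z0.VnS(),
//           FastNaNPropagation);
// can be emitted as a single fmad with zn and zm swapped, whereas
// StrictNaNPropagation preserves the architectural operand order by computing
// into a scratch register (movprfx + fmla) and then moving the result to zd.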

void MacroAssembler::FPMulAddIndexHelper(SVEMulAddIndexFn fn,
                                         const ZRegister& zd,
                                         const ZRegister& za,
                                         const ZRegister& zn,
                                         const ZRegister& zm,
                                         int index) {
  if (zd.Aliases(za)) {
    // zda = zda + (zn * zm[i])
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zd, zn, zm, index);

  } else if (zd.Aliases(zn) || zd.Aliases(zm)) {
    // zdn = za + (zdn * zm[i])
    // zdm = za + (zn * zdm[i])
    // zdnm = za + (zdnm * zdnm[i])
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, za);
      (this->*fn)(scratch, zn, zm, index);
    }
    Mov(zd, scratch);
  } else {
    // zd = za + (zn * zm[i])
    MovprfxHelperScope guard(this, zd, za);
    (this->*fn)(zd, zn, zm, index);
  }
}
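
// Added commentary (not part of the original source): the indexed
// multiply-add helper above follows the same aliasing strategy as the
// dot-product helpers; the unpredicated indexed forms accumulate
// destructively, so for a call such as
//   __ Fmla(z2.VnH(), z0.VnH(), z1.VnH(), z2.VnH(), 3);
// (zd aliasing zm) the result is built in a scratch register and then moved
// into zd.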

void MacroAssembler::Fmla(const ZRegister& zd,
                          const PRegisterM& pg,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPMulAddHelper(zd,
                 pg,
                 za,
                 zn,
                 zm,
                 &Assembler::fmla,
                 &Assembler::fmad,
                 nan_option);
}

void MacroAssembler::Fmla(const ZRegister& zd,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          int index) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPMulAddIndexHelper(&Assembler::fmla, zd, za, zn, zm, index);
}

void MacroAssembler::Fmls(const ZRegister& zd,
                          const PRegisterM& pg,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPMulAddHelper(zd,
                 pg,
                 za,
                 zn,
                 zm,
                 &Assembler::fmls,
                 &Assembler::fmsb,
                 nan_option);
}

void MacroAssembler::Fmls(const ZRegister& zd,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          int index) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPMulAddIndexHelper(&Assembler::fmls, zd, za, zn, zm, index);
}

void MacroAssembler::Fnmla(const ZRegister& zd,
                           const PRegisterM& pg,
                           const ZRegister& za,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPMulAddHelper(zd,
                 pg,
                 za,
                 zn,
                 zm,
                 &Assembler::fnmla,
                 &Assembler::fnmad,
                 nan_option);
}

void MacroAssembler::Fnmls(const ZRegister& zd,
                           const PRegisterM& pg,
                           const ZRegister& za,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPMulAddHelper(zd,
                 pg,
                 za,
                 zn,
                 zm,
                 &Assembler::fnmls,
                 &Assembler::fnmsb,
                 nan_option);
}

void MacroAssembler::Ftmad(const ZRegister& zd,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           int imm3) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (zd.Aliases(zm) && !zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zm);
    Mov(scratch, zm);
    MovprfxHelperScope guard(this, zd, zn);
    ftmad(zd, zd, scratch, imm3);
  } else {
    MovprfxHelperScope guard(this, zd, zn);
    ftmad(zd, zd, zm, imm3);
  }
}

void MacroAssembler::Fcadd(const ZRegister& zd,
                           const PRegisterM& pg,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           int rot) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (zd.Aliases(zm) && !zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, pg, zn);
      fcadd(scratch, pg, scratch, zm, rot);
    }
    Mov(zd, scratch);
  } else {
    MovprfxHelperScope guard(this, zd, pg, zn);
    fcadd(zd, pg, zd, zm, rot);
  }
}

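// Added commentary (not part of the original source): Fcadd above and the
// Ext, Splice, Clasta and Clastb wrappers below all wrap destructive
// instructions. When the destination aliases the second source operand they
// build the result in a scratch register seeded from zn and move it into zd
// afterwards; otherwise they seed zd from zn and emit the destructive form in
// place.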
void MacroAssembler::Ext(const ZRegister& zd,
                         const ZRegister& zn,
                         const ZRegister& zm,
                         unsigned offset) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (zd.Aliases(zm) && !zd.Aliases(zn)) {
    // zd = ext(zn, zd, offset)
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, zn);
      ext(scratch, scratch, zm, offset);
    }
    Mov(zd, scratch);
  } else {
    // zd = ext(zn, zm, offset)
    // zd = ext(zd, zd, offset)
    MovprfxHelperScope guard(this, zd, zn);
    ext(zd, zd, zm, offset);
  }
}

void MacroAssembler::Splice(const ZRegister& zd,
                            const PRegister& pg,
                            const ZRegister& zn,
                            const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (zd.Aliases(zm) && !zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, zn);
      splice(scratch, pg, scratch, zm);
    }
    Mov(zd, scratch);
  } else {
    MovprfxHelperScope guard(this, zd, zn);
    splice(zd, pg, zd, zm);
  }
}

void MacroAssembler::Clasta(const ZRegister& zd,
                            const PRegister& pg,
                            const ZRegister& zn,
                            const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (zd.Aliases(zm) && !zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, zn);
      clasta(scratch, pg, scratch, zm);
    }
    Mov(zd, scratch);
  } else {
    MovprfxHelperScope guard(this, zd, zn);
    clasta(zd, pg, zd, zm);
  }
}

void MacroAssembler::Clastb(const ZRegister& zd,
                            const PRegister& pg,
                            const ZRegister& zn,
                            const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (zd.Aliases(zm) && !zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, zn);
      clastb(scratch, pg, scratch, zm);
    }
    Mov(zd, scratch);
  } else {
    MovprfxHelperScope guard(this, zd, zn);
    clastb(zd, pg, zd, zm);
  }
}

}  // namespace aarch64
}  // namespace vixl