//===- subzero/src/IceTargetLoweringX86BaseImpl.h - x86 lowering -*- C++ -*-==//
//
//                        The Subzero Code Generator
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// \file
/// \brief Implements the TargetLoweringX86Base class, which consists almost
/// entirely of the lowering sequence for each high-level instruction.
///
//===----------------------------------------------------------------------===//

#ifndef SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H
#define SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H

#include "IceCfg.h"
#include "IceCfgNode.h"
#include "IceClFlags.h"
#include "IceDefs.h"
#include "IceELFObjectWriter.h"
#include "IceGlobalInits.h"
#include "IceInstVarIter.h"
#include "IceInstX86Base.h"
#include "IceLiveness.h"
#include "IceOperand.h"
#include "IcePhiLoweringImpl.h"
#include "IceUtils.h"
#include "IceVariableSplitting.h"

#include "llvm/Support/MathExtras.h"

#include <stack>

namespace Ice {
namespace X86 {
template <typename T> struct PoolTypeConverter {};

template <> struct PoolTypeConverter<float> {
  using PrimitiveIntType = uint32_t;
  using IceType = ConstantFloat;
  static const Type Ty = IceType_f32;
  static const char *TypeName;
  static const char *AsmTag;
  static const char *PrintfString;
};

template <> struct PoolTypeConverter<double> {
  using PrimitiveIntType = uint64_t;
  using IceType = ConstantDouble;
  static const Type Ty = IceType_f64;
  static const char *TypeName;
  static const char *AsmTag;
  static const char *PrintfString;
};

// Add converter for int type constant pooling
template <> struct PoolTypeConverter<uint32_t> {
  using PrimitiveIntType = uint32_t;
  using IceType = ConstantInteger32;
  static const Type Ty = IceType_i32;
  static const char *TypeName;
  static const char *AsmTag;
  static const char *PrintfString;
};

// Add converter for int type constant pooling
template <> struct PoolTypeConverter<uint16_t> {
  using PrimitiveIntType = uint32_t;
  using IceType = ConstantInteger32;
  static const Type Ty = IceType_i16;
  static const char *TypeName;
  static const char *AsmTag;
  static const char *PrintfString;
};

// Add converter for int type constant pooling
template <> struct PoolTypeConverter<uint8_t> {
  using PrimitiveIntType = uint32_t;
  using IceType = ConstantInteger32;
  static const Type Ty = IceType_i8;
  static const char *TypeName;
  static const char *AsmTag;
  static const char *PrintfString;
};
} // end of namespace X86

namespace X86NAMESPACE {

// The Microsoft x64 ABI requires the caller to allocate a minimum 32 byte
// "shadow store" (aka "home space") so that the callee may copy the 4
// register args to it.
template <typename Traits> SizeT getShadowStoreSize() {
#if defined(SUBZERO_USE_MICROSOFT_ABI)
  static const SizeT ShadowStoreSize =
      Traits::Is64Bit ? 4 * typeWidthInBytes(Traits::WordType) : 0;
  return ShadowStoreSize;
#else
  return 0;
#endif
}
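
// For illustration only, a call under the Microsoft x64 ABI reserves the
// 32-byte shadow store like so (a sketch, not code emitted by this function):
//   sub  rsp, 32   ; caller allocates home space for rcx/rdx/r8/r9
//   call callee    ; callee may spill its register args into that space
//   add  rsp, 32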

using Utils::BoolFlagSaver;

template <typename Traits> class BoolFoldingEntry {
  BoolFoldingEntry(const BoolFoldingEntry &) = delete;

public:
  BoolFoldingEntry() = default;
  explicit BoolFoldingEntry(Inst *I);
  BoolFoldingEntry &operator=(const BoolFoldingEntry &) = default;
  /// Instr is the instruction producing the i1-type variable of interest.
  Inst *Instr = nullptr;
  /// IsComplex is the cached result of BoolFolding::hasComplexLowering(Instr).
  bool IsComplex = false;
  /// IsLiveOut is initialized conservatively to true, and is set to false when
  /// we encounter an instruction that ends Var's live range. We disable the
  /// folding optimization when Var is live beyond this basic block. Note that
  /// if liveness analysis is not performed (e.g. in Om1 mode), IsLiveOut will
  /// always be true and the folding optimization will never be performed.
  bool IsLiveOut = true;
  // NumUses counts the number of times Var is used as a source operand in the
  // basic block. If IsComplex is true and there is more than one use of Var,
  // then the folding optimization is disabled for Var.
  uint32_t NumUses = 0;
};

template <typename Traits> class BoolFolding {
public:
  enum BoolFoldingProducerKind {
    PK_None,
    // TODO(jpp): PK_Icmp32 is no longer meaningful. Rename to PK_IcmpNative.
    PK_Icmp32,
    PK_Icmp64,
    PK_Fcmp,
    PK_Trunc,
    PK_Arith // A flag-setting arithmetic instruction.
  };

  /// Currently the actual enum values are not used (other than CK_None), but we
  /// go ahead and produce them anyway for symmetry with the
  /// BoolFoldingProducerKind.
  enum BoolFoldingConsumerKind { CK_None, CK_Br, CK_Select, CK_Sext, CK_Zext };

private:
  BoolFolding(const BoolFolding &) = delete;
  BoolFolding &operator=(const BoolFolding &) = delete;

public:
  BoolFolding() = default;
  static BoolFoldingProducerKind getProducerKind(const Inst *Instr);
  static BoolFoldingConsumerKind getConsumerKind(const Inst *Instr);
  static bool hasComplexLowering(const Inst *Instr);
  static bool isValidFolding(BoolFoldingProducerKind ProducerKind,
                             BoolFoldingConsumerKind ConsumerKind);
  void init(CfgNode *Node);
  const Inst *getProducerFor(const Operand *Opnd) const;
  void dump(const Cfg *Func) const;

private:
  /// Returns true if Producers contains a valid entry for the given VarNum.
  bool containsValid(SizeT VarNum) const {
    auto Element = Producers.find(VarNum);
    return Element != Producers.end() && Element->second.Instr != nullptr;
  }
  void setInvalid(SizeT VarNum) { Producers[VarNum].Instr = nullptr; }
  void invalidateProducersOnStore(const Inst *Instr);
  /// Producers maps Variable::Number to a BoolFoldingEntry.
  CfgUnorderedMap<SizeT, BoolFoldingEntry<Traits>> Producers;
};
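
// For illustration, a typical fold pairs an icmp producer with a br consumer
// (a sketch of the intent, not literal output):
//   %cond = icmp slt i32 %a, %b        ; producer (PK_Icmp32)
//   br i1 %cond, label %T, label %F    ; consumer (CK_Br)
// can lower to a fused compare-and-branch
//   cmp a, b
//   jl T
// instead of first materializing %cond in a register with setcc and
// re-testing it.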

template <typename Traits>
BoolFoldingEntry<Traits>::BoolFoldingEntry(Inst *I)
    : Instr(I), IsComplex(BoolFolding<Traits>::hasComplexLowering(I)) {}

template <typename Traits>
typename BoolFolding<Traits>::BoolFoldingProducerKind
BoolFolding<Traits>::getProducerKind(const Inst *Instr) {
  if (llvm::isa<InstIcmp>(Instr)) {
    if (Traits::Is64Bit || Instr->getSrc(0)->getType() != IceType_i64)
      return PK_Icmp32;
    return PK_Icmp64;
  }
  if (llvm::isa<InstFcmp>(Instr))
    return PK_Fcmp;
  if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Instr)) {
    if (Traits::Is64Bit || Arith->getSrc(0)->getType() != IceType_i64) {
      switch (Arith->getOp()) {
      default:
        return PK_None;
      case InstArithmetic::And:
      case InstArithmetic::Or:
        return PK_Arith;
      }
    }
  }
  return PK_None; // TODO(stichnot): remove this

  if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
    switch (Cast->getCastKind()) {
    default:
      return PK_None;
    case InstCast::Trunc:
      return PK_Trunc;
    }
  }
  return PK_None;
}

template <typename Traits>
typename BoolFolding<Traits>::BoolFoldingConsumerKind
BoolFolding<Traits>::getConsumerKind(const Inst *Instr) {
  if (llvm::isa<InstBr>(Instr))
    return CK_Br;
  if (llvm::isa<InstSelect>(Instr))
    return CK_Select;
  return CK_None; // TODO(stichnot): remove this

  if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
    switch (Cast->getCastKind()) {
    default:
      return CK_None;
    case InstCast::Sext:
      return CK_Sext;
    case InstCast::Zext:
      return CK_Zext;
    }
  }
  return CK_None;
}
/// Returns true if the producing instruction has a "complex" lowering sequence.
/// This generally means that its lowering sequence requires more than one
/// conditional branch, namely 64-bit integer compares and some floating-point
/// compares. When this is true, and there is more than one consumer, we prefer
/// to disable the folding optimization, since disabling it minimizes the
/// number of branches.
template <typename Traits>
bool BoolFolding<Traits>::hasComplexLowering(const Inst *Instr) {
  switch (getProducerKind(Instr)) {
  default:
    return false;
  case PK_Icmp64:
    return !Traits::Is64Bit;
  case PK_Fcmp:
    return Traits::TableFcmp[llvm::cast<InstFcmp>(Instr)->getCondition()].C2 !=
           Traits::Cond::Br_None;
  }
}
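
// For example, on a 32-bit target an i64 compare-and-branch cannot be done
// with a single cmp; a sketch of the kind of sequence involved (illustrative
// only, not the exact lowering):
//   cmp  a.hi, b.hi
//   jne  decided
//   cmp  a.lo, b.lo
//   ...
// Duplicating such a sequence once per consumer would add branches, which is
// why a complex producer with multiple uses is not folded.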

template <typename Traits>
bool BoolFolding<Traits>::isValidFolding(
    typename BoolFolding<Traits>::BoolFoldingProducerKind ProducerKind,
    typename BoolFolding<Traits>::BoolFoldingConsumerKind ConsumerKind) {
  switch (ProducerKind) {
  default:
    return false;
  case PK_Icmp32:
  case PK_Icmp64:
  case PK_Fcmp:
    return (ConsumerKind == CK_Br) || (ConsumerKind == CK_Select);
  case PK_Arith:
    return ConsumerKind == CK_Br;
  }
}

template <typename Traits> void BoolFolding<Traits>::init(CfgNode *Node) {
  Producers.clear();
  for (Inst &Instr : Node->getInsts()) {
    if (Instr.isDeleted())
      continue;
    invalidateProducersOnStore(&Instr);
    // Check whether Instr is a valid producer.
    Variable *Var = Instr.getDest();
    if (Var) { // only consider instructions with an actual dest var
      if (isBooleanType(Var->getType())) {        // only bool-type dest vars
        if (getProducerKind(&Instr) != PK_None) { // white-listed instructions
          Producers[Var->getIndex()] = BoolFoldingEntry<Traits>(&Instr);
        }
      }
    }
    // Check each src variable against the map.
    FOREACH_VAR_IN_INST(Var, Instr) {
      SizeT VarNum = Var->getIndex();
      if (!containsValid(VarNum))
        continue;
      // All valid consumers use Var as the first source operand
      if (IndexOfVarOperandInInst(Var) != 0) {
        setInvalid(VarNum);
        continue;
      }
      // Consumer instructions must be white-listed
      typename BoolFolding<Traits>::BoolFoldingConsumerKind ConsumerKind =
          getConsumerKind(&Instr);
      if (ConsumerKind == CK_None) {
        setInvalid(VarNum);
        continue;
      }
      typename BoolFolding<Traits>::BoolFoldingProducerKind ProducerKind =
          getProducerKind(Producers[VarNum].Instr);
      if (!isValidFolding(ProducerKind, ConsumerKind)) {
        setInvalid(VarNum);
        continue;
      }
      // Avoid creating multiple copies of complex producer instructions.
      if (Producers[VarNum].IsComplex && Producers[VarNum].NumUses > 0) {
        setInvalid(VarNum);
        continue;
      }
      ++Producers[VarNum].NumUses;
      if (Instr.isLastUse(Var)) {
        Producers[VarNum].IsLiveOut = false;
      }
    }
  }
  for (auto &I : Producers) {
    // Ignore entries previously marked invalid.
    if (I.second.Instr == nullptr)
      continue;
    // Disable the producer if its dest may be live beyond this block.
    if (I.second.IsLiveOut) {
      setInvalid(I.first);
      continue;
    }
    // Mark as "dead" rather than outright deleting. This is so that other
    // peephole style optimizations during or before lowering have access to
    // this instruction in undeleted form. See for example
    // tryOptimizedCmpxchgCmpBr().
    I.second.Instr->setDead();
  }
}

template <typename Traits>
const Inst *BoolFolding<Traits>::getProducerFor(const Operand *Opnd) const {
  auto *Var = llvm::dyn_cast<const Variable>(Opnd);
  if (Var == nullptr)
    return nullptr;
  SizeT VarNum = Var->getIndex();
  auto Element = Producers.find(VarNum);
  if (Element == Producers.end())
    return nullptr;
  return Element->second.Instr;
}

template <typename Traits>
void BoolFolding<Traits>::dump(const Cfg *Func) const {
  if (!BuildDefs::dump() || !Func->isVerbose(IceV_Folding))
    return;
  OstreamLocker L(Func->getContext());
  Ostream &Str = Func->getContext()->getStrDump();
  for (auto &I : Producers) {
    if (I.second.Instr == nullptr)
      continue;
    Str << "Found foldable producer:\n  ";
    I.second.Instr->dump(Func);
    Str << "\n";
  }
}

/// If the given instruction has potential memory side effects (e.g. store, rmw,
/// or a call instruction with potential memory side effects), then we must not
/// allow a pre-store Producer instruction with memory operands to be folded
/// into a post-store Consumer instruction.  If this is detected, the Producer
/// is invalidated.
///
/// We use the Producer's IsLiveOut field to determine whether any potential
/// Consumers come after this store instruction.  The IsLiveOut field is
/// initialized to true, and BoolFolding::init() sets IsLiveOut to false when it
/// sees the variable's definitive last use (indicating the variable is not in
/// the node's live-out set).  Thus if we see here that IsLiveOut is false, we
/// know that there can be no consumers after the store, and therefore we know
/// the folding is safe despite the store instruction.
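///
/// The hazard being avoided, as an illustrative sketch (not actual IR):
///   %c = icmp eq i32 [mem], 1   ; producer with a memory operand
///   store i32 2, [mem]          ; intervening memory write
///   br i1 %c, ...               ; folding the icmp down here would re-read
///                               ; [mem] after the store and observe the
///                               ; updated value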
template <typename Traits>
void BoolFolding<Traits>::invalidateProducersOnStore(const Inst *Instr) {
  if (!Instr->isMemoryWrite())
    return;
  for (auto &ProducerPair : Producers) {
    if (!ProducerPair.second.IsLiveOut)
      continue;
    Inst *PInst = ProducerPair.second.Instr;
    if (PInst == nullptr)
      continue;
    bool HasMemOperand = false;
    const SizeT SrcSize = PInst->getSrcSize();
    for (SizeT I = 0; I < SrcSize; ++I) {
      if (llvm::isa<typename Traits::X86OperandMem>(PInst->getSrc(I))) {
        HasMemOperand = true;
        break;
      }
    }
    if (!HasMemOperand)
      continue;
    setInvalid(ProducerPair.first);
  }
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::initNodeForLowering(CfgNode *Node) {
  FoldingInfo.init(Node);
  FoldingInfo.dump(Func);
}

template <typename TraitsType>
TargetX86Base<TraitsType>::TargetX86Base(Cfg *Func)
    : TargetLowering(Func), NeedSandboxing(SandboxingType == ST_NaCl) {
  static_assert(
      (Traits::InstructionSet::End - Traits::InstructionSet::Begin) ==
          (TargetInstructionSet::X86InstructionSet_End -
           TargetInstructionSet::X86InstructionSet_Begin),
      "Traits::InstructionSet range different from TargetInstructionSet");
  if (getFlags().getTargetInstructionSet() !=
      TargetInstructionSet::BaseInstructionSet) {
    InstructionSet = static_cast<InstructionSetEnum>(
        (getFlags().getTargetInstructionSet() -
         TargetInstructionSet::X86InstructionSet_Begin) +
        Traits::InstructionSet::Begin);
  }
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::staticInit(GlobalContext *Ctx) {
  RegNumT::setLimit(Traits::RegisterSet::Reg_NUM);
  Traits::initRegisterSet(getFlags(), &TypeToRegisterSet, &RegisterAliases);
  for (size_t i = 0; i < TypeToRegisterSet.size(); ++i)
    TypeToRegisterSetUnfiltered[i] = TypeToRegisterSet[i];
  filterTypeToRegisterSet(Ctx, Traits::RegisterSet::Reg_NUM,
                          TypeToRegisterSet.data(), TypeToRegisterSet.size(),
                          Traits::getRegName, getRegClassName);
  PcRelFixup = Traits::FK_PcRel;
  AbsFixup = getFlags().getUseNonsfi() ? Traits::FK_Gotoff : Traits::FK_Abs;
}

template <typename TraitsType>
bool TargetX86Base<TraitsType>::shouldBePooled(const Constant *C) {
  if (auto *ConstFloat = llvm::dyn_cast<ConstantFloat>(C)) {
    return !Utils::isPositiveZero(ConstFloat->getValue());
  }
  if (auto *ConstDouble = llvm::dyn_cast<ConstantDouble>(C)) {
    return !Utils::isPositiveZero(ConstDouble->getValue());
  }
  return false;
}

template <typename TraitsType>
::Ice::Type TargetX86Base<TraitsType>::getPointerType() {
  if (!Traits::Is64Bit ||
      ::Ice::getFlags().getApplicationBinaryInterface() == ::Ice::ABI_PNaCl) {
    return ::Ice::IceType_i32;
  }
  return ::Ice::IceType_i64;
}

template <typename TraitsType> void TargetX86Base<TraitsType>::translateO2() {
  TimerMarker T(TimerStack::TT_O2, Func);

  if (SandboxingType != ST_None) {
    initRebasePtr();
  }

  genTargetHelperCalls();
  Func->dump("After target helper call insertion");

  // Merge Alloca instructions, and lay out the stack.
  static constexpr bool SortAndCombineAllocas = true;
  Func->processAllocas(SortAndCombineAllocas);
  Func->dump("After Alloca processing");

  // Run this early so it can be used to focus optimizations on potentially hot
  // code.
  // TODO(stichnot,ascull): currently only used for regalloc, not for the
  // expensive high-level optimizations, which could be focused on potentially
  // hot code.
  Func->generateLoopInfo();
  Func->dump("After loop analysis");
  if (getFlags().getLoopInvariantCodeMotion()) {
    Func->loopInvariantCodeMotion();
    Func->dump("After LICM");
  }

  if (getFlags().getLocalCSE() != Ice::LCSE_Disabled) {
    Func->localCSE(getFlags().getLocalCSE() == Ice::LCSE_EnabledSSA);
    Func->dump("After Local CSE");
    Func->floatConstantCSE();
  }
  if (getFlags().getEnableShortCircuit()) {
    Func->shortCircuitJumps();
    Func->dump("After Short Circuiting");
  }

  if (!getFlags().getEnablePhiEdgeSplit()) {
    // Lower Phi instructions.
    Func->placePhiLoads();
    if (Func->hasError())
      return;
    Func->placePhiStores();
    if (Func->hasError())
      return;
    Func->deletePhis();
    if (Func->hasError())
      return;
    Func->dump("After Phi lowering");
  }

  // Address mode optimization.
  Func->getVMetadata()->init(VMK_SingleDefs);
  Func->doAddressOpt();
  Func->materializeVectorShuffles();

  // Find read-modify-write opportunities. Do this after address mode
  // optimization so that doAddressOpt() doesn't need to be applied to RMW
  // instructions as well.
  findRMW();
  Func->dump("After RMW transform");

  // Argument lowering
  Func->doArgLowering();

  // Target lowering. This requires liveness analysis for some parts of the
  // lowering decisions, such as compare/branch fusing. If non-lightweight
  // liveness analysis is used, the instructions need to be renumbered first.
  // TODO: This renumbering should only be necessary if we're actually
  // calculating live intervals, which we only do for register allocation.
  Func->renumberInstructions();
  if (Func->hasError())
    return;

  // TODO: It should be sufficient to use the fastest liveness calculation,
  // i.e. livenessLightweight(). However, for some reason that slows down the
  // rest of the translation. Investigate.
  Func->liveness(Liveness_Basic);
  if (Func->hasError())
    return;
  Func->dump("After x86 address mode opt");

  doLoadOpt();

  Func->genCode();
  if (Func->hasError())
    return;
  if (SandboxingType != ST_None) {
    initSandbox();
  }
  Func->dump("After x86 codegen");
  splitBlockLocalVariables(Func);

  // Register allocation. This requires instruction renumbering and full
  // liveness analysis. Loops must be identified before liveness so variable
  // use weights are correct.
  Func->renumberInstructions();
  if (Func->hasError())
    return;
  Func->liveness(Liveness_Intervals);
  if (Func->hasError())
    return;
  // The post-codegen dump is done here, after liveness analysis and associated
  // cleanup, to make the dump cleaner and more useful.
  Func->dump("After initial x86 codegen");
  // Validate the live range computations. The expensive validation call is
  // deliberately only made when assertions are enabled.
  assert(Func->validateLiveness());
  Func->getVMetadata()->init(VMK_All);
  regAlloc(RAK_Global);
  if (Func->hasError())
    return;
  Func->dump("After linear scan regalloc");

  if (getFlags().getEnablePhiEdgeSplit()) {
    Func->advancedPhiLowering();
    Func->dump("After advanced Phi lowering");
  }

  // Stack frame mapping.
  Func->genFrame();
  if (Func->hasError())
    return;
  Func->dump("After stack frame mapping");

  Func->contractEmptyNodes();
  Func->reorderNodes();

  // Branch optimization.  This needs to be done just before code emission. In
  // particular, no transformations that insert or reorder CfgNodes should be
  // done after branch optimization. We go ahead and do it before nop insertion
  // to reduce the amount of work needed for searching for opportunities.
  Func->doBranchOpt();
  Func->dump("After branch optimization");

  // Mark nodes that require sandbox alignment
  if (NeedSandboxing) {
    Func->markNodesForSandboxing();
  }
}

template <typename TraitsType> void TargetX86Base<TraitsType>::translateOm1() {
  TimerMarker T(TimerStack::TT_Om1, Func);

  if (SandboxingType != ST_None) {
    initRebasePtr();
  }

  genTargetHelperCalls();

  // Do not merge Alloca instructions, and lay out the stack.
  // static constexpr bool SortAndCombineAllocas = false;
  static constexpr bool SortAndCombineAllocas =
      true; // TODO(b/171222930): Fix Win32 bug when this is false
  Func->processAllocas(SortAndCombineAllocas);
  Func->dump("After Alloca processing");

  Func->placePhiLoads();
  if (Func->hasError())
    return;
  Func->placePhiStores();
  if (Func->hasError())
    return;
  Func->deletePhis();
  if (Func->hasError())
    return;
  Func->dump("After Phi lowering");

  Func->doArgLowering();
  Func->genCode();
  if (Func->hasError())
    return;
  if (SandboxingType != ST_None) {
    initSandbox();
  }
  Func->dump("After initial x86 codegen");

  regAlloc(RAK_InfOnly);
  if (Func->hasError())
    return;
  Func->dump("After regalloc of infinite-weight variables");

  Func->genFrame();
  if (Func->hasError())
    return;
  Func->dump("After stack frame mapping");

  // Mark nodes that require sandbox alignment
  if (NeedSandboxing)
    Func->markNodesForSandboxing();
}

inline bool canRMW(const InstArithmetic *Arith) {
  Type Ty = Arith->getDest()->getType();
  // X86 vector instructions write to a register and have no RMW option.
  if (isVectorType(Ty))
    return false;
  bool isI64 = Ty == IceType_i64;

  switch (Arith->getOp()) {
  // Not handled for lack of simple lowering:
  //   shift on i64
  //   mul, udiv, urem, sdiv, srem, frem
  // Not handled for lack of RMW instructions:
  //   fadd, fsub, fmul, fdiv (also vector types)
  default:
    return false;
  case InstArithmetic::Add:
  case InstArithmetic::Sub:
  case InstArithmetic::And:
  case InstArithmetic::Or:
  case InstArithmetic::Xor:
    return true;
  case InstArithmetic::Shl:
  case InstArithmetic::Lshr:
  case InstArithmetic::Ashr:
    return false; // TODO(stichnot): implement
    return !isI64;
  }
}
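
// When the load/op/store pattern found by findRMW() (below) passes canRMW(),
// lowering can use a single memory-destination instruction, e.g. (sketch):
//   add DWORD PTR [addr], other
// in place of separate load, add, and store instructions.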

template <typename TraitsType>
bool isSameMemAddressOperand(const Operand *A, const Operand *B) {
  if (A == B)
    return true;
  if (auto *MemA =
          llvm::dyn_cast<typename TargetX86Base<TraitsType>::X86OperandMem>(
              A)) {
    if (auto *MemB =
            llvm::dyn_cast<typename TargetX86Base<TraitsType>::X86OperandMem>(
                B)) {
      return MemA->getBase() == MemB->getBase() &&
             MemA->getOffset() == MemB->getOffset() &&
             MemA->getIndex() == MemB->getIndex() &&
             MemA->getShift() == MemB->getShift() &&
             MemA->getSegmentRegister() == MemB->getSegmentRegister();
    }
  }
  return false;
}

template <typename TraitsType> void TargetX86Base<TraitsType>::findRMW() {
  TimerMarker _(TimerStack::TT_findRMW, Func);
  Func->dump("Before RMW");
  if (Func->isVerbose(IceV_RMW))
    Func->getContext()->lockStr();
  for (CfgNode *Node : Func->getNodes()) {
    // Walk through the instructions, considering each sequence of 3
    // instructions, and look for the particular RMW pattern. Note that this
    // search can be "broken" (false negatives) if there are intervening
    // deleted instructions, or intervening instructions that could be safely
    // moved out of the way to reveal an RMW pattern.
    auto E = Node->getInsts().end();
    auto I1 = E, I2 = E, I3 = Node->getInsts().begin();
    for (; I3 != E; I1 = I2, I2 = I3, ++I3) {
      // Make I3 skip over deleted instructions.
      while (I3 != E && I3->isDeleted())
        ++I3;
      if (I1 == E || I2 == E || I3 == E)
        continue;
      assert(!I1->isDeleted());
      assert(!I2->isDeleted());
      assert(!I3->isDeleted());
      auto *Load = llvm::dyn_cast<InstLoad>(I1);
      auto *Arith = llvm::dyn_cast<InstArithmetic>(I2);
      auto *Store = llvm::dyn_cast<InstStore>(I3);
      if (!Load || !Arith || !Store)
        continue;
      // Look for:
      //   a = Load addr
      //   b = <op> a, other
      //   Store b, addr
      // Change to:
      //   a = Load addr
      //   b = <op> a, other
      //   x = FakeDef
      //   RMW <op>, addr, other, x
      //   b = Store b, addr, x
      // Note that inferTwoAddress() makes sure setDestRedefined() gets called
      // on the updated Store instruction, to avoid liveness problems later.
      //
      // With this transformation, the Store instruction acquires a Dest
      // variable and is now subject to dead code elimination if there are no
      // more uses of "b".  Variable "x" is a beacon for determining whether the
      // Store instruction gets dead-code eliminated.  If the Store instruction
      // is eliminated, then it must be the case that the RMW instruction ends
      // x's live range, and therefore the RMW instruction will be retained and
      // later lowered.  On the other hand, if the RMW instruction does not end
      // x's live range, then the Store instruction must still be present, and
      // therefore the RMW instruction is ignored during lowering because it is
      // redundant with the Store instruction.
      //
      // Note that if "a" has further uses, the RMW transformation may still
      // trigger, resulting in two loads and one store, which is worse than the
      // original one load and one store.  However, this is probably rare, and
      // caching probably keeps it just as fast.
      if (!isSameMemAddressOperand<TraitsType>(Load->getLoadAddress(),
                                               Store->getStoreAddress()))
        continue;
      Operand *ArithSrcFromLoad = Arith->getSrc(0);
      Operand *ArithSrcOther = Arith->getSrc(1);
      if (ArithSrcFromLoad != Load->getDest()) {
        if (!Arith->isCommutative() || ArithSrcOther != Load->getDest())
          continue;
        std::swap(ArithSrcFromLoad, ArithSrcOther);
      }
      if (Arith->getDest() != Store->getData())
        continue;
      if (!canRMW(Arith))
        continue;
      if (Func->isVerbose(IceV_RMW)) {
        Ostream &Str = Func->getContext()->getStrDump();
        Str << "Found RMW in " << Func->getFunctionName() << ":\n  ";
        Load->dump(Func);
        Str << "\n  ";
        Arith->dump(Func);
        Str << "\n  ";
        Store->dump(Func);
        Str << "\n";
      }
      Variable *Beacon = Func->makeVariable(IceType_i32);
      Beacon->setMustNotHaveReg();
      Store->setRmwBeacon(Beacon);
      auto *BeaconDef = InstFakeDef::create(Func, Beacon);
      Node->getInsts().insert(I3, BeaconDef);
      auto *RMW =
          InstX86FakeRMW::create(Func, ArithSrcOther, Store->getStoreAddress(),
                                 Beacon, Arith->getOp());
      Node->getInsts().insert(I3, RMW);
    }
  }
  if (Func->isVerbose(IceV_RMW))
    Func->getContext()->unlockStr();
}

// Converts a ConstantInteger32 operand into its constant value, or
// MemoryOrderInvalid if the operand is not a ConstantInteger32.
inline uint64_t getConstantMemoryOrder(Operand *Opnd) {
  if (auto *Integer = llvm::dyn_cast<ConstantInteger32>(Opnd))
    return Integer->getValue();
  return Intrinsics::MemoryOrderInvalid;
}

/// Determines whether the dest of a Load instruction can be folded into one of
/// the src operands of a 2-operand instruction. This is true as long as the
/// load dest matches exactly one of the binary instruction's src operands.
/// Replaces Src0 or Src1 with LoadSrc if the answer is true.
inline bool canFoldLoadIntoBinaryInst(Operand *LoadSrc, Variable *LoadDest,
                                      Operand *&Src0, Operand *&Src1) {
  if (Src0 == LoadDest && Src1 != LoadDest) {
    Src0 = LoadSrc;
    return true;
  }
  if (Src0 != LoadDest && Src1 == LoadDest) {
    Src1 = LoadSrc;
    return true;
  }
  return false;
}

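// For illustration, the effect of doLoadOpt() below (a sketch, not literal
// IR): given
//   a = load addr
//   c = add a, b
// where the add ends "a"'s live range, the pair is rewritten as
//   c = add [addr], b
// folding the load into the arithmetic instruction's source operand.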
template <typename TraitsType> void TargetX86Base<TraitsType>::doLoadOpt() {
  TimerMarker _(TimerStack::TT_loadOpt, Func);
  for (CfgNode *Node : Func->getNodes()) {
    Context.init(Node);
    while (!Context.atEnd()) {
      Variable *LoadDest = nullptr;
      Operand *LoadSrc = nullptr;
      Inst *CurInst = iteratorToInst(Context.getCur());
      Inst *Next = Context.getNextInst();
      // Determine whether the current instruction is a Load instruction or
      // equivalent.
      if (auto *Load = llvm::dyn_cast<InstLoad>(CurInst)) {
        // An InstLoad qualifies unless it uses a 64-bit absolute address,
        // which requires legalization to insert a copy to register.
        // TODO(b/148272103): Fold these after legalization.
        if (!Traits::Is64Bit || !llvm::isa<Constant>(Load->getLoadAddress())) {
          LoadDest = Load->getDest();
          constexpr bool DoLegalize = false;
          LoadSrc = formMemoryOperand(Load->getLoadAddress(),
                                      LoadDest->getType(), DoLegalize);
        }
      } else if (auto *Intrin = llvm::dyn_cast<InstIntrinsic>(CurInst)) {
        // An AtomicLoad intrinsic qualifies as long as it has a valid memory
        // ordering, and can be implemented in a single instruction (i.e., not
        // i64 on x86-32).
        Intrinsics::IntrinsicID ID = Intrin->getIntrinsicID();
        if (ID == Intrinsics::AtomicLoad &&
            (Traits::Is64Bit || Intrin->getDest()->getType() != IceType_i64) &&
            Intrinsics::isMemoryOrderValid(
                ID, getConstantMemoryOrder(Intrin->getArg(1)))) {
          LoadDest = Intrin->getDest();
          constexpr bool DoLegalize = false;
          LoadSrc = formMemoryOperand(Intrin->getArg(0), LoadDest->getType(),
                                      DoLegalize);
        }
      }
      // A Load instruction can be folded into the following instruction only
      // if the following instruction ends the Load's Dest variable's live
      // range.
      if (LoadDest && Next && Next->isLastUse(LoadDest)) {
        assert(LoadSrc);
        Inst *NewInst = nullptr;
        if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Next)) {
          Operand *Src0 = Arith->getSrc(0);
          Operand *Src1 = Arith->getSrc(1);
          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
            NewInst = InstArithmetic::create(Func, Arith->getOp(),
                                             Arith->getDest(), Src0, Src1);
          }
        } else if (auto *Icmp = llvm::dyn_cast<InstIcmp>(Next)) {
          Operand *Src0 = Icmp->getSrc(0);
          Operand *Src1 = Icmp->getSrc(1);
          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
            NewInst = InstIcmp::create(Func, Icmp->getCondition(),
                                       Icmp->getDest(), Src0, Src1);
          }
        } else if (auto *Fcmp = llvm::dyn_cast<InstFcmp>(Next)) {
          Operand *Src0 = Fcmp->getSrc(0);
          Operand *Src1 = Fcmp->getSrc(1);
          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
            NewInst = InstFcmp::create(Func, Fcmp->getCondition(),
                                       Fcmp->getDest(), Src0, Src1);
          }
        } else if (auto *Select = llvm::dyn_cast<InstSelect>(Next)) {
          Operand *Src0 = Select->getTrueOperand();
          Operand *Src1 = Select->getFalseOperand();
          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
            NewInst = InstSelect::create(Func, Select->getDest(),
                                         Select->getCondition(), Src0, Src1);
          }
        } else if (auto *Cast = llvm::dyn_cast<InstCast>(Next)) {
          // The load dest can always be folded into a Cast instruction.
          auto *Src0 = llvm::dyn_cast<Variable>(Cast->getSrc(0));
          if (Src0 == LoadDest) {
            NewInst = InstCast::create(Func, Cast->getCastKind(),
                                       Cast->getDest(), LoadSrc);
          }
        }
        if (NewInst) {
          CurInst->setDeleted();
          Next->setDeleted();
          Context.insert(NewInst);
          // Update NewInst->LiveRangesEnded so that target lowering may
          // benefit. Also update NewInst->HasSideEffects.
          NewInst->spliceLivenessInfo(Next, CurInst);
        }
      }
      Context.advanceCur();
      Context.advanceNext();
    }
  }
  Func->dump("After load optimization");
}

template <typename TraitsType>
bool TargetX86Base<TraitsType>::doBranchOpt(Inst *I, const CfgNode *NextNode) {
  if (auto *Br = llvm::dyn_cast<InstX86Br>(I)) {
    return Br->optimizeBranch(NextNode);
  }
  return false;
}

template <typename TraitsType>
Variable *TargetX86Base<TraitsType>::getPhysicalRegister(RegNumT RegNum,
                                                         Type Ty) {
  if (Ty == IceType_void)
    Ty = IceType_i32;
  if (PhysicalRegisters[Ty].empty())
    PhysicalRegisters[Ty].resize(Traits::RegisterSet::Reg_NUM);
  assert(unsigned(RegNum) < PhysicalRegisters[Ty].size());
  Variable *Reg = PhysicalRegisters[Ty][RegNum];
  if (Reg == nullptr) {
    Reg = Func->makeVariable(Ty);
    Reg->setRegNum(RegNum);
    PhysicalRegisters[Ty][RegNum] = Reg;
    // Specially mark a named physical register as an "argument" so that it is
    // considered live upon function entry.  Otherwise it's possible to get
    // liveness validation errors for saving callee-save registers.
    Func->addImplicitArg(Reg);
    // Don't bother tracking the live range of a named physical register.
    Reg->setIgnoreLiveness();
  }
  assert(Traits::getGprForType(Ty, RegNum) == RegNum);
  return Reg;
}

template <typename TraitsType>
const char *TargetX86Base<TraitsType>::getRegName(RegNumT RegNum,
                                                  Type Ty) const {
  return Traits::getRegName(Traits::getGprForType(Ty, RegNum));
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::emitVariable(const Variable *Var) const {
  if (!BuildDefs::dump())
    return;
  Ostream &Str = Ctx->getStrEmit();
  if (Var->hasReg()) {
    const bool Is64BitSandboxing = Traits::Is64Bit && NeedSandboxing;
    const Type VarType = (Var->isRematerializable() && Is64BitSandboxing)
                             ? IceType_i64
                             : Var->getType();
    Str << "%" << getRegName(Var->getRegNum(), VarType);
    return;
  }
  if (Var->mustHaveReg()) {
    llvm::report_fatal_error("Infinite-weight Variable (" + Var->getName() +
                             ") has no register assigned - function " +
                             Func->getFunctionName());
  }
  const int32_t Offset = Var->getStackOffset();
  auto BaseRegNum = Var->getBaseRegNum();
  if (BaseRegNum.hasNoValue())
    BaseRegNum = getFrameOrStackReg();

  // Print in the form "Offset(%reg)", omitting Offset when it is 0.
  if (getFlags().getDecorateAsm()) {
    Str << Var->getSymbolicStackOffset();
  } else if (Offset != 0) {
    Str << Offset;
  }
  const Type FrameSPTy = Traits::WordType;
  Str << "(%" << getRegName(BaseRegNum, FrameSPTy) << ")";
}

template <typename TraitsType>
typename TargetX86Base<TraitsType>::X86Address
TargetX86Base<TraitsType>::stackVarToAsmOperand(const Variable *Var) const {
  if (Var->hasReg())
    llvm::report_fatal_error("Stack Variable has a register assigned");
  if (Var->mustHaveReg()) {
    llvm::report_fatal_error("Infinite-weight Variable (" + Var->getName() +
                             ") has no register assigned - function " +
                             Func->getFunctionName());
  }
  int32_t Offset = Var->getStackOffset();
  auto BaseRegNum = Var->getBaseRegNum();
  if (Var->getBaseRegNum().hasNoValue()) {
    // If the stack pointer needs alignment, we must use the frame pointer for
    // arguments. For locals, getFrameOrStackReg will return the stack pointer
    // in this case.
    if (needsStackPointerAlignment() && Var->getIsArg()) {
      assert(hasFramePointer());
      BaseRegNum = getFrameReg();
    } else {
      BaseRegNum = getFrameOrStackReg();
    }
  }
  return X86Address(Traits::getEncodedGPR(BaseRegNum), Offset,
                    AssemblerFixup::NoFixup);
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::addProlog(CfgNode *Node) {
  // Stack frame layout:
  //
  // +------------------------+  ^ +
  // | 1. return address      |  |
  // +------------------------+  v -
  // | 2. preserved registers |
  // +------------------------+ <--- BasePointer (if used)
  // | 3. padding             |
  // +------------------------+
  // | 4. global spill area   |
  // +------------------------+
  // | 5. padding             |
  // +------------------------+
  // | 6. local spill area    |
  // +------------------------+
  // | 7. padding             |
  // +------------------------+
  // | 7.5 shadow (WinX64)    |
  // +------------------------+
  // | 8. allocas             |
  // +------------------------+
  // | 9. padding             |
  // +------------------------+
  // | 10. out args           |
  // +------------------------+ <--- StackPointer
  //
  // The following variables record the size in bytes of the given areas:
  //  * X86_RET_IP_SIZE_BYTES:   area 1
  //  * PreservedRegsSizeBytes:  area 2
  //  * SpillAreaPaddingBytes:   area 3
  //  * GlobalsSize:             area 4
  //  * LocalsSlotsPaddingBytes: area 5
  //  * GlobalsAndSubsequentPaddingSize: areas 4 - 5
  //  * LocalsSpillAreaSize:     area 6
  //  * FixedAllocaSizeBytes:    areas 7 - 8
  //  * SpillAreaSizeBytes:      areas 3 - 10
  //  * maxOutArgsSizeBytes():   areas 9 - 10

  // Determine stack frame offsets for each Variable without a register
  // assignment. This can be done as one variable per stack slot. Or, do
  // coalescing by running the register allocator again with an infinite set of
  // registers (as a side effect, this gives variables a second chance at
  // physical register assignment).
  //
  // A middle ground approach is to leverage sparsity and allocate one block of
  // space on the frame for globals (variables with multi-block lifetime), and
  // one block to share for locals (single-block lifetime).

  const SizeT ShadowStoreSize = getShadowStoreSize<Traits>();

  // StackPointer: points just past return address of calling function

  Context.init(Node);
  Context.setInsertPoint(Context.getCur());

  SmallBitVector CalleeSaves = getRegisterSet(RegSet_CalleeSave, RegSet_None);
  RegsUsed = SmallBitVector(CalleeSaves.size());
  VarList SortedSpilledVariables, VariablesLinkedToSpillSlots;
  size_t GlobalsSize = 0;
  // If there is a separate locals area, this represents that area. Otherwise
  // it counts any variable not counted by GlobalsSize.
  SpillAreaSizeBytes = 0;
  // If there is a separate locals area, this specifies the alignment for it.
  uint32_t LocalsSlotsAlignmentBytes = 0;
  // The entire spill locations area gets aligned to largest natural alignment
  // of the variables that have a spill slot.
  uint32_t SpillAreaAlignmentBytes = 0;
  // A spill slot linked to a variable with a stack slot should reuse that
  // stack slot.
  std::function<bool(Variable *)> TargetVarHook =
      [&VariablesLinkedToSpillSlots](Variable *Var) {
        // TODO(stichnot): Refactor this into the base class.
        Variable *Root = Var->getLinkedToStackRoot();
        if (Root != nullptr) {
          assert(!Root->hasReg());
          if (!Root->hasReg()) {
            VariablesLinkedToSpillSlots.push_back(Var);
            return true;
          }
        }
        return false;
      };

  // Compute the list of spilled variables and bounds for GlobalsSize, etc.
  getVarStackSlotParams(SortedSpilledVariables, RegsUsed, &GlobalsSize,
                        &SpillAreaSizeBytes, &SpillAreaAlignmentBytes,
                        &LocalsSlotsAlignmentBytes, TargetVarHook);
  uint32_t LocalsSpillAreaSize = SpillAreaSizeBytes;
  SpillAreaSizeBytes += GlobalsSize;

  // Add push instructions for preserved registers.
  uint32_t NumCallee = 0;
  size_t PreservedRegsSizeBytes = 0;
  SmallBitVector Pushed(CalleeSaves.size());
  for (RegNumT i : RegNumBVIter(CalleeSaves)) {
    const auto Canonical = Traits::getBaseReg(i);
    assert(Canonical == Traits::getBaseReg(Canonical));
    if (RegsUsed[i]) {
      Pushed[Canonical] = true;
    }
  }
  for (RegNumT RegNum : RegNumBVIter(Pushed)) {
    assert(RegNum == Traits::getBaseReg(RegNum));
    ++NumCallee;
    if (Traits::isXmm(RegNum)) {
      PreservedRegsSizeBytes += 16;
    } else {
      PreservedRegsSizeBytes += typeWidthInBytes(Traits::WordType);
    }
    _push_reg(RegNum);
  }
  Ctx->statsUpdateRegistersSaved(NumCallee);

  // StackPointer: points past preserved registers at start of spill area

  // Generate "push frameptr; mov frameptr, stackptr"
  if (IsEbpBasedFrame) {
    assert(
        (RegsUsed & getRegisterSet(RegSet_FramePointer, RegSet_None)).count() ==
        0);
    PreservedRegsSizeBytes += typeWidthInBytes(Traits::WordType);
    _link_bp();
  }

  // Align the variables area. SpillAreaPaddingBytes is the size of the region
  // after the preserved registers and before the spill areas.
  // LocalsSlotsPaddingBytes is the amount of padding between the globals and
  // locals area if they are separate.
  assert(LocalsSlotsAlignmentBytes <= SpillAreaAlignmentBytes);
  uint32_t SpillAreaPaddingBytes = 0;
  uint32_t LocalsSlotsPaddingBytes = 0;
  alignStackSpillAreas(Traits::X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes,
                       SpillAreaAlignmentBytes, GlobalsSize,
                       LocalsSlotsAlignmentBytes, &SpillAreaPaddingBytes,
                       &LocalsSlotsPaddingBytes);
  SpillAreaSizeBytes += SpillAreaPaddingBytes + LocalsSlotsPaddingBytes;
  uint32_t GlobalsAndSubsequentPaddingSize =
      GlobalsSize + LocalsSlotsPaddingBytes;

  // Functions returning scalar floating point types may need to convert values
  // from an in-register xmm value to the top of the x87 floating point stack.
  // This is done by a movp[sd] and an fld[sd].  Ensure there is enough scratch
  // space on the stack for this.
  const Type ReturnType = Func->getReturnType();
  if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) {
    if (isScalarFloatingType(ReturnType)) {
      // Avoid misaligned double-precision load/store.
      RequiredStackAlignment = std::max<size_t>(
          RequiredStackAlignment, Traits::X86_STACK_ALIGNMENT_BYTES);
      SpillAreaSizeBytes =
          std::max(typeWidthInBytesOnStack(ReturnType), SpillAreaSizeBytes);
    }
  }

  RequiredStackAlignment =
      std::max<size_t>(RequiredStackAlignment, SpillAreaAlignmentBytes);

  if (PrologEmitsFixedAllocas) {
    RequiredStackAlignment =
        std::max(RequiredStackAlignment, FixedAllocaAlignBytes);
  }

  // Combine fixed allocations into SpillAreaSizeBytes if we are emitting the
  // fixed allocations in the prolog.
  if (PrologEmitsFixedAllocas)
    SpillAreaSizeBytes += FixedAllocaSizeBytes;

  // Win64 ABI: add space for shadow store (aka home space)
  SpillAreaSizeBytes += ShadowStoreSize;

  // Entering the function has made the stack pointer unaligned. Re-align it by
  // adjusting the stack size.
  // Note that StackOffset does not include the spill area. It's the offset
  // from the base stack pointer (ebp), whether we set it or not, to the first
  // stack arg (if any). StackSize, on the other hand, does include the spill
  // area.
  const uint32_t StackOffset =
      ShadowStoreSize + Traits::X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes;
  uint32_t StackSize = Utils::applyAlignment(StackOffset + SpillAreaSizeBytes,
                                             RequiredStackAlignment);
  StackSize = Utils::applyAlignment(StackSize + maxOutArgsSizeBytes(),
                                    RequiredStackAlignment);
  SpillAreaSizeBytes = StackSize - StackOffset; // Adjust for alignment, if any
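
  // Worked example with hypothetical numbers: if StackOffset = 12,
  // SpillAreaSizeBytes = 20, maxOutArgsSizeBytes() = 8, and
  // RequiredStackAlignment = 16, the computation above gives
  //   StackSize = align(12 + 20, 16) = 32
  //   StackSize = align(32 + 8, 16)  = 48
  //   SpillAreaSizeBytes = 48 - 12   = 36
  // i.e. the 8 bytes of alignment padding are folded into SpillAreaSizeBytes
  // (36 = 20 spill + 8 out args + 8 padding).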

  if (SpillAreaSizeBytes) {
    auto *Func = Node->getCfg();
    if (SpillAreaSizeBytes > Func->getStackSizeLimit()) {
      Func->setError("Stack size limit exceeded");
    }

    emitStackProbe(SpillAreaSizeBytes);

    // Generate "sub stackptr, SpillAreaSizeBytes"
    _sub_sp(Ctx->getConstantInt32(SpillAreaSizeBytes));
  }

  // StackPointer: points just past the spill area (end of stack frame)

  // If the required alignment is greater than the stack pointer's guaranteed
  // alignment, align the stack pointer accordingly.
  if (RequiredStackAlignment > Traits::X86_STACK_ALIGNMENT_BYTES) {
    assert(IsEbpBasedFrame);
    _and(getPhysicalRegister(getStackReg(), Traits::WordType),
         Ctx->getConstantInt32(-RequiredStackAlignment));
  }

  // StackPointer: may have just been offset for alignment

  // Account for known-frame-offset alloca instructions that were not already
  // combined into the prolog.
  if (!PrologEmitsFixedAllocas)
    SpillAreaSizeBytes += FixedAllocaSizeBytes;

  Ctx->statsUpdateFrameBytes(SpillAreaSizeBytes);

  // Fill in stack offsets for stack args, and copy args into registers for
  // those that were register-allocated. Args are pushed right to left, so
  // Arg[0] is closest to the stack/frame pointer.
  RegNumT FrameOrStackReg = IsEbpBasedFrame ? getFrameReg() : getStackReg();
  Variable *FramePtr = getPhysicalRegister(FrameOrStackReg, Traits::WordType);
  size_t BasicFrameOffset = StackOffset;
  if (!IsEbpBasedFrame)
    BasicFrameOffset += SpillAreaSizeBytes;

  emitGetIP(Node);

  const VarList &Args = Func->getArgs();
  size_t InArgsSizeBytes = 0;
  unsigned NumXmmArgs = 0;
  unsigned NumGPRArgs = 0;
  for (SizeT i = 0, NumArgs = Args.size(); i < NumArgs; ++i) {
    Variable *Arg = Args[i];
    // Skip arguments passed in registers.
    if (isVectorType(Arg->getType())) {
      if (Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs))
              .hasValue()) {
        ++NumXmmArgs;
        continue;
      }
    } else if (isScalarFloatingType(Arg->getType())) {
      if (Traits::X86_PASS_SCALAR_FP_IN_XMM &&
          Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs))
              .hasValue()) {
        ++NumXmmArgs;
        continue;
      }
    } else {
      assert(isScalarIntegerType(Arg->getType()));
      if (Traits::getRegisterForGprArgNum(Traits::WordType,
                                          Traits::getArgIndex(i, NumGPRArgs))
              .hasValue()) {
        ++NumGPRArgs;
        continue;
      }
    }
    // For esp-based frames where the allocas are done outside the prolog, the
    // esp value may not stabilize to its home value until after all the
    // fixed-size alloca instructions have executed.  In this case, a stack
    // adjustment is needed when accessing in-args in order to copy them into
    // registers.
    size_t StackAdjBytes = 0;
    if (!IsEbpBasedFrame && !PrologEmitsFixedAllocas)
      StackAdjBytes -= FixedAllocaSizeBytes;
    finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, StackAdjBytes,
                           InArgsSizeBytes);
  }

  // Fill in stack offsets for locals.
  assignVarStackSlots(SortedSpilledVariables, SpillAreaPaddingBytes,
                      SpillAreaSizeBytes, GlobalsAndSubsequentPaddingSize,
                      IsEbpBasedFrame && !needsStackPointerAlignment());
  // Assign stack offsets to variables that have been linked to spilled
  // variables.
  for (Variable *Var : VariablesLinkedToSpillSlots) {
    const Variable *Root = Var->getLinkedToStackRoot();
    assert(Root != nullptr);
    Var->setStackOffset(Root->getStackOffset());

    // If the stack root variable is an arg, make this variable an arg too so
    // that stackVarToAsmOperand uses the correct base pointer (e.g. ebp on
    // x86).
    Var->setIsArg(Root->getIsArg());
  }
  this->HasComputedFrame = true;

  if (BuildDefs::dump() && Func->isVerbose(IceV_Frame)) {
    OstreamLocker L(Func->getContext());
    Ostream &Str = Func->getContext()->getStrDump();

    Str << "Stack layout:\n";
    uint32_t EspAdjustmentPaddingSize =
        SpillAreaSizeBytes - LocalsSpillAreaSize -
        GlobalsAndSubsequentPaddingSize - SpillAreaPaddingBytes -
        maxOutArgsSizeBytes();
    Str << " in-args = " << InArgsSizeBytes << " bytes\n"
        << " return address = " << Traits::X86_RET_IP_SIZE_BYTES << " bytes\n"
        << " preserved registers = " << PreservedRegsSizeBytes << " bytes\n"
        << " spill area padding = " << SpillAreaPaddingBytes << " bytes\n"
        << " globals spill area = " << GlobalsSize << " bytes\n"
        << " globals-locals spill areas intermediate padding = "
        << GlobalsAndSubsequentPaddingSize - GlobalsSize << " bytes\n"
        << " locals spill area = " << LocalsSpillAreaSize << " bytes\n"
        << " esp alignment padding = " << EspAdjustmentPaddingSize
        << " bytes\n";

    Str << "Stack details:\n"
        << " esp adjustment = " << SpillAreaSizeBytes << " bytes\n"
        << " spill area alignment = " << SpillAreaAlignmentBytes << " bytes\n"
        << " outgoing args size = " << maxOutArgsSizeBytes() << " bytes\n"
        << " locals spill area alignment = " << LocalsSlotsAlignmentBytes
        << " bytes\n"
        << " is ebp based = " << IsEbpBasedFrame << "\n";
  }
}

/// Helper function for addProlog().
///
/// This assumes Arg is an argument passed on the stack. This sets the frame
/// offset for Arg and updates InArgsSizeBytes according to Arg's width. For an
/// I64 arg that has been split into Lo and Hi components, it calls itself
/// recursively on the components, taking care to handle Lo first because of the
/// little-endian architecture. Lastly, this function generates an instruction
/// to copy Arg into its assigned register if applicable.
template <typename TraitsType>
void TargetX86Base<TraitsType>::finishArgumentLowering(
    Variable *Arg, Variable *FramePtr, size_t BasicFrameOffset,
    size_t StackAdjBytes, size_t &InArgsSizeBytes) {
  if (!Traits::Is64Bit) {
    if (auto *Arg64On32 = llvm::dyn_cast<Variable64On32>(Arg)) {
      Variable *Lo = Arg64On32->getLo();
      Variable *Hi = Arg64On32->getHi();
      finishArgumentLowering(Lo, FramePtr, BasicFrameOffset, StackAdjBytes,
                             InArgsSizeBytes);
      finishArgumentLowering(Hi, FramePtr, BasicFrameOffset, StackAdjBytes,
                             InArgsSizeBytes);
      return;
    }
  }
  Type Ty = Arg->getType();
  if (isVectorType(Ty)) {
    InArgsSizeBytes = Traits::applyStackAlignment(InArgsSizeBytes);
  }
  Arg->setStackOffset(BasicFrameOffset + InArgsSizeBytes);
  InArgsSizeBytes += typeWidthInBytesOnStack(Ty);
  if (Arg->hasReg()) {
    assert(Ty != IceType_i64 || Traits::Is64Bit);
    auto *Mem = X86OperandMem::create(
        Func, Ty, FramePtr,
        Ctx->getConstantInt32(Arg->getStackOffset() + StackAdjBytes));
    if (isVectorType(Arg->getType())) {
      _movp(Arg, Mem);
    } else {
      _mov(Arg, Mem);
    }
    // This argument-copying instruction uses an explicit X86OperandMem
    // operand instead of a Variable, so its fill-from-stack operation has to
    // be tracked separately for statistics.
    Ctx->statsUpdateFills();
  }
}
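
// For illustration of finishArgumentLowering() above (a sketch with made-up
// numbers): on x86-32 an i64 argument at stack offset 8 is processed as two
// i32 halves, Lo at offset 8 and then Hi at offset 12, matching the
// little-endian layout.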
1367 
1368 template <typename TraitsType>
1369 void TargetX86Base<TraitsType>::addEpilog(CfgNode *Node) {
1370   InstList &Insts = Node->getInsts();
1371   InstList::reverse_iterator RI, E;
1372   for (RI = Insts.rbegin(), E = Insts.rend(); RI != E; ++RI) {
1373     if (llvm::isa<typename Traits::Insts::Ret>(*RI))
1374       break;
1375   }
1376   if (RI == E)
1377     return;
1378 
1379   // Convert the reverse_iterator position into its corresponding (forward)
1380   // iterator position.
1381   InstList::iterator InsertPoint = reverseToForwardIterator(RI);
1382   --InsertPoint;
1383   Context.init(Node);
1384   Context.setInsertPoint(InsertPoint);
1385 
1386   if (IsEbpBasedFrame) {
1387     _unlink_bp();
1388   } else {
1389     // add stackptr, SpillAreaSizeBytes
1390     if (SpillAreaSizeBytes != 0) {
1391       _add_sp(Ctx->getConstantInt32(SpillAreaSizeBytes));
1392     }
1393   }
1394 
1395   // Add pop instructions for preserved registers.
1396   SmallBitVector CalleeSaves = getRegisterSet(RegSet_CalleeSave, RegSet_None);
1397   SmallBitVector Popped(CalleeSaves.size());
1398   for (int32_t i = CalleeSaves.size() - 1; i >= 0; --i) {
1399     const auto RegNum = RegNumT::fromInt(i);
1400     if (RegNum == getFrameReg() && IsEbpBasedFrame)
1401       continue;
1402     const RegNumT Canonical = Traits::getBaseReg(RegNum);
1403     if (CalleeSaves[i] && RegsUsed[i]) {
1404       Popped[Canonical] = true;
1405     }
1406   }
1407   for (int32_t i = Popped.size() - 1; i >= 0; --i) {
1408     if (!Popped[i])
1409       continue;
1410     const auto RegNum = RegNumT::fromInt(i);
1411     assert(RegNum == Traits::getBaseReg(RegNum));
1412     _pop_reg(RegNum);
1413   }
1414 
1415   if (!NeedSandboxing) {
1416     return;
1417   }
1418   emitSandboxedReturn();
1419   if (RI->getSrcSize()) {
1420     auto *RetValue = llvm::cast<Variable>(RI->getSrc(0));
1421     Context.insert<InstFakeUse>(RetValue);
1422   }
1423   RI->setDeleted();
1424 }
1425 
1426 template <typename TraitsType> Type TargetX86Base<TraitsType>::stackSlotType() {
1427   return Traits::WordType;
1428 }
1429 
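/// Returns the low-order 32 bits of a 64-bit operand on x86-32: the Lo half
/// of a split variable, the truncated low word of a constant, or an i32-typed
/// view of a memory operand.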
1430 template <typename TraitsType>
1431 template <typename T>
1432 typename std::enable_if<!T::Is64Bit, Operand>::type *
1433 TargetX86Base<TraitsType>::loOperand(Operand *Operand) {
1434   assert(Operand->getType() == IceType_i64 ||
1435          Operand->getType() == IceType_f64);
1436   if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
1437     return Operand;
1438   if (auto *Var64On32 = llvm::dyn_cast<Variable64On32>(Operand))
1439     return Var64On32->getLo();
1440   if (auto *Const = llvm::dyn_cast<ConstantInteger64>(Operand)) {
1441     auto *ConstInt = llvm::dyn_cast<ConstantInteger32>(
1442         Ctx->getConstantInt32(static_cast<int32_t>(Const->getValue())));
1443     // Check if we need to blind/pool the constant.
1444     return legalize(ConstInt);
1445   }
1446   if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Operand)) {
1447     auto *MemOperand = X86OperandMem::create(
1448         Func, IceType_i32, Mem->getBase(), Mem->getOffset(), Mem->getIndex(),
1449         Mem->getShift(), Mem->getSegmentRegister(), Mem->getIsRebased());
    // If the offset is eligible for randomization or pooling, blind/pool it
    // and create the mem operand with the blinded/pooled constant; otherwise,
    // return the mem operand unchanged.
1453     return legalize(MemOperand);
1454   }
1455   llvm_unreachable("Unsupported operand type");
1456   return nullptr;
1457 }
1458 
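/// Returns the high-order 32 bits of a 64-bit operand on x86-32: the Hi half
/// of a split variable, the upper word of a constant, or a memory operand
/// whose offset is bumped by 4.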
1459 template <typename TraitsType>
1460 template <typename T>
1461 typename std::enable_if<!T::Is64Bit, Operand>::type *
1462 TargetX86Base<TraitsType>::hiOperand(Operand *Operand) {
1463   assert(Operand->getType() == IceType_i64 ||
1464          Operand->getType() == IceType_f64);
1465   if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
1466     return Operand;
1467   if (auto *Var64On32 = llvm::dyn_cast<Variable64On32>(Operand))
1468     return Var64On32->getHi();
1469   if (auto *Const = llvm::dyn_cast<ConstantInteger64>(Operand)) {
1470     auto *ConstInt = llvm::dyn_cast<ConstantInteger32>(
1471         Ctx->getConstantInt32(static_cast<int32_t>(Const->getValue() >> 32)));
1472     // Check if we need to blind/pool the constant.
1473     return legalize(ConstInt);
1474   }
1475   if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Operand)) {
1476     Constant *Offset = Mem->getOffset();
1477     if (Offset == nullptr) {
1478       Offset = Ctx->getConstantInt32(4);
1479     } else if (auto *IntOffset = llvm::dyn_cast<ConstantInteger32>(Offset)) {
1480       Offset = Ctx->getConstantInt32(4 + IntOffset->getValue());
1481     } else if (auto *SymOffset = llvm::dyn_cast<ConstantRelocatable>(Offset)) {
1482       assert(!Utils::WouldOverflowAdd(SymOffset->getOffset(), 4));
1483       Offset =
1484           Ctx->getConstantSym(4 + SymOffset->getOffset(), SymOffset->getName());
1485     }
1486     auto *MemOperand = X86OperandMem::create(
1487         Func, IceType_i32, Mem->getBase(), Offset, Mem->getIndex(),
1488         Mem->getShift(), Mem->getSegmentRegister(), Mem->getIsRebased());
    // If the Offset is an i32 constant eligible for randomization or pooling,
    // blind/pool it; otherwise return the mem operand unchanged.
1492     return legalize(MemOperand);
1493   }
1494   llvm_unreachable("Unsupported operand type");
1495   return nullptr;
1496 }
1497 
1498 template <typename TraitsType>
1499 SmallBitVector
1500 TargetX86Base<TraitsType>::getRegisterSet(RegSetMask Include,
1501                                           RegSetMask Exclude) const {
1502   return Traits::getRegisterSet(getFlags(), Include, Exclude);
1503 }
1504 
1505 template <typename TraitsType>
1506 void TargetX86Base<TraitsType>::lowerAlloca(const InstAlloca *Instr) {
1507   // Conservatively require the stack to be aligned. Some stack adjustment
1508   // operations implemented below assume that the stack is aligned before the
1509   // alloca. All the alloca code ensures that the stack alignment is preserved
1510   // after the alloca. The stack alignment restriction can be relaxed in some
1511   // cases.
1512   RequiredStackAlignment = std::max<size_t>(RequiredStackAlignment,
1513                                             Traits::X86_STACK_ALIGNMENT_BYTES);
1514 
1515   // For default align=0, set it to the real value 1, to avoid any
1516   // bit-manipulation problems below.
1517   const uint32_t AlignmentParam = std::max(1u, Instr->getAlignInBytes());
1518 
1519   // LLVM enforces power of 2 alignment.
1520   assert(llvm::isPowerOf2_32(AlignmentParam));
1521   assert(llvm::isPowerOf2_32(Traits::X86_STACK_ALIGNMENT_BYTES));
1522 
1523   const uint32_t Alignment =
1524       std::max(AlignmentParam, Traits::X86_STACK_ALIGNMENT_BYTES);
1525   const bool OverAligned = Alignment > Traits::X86_STACK_ALIGNMENT_BYTES;
1526   const bool OptM1 = Func->getOptLevel() == Opt_m1;
1527   const bool AllocaWithKnownOffset = Instr->getKnownFrameOffset();
1528   const bool UseFramePointer =
1529       hasFramePointer() || OverAligned || !AllocaWithKnownOffset || OptM1;
1530 
1531   if (UseFramePointer)
1532     setHasFramePointer();
1533 
1534   Variable *esp = getPhysicalRegister(getStackReg(), Traits::WordType);
1535   if (OverAligned) {
1536     _and(esp, Ctx->getConstantInt32(-Alignment));
1537   }
1538 
1539   Variable *Dest = Instr->getDest();
1540   Operand *TotalSize = legalize(Instr->getSizeInBytes());
1541 
1542   if (const auto *ConstantTotalSize =
1543           llvm::dyn_cast<ConstantInteger32>(TotalSize)) {
1544     const uint32_t Value =
1545         Utils::applyAlignment(ConstantTotalSize->getValue(), Alignment);
1546     if (UseFramePointer) {
1547       _sub_sp(Ctx->getConstantInt32(Value));
1548     } else {
      // If we don't need a frame pointer, this alloca has a known offset from
      // the stack pointer. We don't need to adjust the stack pointer, nor
      // assign any value to Dest, as Dest is rematerializable.
1552       assert(Dest->isRematerializable());
1553       FixedAllocaSizeBytes += Value;
1554       Context.insert<InstFakeDef>(Dest);
1555     }
1556   } else {
1557     // Non-constant sizes need to be adjusted to the next highest multiple of
1558     // the required alignment at runtime.
1559     Variable *T = nullptr;
1560     if (Traits::Is64Bit && TotalSize->getType() != IceType_i64 &&
1561         !NeedSandboxing) {
1562       T = makeReg(IceType_i64);
1563       _movzx(T, TotalSize);
1564     } else {
1565       T = makeReg(IceType_i32);
1566       _mov(T, TotalSize);
1567     }
1568     _add(T, Ctx->getConstantInt32(Alignment - 1));
1569     _and(T, Ctx->getConstantInt32(-Alignment));
1570     _sub_sp(T);
1571   }
1572   // Add enough to the returned address to account for the out args area.
1573   uint32_t OutArgsSize = maxOutArgsSizeBytes();
1574   if (OutArgsSize > 0) {
1575     Variable *T = makeReg(Dest->getType());
1576     auto *CalculateOperand = X86OperandMem::create(
1577         Func, IceType_void, esp, Ctx->getConstantInt(IceType_i32, OutArgsSize));
1578     _lea(T, CalculateOperand);
1579     _mov(Dest, T);
1580   } else {
1581     _mov(Dest, esp);
1582   }
1583 }
1584 
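/// Replaces each register-passed argument with a pre-colored "home register"
/// variable and emits an assignment from it to the original argument, so that
/// the register allocator sees each incoming register explicitly.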
1585 template <typename TraitsType>
1586 void TargetX86Base<TraitsType>::lowerArguments() {
1587   const bool OptM1 = Func->getOptLevel() == Opt_m1;
1588   VarList &Args = Func->getArgs();
1589   unsigned NumXmmArgs = 0;
1590   bool XmmSlotsRemain = true;
1591   unsigned NumGprArgs = 0;
1592   bool GprSlotsRemain = true;
1593 
1594   Context.init(Func->getEntryNode());
1595   Context.setInsertPoint(Context.getCur());
1596 
1597   for (SizeT i = 0, End = Args.size();
1598        i < End && (XmmSlotsRemain || GprSlotsRemain); ++i) {
1599     Variable *Arg = Args[i];
1600     Type Ty = Arg->getType();
1601     Variable *RegisterArg = nullptr;
1602     RegNumT RegNum;
1603     if (isVectorType(Ty)) {
1604       RegNum =
1605           Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs));
1606       if (RegNum.hasNoValue()) {
1607         XmmSlotsRemain = false;
1608         continue;
1609       }
1610       ++NumXmmArgs;
1611       RegisterArg = Func->makeVariable(Ty);
1612     } else if (isScalarFloatingType(Ty)) {
1613       if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) {
1614         continue;
1615       }
1616       RegNum =
1617           Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs));
1618       if (RegNum.hasNoValue()) {
1619         XmmSlotsRemain = false;
1620         continue;
1621       }
1622       ++NumXmmArgs;
1623       RegisterArg = Func->makeVariable(Ty);
1624     } else if (isScalarIntegerType(Ty)) {
1625       RegNum = Traits::getRegisterForGprArgNum(
1626           Ty, Traits::getArgIndex(i, NumGprArgs));
1627       if (RegNum.hasNoValue()) {
1628         GprSlotsRemain = false;
1629         continue;
1630       }
1631       ++NumGprArgs;
1632       RegisterArg = Func->makeVariable(Ty);
1633     }
1634     assert(RegNum.hasValue());
1635     assert(RegisterArg != nullptr);
1636     // Replace Arg in the argument list with the home register. Then generate
1637     // an instruction in the prolog to copy the home register to the assigned
1638     // location of Arg.
1639     if (BuildDefs::dump())
1640       RegisterArg->setName(Func, "home_reg:" + Arg->getName());
1641     RegisterArg->setRegNum(RegNum);
1642     RegisterArg->setIsArg();
1643     Arg->setIsArg(false);
1644 
1645     Args[i] = RegisterArg;
1646     // When not Om1, do the assignment through a temporary, instead of directly
1647     // from the pre-colored variable, so that a subsequent availabilityGet()
1648     // call has a chance to work.  (In Om1, don't bother creating extra
1649     // instructions with extra variables to register-allocate.)
1650     if (OptM1) {
1651       Context.insert<InstAssign>(Arg, RegisterArg);
1652     } else {
1653       Variable *Tmp = makeReg(RegisterArg->getType());
1654       Context.insert<InstAssign>(Tmp, RegisterArg);
1655       Context.insert<InstAssign>(Arg, Tmp);
1656     }
1657   }
1658   if (!OptM1)
1659     Context.availabilityUpdate();
1660 }
1661 
1662 /// Strength-reduce scalar integer multiplication by a constant (for i32 or
1663 /// narrower) for certain constants. The lea instruction can be used to multiply
/// by 3, 5, or 9, and the shl instruction can be used to multiply by powers of
1665 /// 2. These can be combined such that e.g. multiplying by 100 can be done as 2
1666 /// lea-based multiplies by 5, combined with left-shifting by 2.
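///
/// For example, Dest = Src0 * 100 (100 = 5 * 5 * 4) becomes, with T starting
/// as a copy of Src0, roughly:
///   lea T, [T + T*4]  ; T *= 5
///   lea T, [T + T*4]  ; T *= 5
///   shl T, 2          ; T *= 4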
1667 template <typename TraitsType>
1668 bool TargetX86Base<TraitsType>::optimizeScalarMul(Variable *Dest, Operand *Src0,
1669                                                   int32_t Src1) {
1670   // Disable this optimization for Om1 and O0, just to keep things simple
1671   // there.
1672   if (Func->getOptLevel() < Opt_1)
1673     return false;
1674   Type Ty = Dest->getType();
1675   if (Src1 == -1) {
1676     Variable *T = nullptr;
1677     _mov(T, Src0);
1678     _neg(T);
1679     _mov(Dest, T);
1680     return true;
1681   }
1682   if (Src1 == 0) {
1683     _mov(Dest, Ctx->getConstantZero(Ty));
1684     return true;
1685   }
1686   if (Src1 == 1) {
1687     Variable *T = nullptr;
1688     _mov(T, Src0);
1689     _mov(Dest, T);
1690     return true;
1691   }
1692   // Don't bother with the edge case where Src1 == MININT.
1693   if (Src1 == -Src1)
1694     return false;
1695   const bool Src1IsNegative = Src1 < 0;
1696   if (Src1IsNegative)
1697     Src1 = -Src1;
1698   uint32_t Count9 = 0;
1699   uint32_t Count5 = 0;
1700   uint32_t Count3 = 0;
1701   uint32_t Count2 = 0;
1702   uint32_t CountOps = 0;
1703   while (Src1 > 1) {
1704     if (Src1 % 9 == 0) {
1705       ++CountOps;
1706       ++Count9;
1707       Src1 /= 9;
1708     } else if (Src1 % 5 == 0) {
1709       ++CountOps;
1710       ++Count5;
1711       Src1 /= 5;
1712     } else if (Src1 % 3 == 0) {
1713       ++CountOps;
1714       ++Count3;
1715       Src1 /= 3;
1716     } else if (Src1 % 2 == 0) {
1717       if (Count2 == 0)
1718         ++CountOps;
1719       ++Count2;
1720       Src1 /= 2;
1721     } else {
1722       return false;
1723     }
1724   }
  // The lea-based decomposition only applies to i32 (and i64 on 64-bit
  // targets), so reject narrower types whenever any lea steps are needed.
1726   if (Ty != IceType_i32 && !(Traits::Is64Bit && Ty == IceType_i64) &&
1727       (Count3 || Count5 || Count9))
1728     return false;
1729   // Limit the number of lea/shl operations for a single multiply, to a
1730   // somewhat arbitrary choice of 3.
1731   constexpr uint32_t MaxOpsForOptimizedMul = 3;
1732   if (CountOps > MaxOpsForOptimizedMul)
1733     return false;
1734   Variable *T = makeReg(Traits::WordType);
1735   if (typeWidthInBytes(Src0->getType()) < typeWidthInBytes(T->getType())) {
1736     Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
1737     _movzx(T, Src0RM);
1738   } else {
1739     _mov(T, Src0);
1740   }
1741   Constant *Zero = Ctx->getConstantZero(IceType_i32);
1742   for (uint32_t i = 0; i < Count9; ++i) {
1743     constexpr uint16_t Shift = 3; // log2(9-1)
1744     _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift));
1745   }
1746   for (uint32_t i = 0; i < Count5; ++i) {
1747     constexpr uint16_t Shift = 2; // log2(5-1)
1748     _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift));
1749   }
1750   for (uint32_t i = 0; i < Count3; ++i) {
1751     constexpr uint16_t Shift = 1; // log2(3-1)
1752     _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift));
1753   }
1754   if (Count2) {
1755     _shl(T, Ctx->getConstantInt(Ty, Count2));
1756   }
1757   if (Src1IsNegative)
1758     _neg(T);
1759   _mov(Dest, T);
1760   return true;
1761 }
1762 
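/// Lowers a 64-bit shift on x86-32 by operating on the Lo/Hi 32-bit halves.
/// Constant shift amounts are handled in three regimes (> 32, == 32, < 32);
/// non-constant amounts use shld/shrd plus a test of bit 0x20 of the count to
/// patch up the halves when the runtime amount is >= 32.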
1763 template <typename TraitsType>
1764 void TargetX86Base<TraitsType>::lowerShift64(InstArithmetic::OpKind Op,
1765                                              Operand *Src0Lo, Operand *Src0Hi,
1766                                              Operand *Src1Lo, Variable *DestLo,
1767                                              Variable *DestHi) {
1768   // TODO: Refactor the similarities between Shl, Lshr, and Ashr.
1769   Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
1770   Constant *Zero = Ctx->getConstantZero(IceType_i32);
1771   Constant *SignExtend = Ctx->getConstantInt32(0x1f);
1772   if (auto *ConstantShiftAmount = llvm::dyn_cast<ConstantInteger32>(Src1Lo)) {
1773     uint32_t ShiftAmount = ConstantShiftAmount->getValue();
1774     if (ShiftAmount > 32) {
1775       Constant *ReducedShift = Ctx->getConstantInt32(ShiftAmount - 32);
1776       switch (Op) {
1777       default:
1778         assert(0 && "non-shift op");
1779         break;
1780       case InstArithmetic::Shl: {
1781         // a=b<<c ==>
1782         //   t2 = b.lo
1783         //   t2 = shl t2, ShiftAmount-32
        //   a.hi = t2
        //   a.lo = 0
1786         _mov(T_2, Src0Lo);
1787         _shl(T_2, ReducedShift);
1788         _mov(DestHi, T_2);
1789         _mov(DestLo, Zero);
1790       } break;
1791       case InstArithmetic::Lshr: {
1792         // a=b>>c (unsigned) ==>
1793         //   t2 = b.hi
1794         //   t2 = shr t2, ShiftAmount-32
1795         //   a.lo = t2
1796         //   a.hi = 0
1797         _mov(T_2, Src0Hi);
1798         _shr(T_2, ReducedShift);
1799         _mov(DestLo, T_2);
1800         _mov(DestHi, Zero);
1801       } break;
1802       case InstArithmetic::Ashr: {
1803         // a=b>>c (signed) ==>
1804         //   t3 = b.hi
1805         //   t3 = sar t3, 0x1f
1806         //   t2 = b.hi
1807         //   t2 = shrd t2, t3, ShiftAmount-32
1808         //   a.lo = t2
1809         //   a.hi = t3
1810         _mov(T_3, Src0Hi);
1811         _sar(T_3, SignExtend);
1812         _mov(T_2, Src0Hi);
1813         _shrd(T_2, T_3, ReducedShift);
1814         _mov(DestLo, T_2);
1815         _mov(DestHi, T_3);
1816       } break;
1817       }
1818     } else if (ShiftAmount == 32) {
1819       switch (Op) {
1820       default:
1821         assert(0 && "non-shift op");
1822         break;
1823       case InstArithmetic::Shl: {
1824         // a=b<<c ==>
1825         //   t2 = b.lo
1826         //   a.hi = t2
1827         //   a.lo = 0
1828         _mov(T_2, Src0Lo);
1829         _mov(DestHi, T_2);
1830         _mov(DestLo, Zero);
1831       } break;
1832       case InstArithmetic::Lshr: {
1833         // a=b>>c (unsigned) ==>
1834         //   t2 = b.hi
1835         //   a.lo = t2
1836         //   a.hi = 0
1837         _mov(T_2, Src0Hi);
1838         _mov(DestLo, T_2);
1839         _mov(DestHi, Zero);
1840       } break;
1841       case InstArithmetic::Ashr: {
1842         // a=b>>c (signed) ==>
1843         //   t2 = b.hi
1844         //   a.lo = t2
1845         //   t3 = b.hi
1846         //   t3 = sar t3, 0x1f
1847         //   a.hi = t3
1848         _mov(T_2, Src0Hi);
1849         _mov(DestLo, T_2);
1850         _mov(T_3, Src0Hi);
1851         _sar(T_3, SignExtend);
1852         _mov(DestHi, T_3);
1853       } break;
1854       }
1855     } else {
1856       // COMMON PREFIX OF: a=b SHIFT_OP c ==>
1857       //   t2 = b.lo
1858       //   t3 = b.hi
1859       _mov(T_2, Src0Lo);
1860       _mov(T_3, Src0Hi);
1861       switch (Op) {
1862       default:
1863         assert(0 && "non-shift op");
1864         break;
1865       case InstArithmetic::Shl: {
1866         // a=b<<c ==>
1867         //   t3 = shld t3, t2, ShiftAmount
1868         //   t2 = shl t2, ShiftAmount
1869         _shld(T_3, T_2, ConstantShiftAmount);
1870         _shl(T_2, ConstantShiftAmount);
1871       } break;
1872       case InstArithmetic::Lshr: {
1873         // a=b>>c (unsigned) ==>
1874         //   t2 = shrd t2, t3, ShiftAmount
1875         //   t3 = shr t3, ShiftAmount
1876         _shrd(T_2, T_3, ConstantShiftAmount);
1877         _shr(T_3, ConstantShiftAmount);
1878       } break;
1879       case InstArithmetic::Ashr: {
1880         // a=b>>c (signed) ==>
1881         //   t2 = shrd t2, t3, ShiftAmount
1882         //   t3 = sar t3, ShiftAmount
1883         _shrd(T_2, T_3, ConstantShiftAmount);
1884         _sar(T_3, ConstantShiftAmount);
1885       } break;
1886       }
1887       // COMMON SUFFIX OF: a=b SHIFT_OP c ==>
1888       //   a.lo = t2
1889       //   a.hi = t3
1890       _mov(DestLo, T_2);
1891       _mov(DestHi, T_3);
1892     }
1893   } else {
1894     // NON-CONSTANT CASES.
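    // The 32-bit shl/shr/sar/shld/shrd instructions use only the low 5 bits
    // of the cl count, so bit 5 (0x20) is tested separately to detect a
    // logical shift amount >= 32 and fix up the halves accordingly.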
1895     Constant *BitTest = Ctx->getConstantInt32(0x20);
1896     InstX86Label *Label = InstX86Label::create(Func, this);
1897     // COMMON PREFIX OF: a=b SHIFT_OP c ==>
1898     //   t1:ecx = c.lo & 0xff
1899     //   t2 = b.lo
1900     //   t3 = b.hi
1901     T_1 = copyToReg8(Src1Lo, Traits::RegisterSet::Reg_cl);
1902     _mov(T_2, Src0Lo);
1903     _mov(T_3, Src0Hi);
1904     switch (Op) {
1905     default:
1906       assert(0 && "non-shift op");
1907       break;
1908     case InstArithmetic::Shl: {
1909       // a=b<<c ==>
1910       //   t3 = shld t3, t2, t1
1911       //   t2 = shl t2, t1
1912       //   test t1, 0x20
1913       //   je L1
1914       //   use(t3)
1915       //   t3 = t2
1916       //   t2 = 0
1917       _shld(T_3, T_2, T_1);
1918       _shl(T_2, T_1);
1919       _test(T_1, BitTest);
1920       _br(Traits::Cond::Br_e, Label);
1921       // T_2 and T_3 are being assigned again because of the intra-block control
1922       // flow, so we need to use _redefined to avoid liveness problems.
1923       _redefined(_mov(T_3, T_2));
1924       _redefined(_mov(T_2, Zero));
1925     } break;
1926     case InstArithmetic::Lshr: {
1927       // a=b>>c (unsigned) ==>
1928       //   t2 = shrd t2, t3, t1
1929       //   t3 = shr t3, t1
1930       //   test t1, 0x20
1931       //   je L1
1932       //   use(t2)
1933       //   t2 = t3
1934       //   t3 = 0
1935       _shrd(T_2, T_3, T_1);
1936       _shr(T_3, T_1);
1937       _test(T_1, BitTest);
1938       _br(Traits::Cond::Br_e, Label);
1939       // T_2 and T_3 are being assigned again because of the intra-block control
1940       // flow, so we need to use _redefined to avoid liveness problems.
1941       _redefined(_mov(T_2, T_3));
1942       _redefined(_mov(T_3, Zero));
1943     } break;
1944     case InstArithmetic::Ashr: {
1945       // a=b>>c (signed) ==>
1946       //   t2 = shrd t2, t3, t1
1947       //   t3 = sar t3, t1
1948       //   test t1, 0x20
1949       //   je L1
1950       //   use(t2)
1951       //   t2 = t3
1952       //   t3 = sar t3, 0x1f
1953       Constant *SignExtend = Ctx->getConstantInt32(0x1f);
1954       _shrd(T_2, T_3, T_1);
1955       _sar(T_3, T_1);
1956       _test(T_1, BitTest);
1957       _br(Traits::Cond::Br_e, Label);
1958       // T_2 and T_3 are being assigned again because of the intra-block control
1959       // flow, so T_2 needs to use _redefined to avoid liveness problems. T_3
1960       // doesn't need special treatment because it is reassigned via _sar
1961       // instead of _mov.
1962       _redefined(_mov(T_2, T_3));
1963       _sar(T_3, SignExtend);
1964     } break;
1965     }
1966     // COMMON SUFFIX OF: a=b SHIFT_OP c ==>
1967     // L1:
1968     //   a.lo = t2
1969     //   a.hi = t3
1970     Context.insert(Label);
1971     _mov(DestLo, T_2);
1972     _mov(DestHi, T_3);
1973   }
1974 }
1975 
1976 template <typename TraitsType>
1977 void TargetX86Base<TraitsType>::lowerArithmetic(const InstArithmetic *Instr) {
1978   Variable *Dest = Instr->getDest();
1979   if (Dest->isRematerializable()) {
1980     Context.insert<InstFakeDef>(Dest);
1981     return;
1982   }
1983   Type Ty = Dest->getType();
1984   Operand *Src0 = legalize(Instr->getSrc(0));
1985   Operand *Src1 = legalize(Instr->getSrc(1));
1986   if (Instr->isCommutative()) {
1987     uint32_t SwapCount = 0;
1988     if (!llvm::isa<Variable>(Src0) && llvm::isa<Variable>(Src1)) {
1989       std::swap(Src0, Src1);
1990       ++SwapCount;
1991     }
1992     if (llvm::isa<Constant>(Src0) && !llvm::isa<Constant>(Src1)) {
1993       std::swap(Src0, Src1);
1994       ++SwapCount;
1995     }
1996     // Improve two-address code patterns by avoiding a copy to the dest
1997     // register when one of the source operands ends its lifetime here.
1998     if (!Instr->isLastUse(Src0) && Instr->isLastUse(Src1)) {
1999       std::swap(Src0, Src1);
2000       ++SwapCount;
2001     }
2002     assert(SwapCount <= 1);
2003     (void)SwapCount;
2004   }
2005   if (!Traits::Is64Bit && Ty == IceType_i64) {
2006     // These x86-32 helper-call-involved instructions are lowered in this
2007     // separate switch. This is because loOperand() and hiOperand() may insert
2008     // redundant instructions for constant blinding and pooling. Such redundant
2009     // instructions will fail liveness analysis under -Om1 setting. And,
2010     // actually these arguments do not need to be processed with loOperand()
2011     // and hiOperand() to be used.
2012     switch (Instr->getOp()) {
2013     case InstArithmetic::Udiv:
2014     case InstArithmetic::Sdiv:
2015     case InstArithmetic::Urem:
2016     case InstArithmetic::Srem:
2017       llvm::report_fatal_error("Helper call was expected");
2018       return;
2019     default:
2020       break;
2021     }
2022 
2023     auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
2024     auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
2025     Operand *Src0Lo = loOperand(Src0);
2026     Operand *Src0Hi = hiOperand(Src0);
2027     Operand *Src1Lo = loOperand(Src1);
2028     Operand *Src1Hi = hiOperand(Src1);
2029     Variable *T_Lo = nullptr, *T_Hi = nullptr;
2030     switch (Instr->getOp()) {
2031     case InstArithmetic::_num:
2032       llvm_unreachable("Unknown arithmetic operator");
2033       break;
2034     case InstArithmetic::Add:
2035       _mov(T_Lo, Src0Lo);
2036       _add(T_Lo, Src1Lo);
2037       _mov(DestLo, T_Lo);
2038       _mov(T_Hi, Src0Hi);
2039       _adc(T_Hi, Src1Hi);
2040       _mov(DestHi, T_Hi);
2041       break;
2042     case InstArithmetic::And:
2043       _mov(T_Lo, Src0Lo);
2044       _and(T_Lo, Src1Lo);
2045       _mov(DestLo, T_Lo);
2046       _mov(T_Hi, Src0Hi);
2047       _and(T_Hi, Src1Hi);
2048       _mov(DestHi, T_Hi);
2049       break;
2050     case InstArithmetic::Or:
2051       _mov(T_Lo, Src0Lo);
2052       _or(T_Lo, Src1Lo);
2053       _mov(DestLo, T_Lo);
2054       _mov(T_Hi, Src0Hi);
2055       _or(T_Hi, Src1Hi);
2056       _mov(DestHi, T_Hi);
2057       break;
2058     case InstArithmetic::Xor:
2059       _mov(T_Lo, Src0Lo);
2060       _xor(T_Lo, Src1Lo);
2061       _mov(DestLo, T_Lo);
2062       _mov(T_Hi, Src0Hi);
2063       _xor(T_Hi, Src1Hi);
2064       _mov(DestHi, T_Hi);
2065       break;
2066     case InstArithmetic::Sub:
2067       _mov(T_Lo, Src0Lo);
2068       _sub(T_Lo, Src1Lo);
2069       _mov(DestLo, T_Lo);
2070       _mov(T_Hi, Src0Hi);
2071       _sbb(T_Hi, Src1Hi);
2072       _mov(DestHi, T_Hi);
2073       break;
2074     case InstArithmetic::Mul: {
2075       Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
2076       Variable *T_4Lo = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
2077       Variable *T_4Hi = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
2078       // gcc does the following:
2079       // a=b*c ==>
2080       //   t1 = b.hi; t1 *=(imul) c.lo
2081       //   t2 = c.hi; t2 *=(imul) b.lo
2082       //   t3:eax = b.lo
2083       //   t4.hi:edx,t4.lo:eax = t3:eax *(mul) c.lo
2084       //   a.lo = t4.lo
2085       //   t4.hi += t1
2086       //   t4.hi += t2
2087       //   a.hi = t4.hi
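      // This implements (b.hi*2^32 + b.lo) * (c.hi*2^32 + c.lo)
      //   == (b.hi*c.lo + b.lo*c.hi)*2^32 + b.lo*c.lo  (mod 2^64);
      // the b.hi*c.hi term is dropped because it overflows out of 64 bits.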
2088       // The mul instruction cannot take an immediate operand.
2089       Src1Lo = legalize(Src1Lo, Legal_Reg | Legal_Mem);
2090       _mov(T_1, Src0Hi);
2091       _imul(T_1, Src1Lo);
2092       _mov(T_3, Src0Lo, Traits::RegisterSet::Reg_eax);
2093       _mul(T_4Lo, T_3, Src1Lo);
2094       // The mul instruction produces two dest variables, edx:eax. We create a
2095       // fake definition of edx to account for this.
2096       Context.insert<InstFakeDef>(T_4Hi, T_4Lo);
2097       Context.insert<InstFakeUse>(T_4Hi);
2098       _mov(DestLo, T_4Lo);
2099       _add(T_4Hi, T_1);
2100       _mov(T_2, Src1Hi);
2101       Src0Lo = legalize(Src0Lo, Legal_Reg | Legal_Mem);
2102       _imul(T_2, Src0Lo);
2103       _add(T_4Hi, T_2);
2104       _mov(DestHi, T_4Hi);
2105     } break;
2106     case InstArithmetic::Shl:
2107     case InstArithmetic::Lshr:
2108     case InstArithmetic::Ashr:
2109       lowerShift64(Instr->getOp(), Src0Lo, Src0Hi, Src1Lo, DestLo, DestHi);
2110       break;
2111     case InstArithmetic::Fadd:
2112     case InstArithmetic::Fsub:
2113     case InstArithmetic::Fmul:
2114     case InstArithmetic::Fdiv:
2115     case InstArithmetic::Frem:
2116       llvm_unreachable("FP instruction with i64 type");
2117       break;
2118     case InstArithmetic::Udiv:
2119     case InstArithmetic::Sdiv:
2120     case InstArithmetic::Urem:
2121     case InstArithmetic::Srem:
      llvm_unreachable("Call-helper-involved instruction for i64 type "
                       "should have already been handled before");
2124       break;
2125     }
2126     return;
2127   }
2128   if (isVectorType(Ty)) {
2129     // TODO: Trap on integer divide and integer modulo by zero. See:
2130     // https://code.google.com/p/nativeclient/issues/detail?id=3899
2131     if (llvm::isa<X86OperandMem>(Src1))
2132       Src1 = legalizeToReg(Src1);
2133     switch (Instr->getOp()) {
2134     case InstArithmetic::_num:
2135       llvm_unreachable("Unknown arithmetic operator");
2136       break;
2137     case InstArithmetic::Add: {
2138       Variable *T = makeReg(Ty);
2139       _movp(T, Src0);
2140       _padd(T, Src1);
2141       _movp(Dest, T);
2142     } break;
2143     case InstArithmetic::And: {
2144       Variable *T = makeReg(Ty);
2145       _movp(T, Src0);
2146       _pand(T, Src1);
2147       _movp(Dest, T);
2148     } break;
2149     case InstArithmetic::Or: {
2150       Variable *T = makeReg(Ty);
2151       _movp(T, Src0);
2152       _por(T, Src1);
2153       _movp(Dest, T);
2154     } break;
2155     case InstArithmetic::Xor: {
2156       Variable *T = makeReg(Ty);
2157       _movp(T, Src0);
2158       _pxor(T, Src1);
2159       _movp(Dest, T);
2160     } break;
2161     case InstArithmetic::Sub: {
2162       Variable *T = makeReg(Ty);
2163       _movp(T, Src0);
2164       _psub(T, Src1);
2165       _movp(Dest, T);
2166     } break;
2167     case InstArithmetic::Mul: {
2168       bool TypesAreValidForPmull = Ty == IceType_v4i32 || Ty == IceType_v8i16;
2169       bool InstructionSetIsValidForPmull =
2170           Ty == IceType_v8i16 || InstructionSet >= Traits::SSE4_1;
2171       if (TypesAreValidForPmull && InstructionSetIsValidForPmull) {
2172         Variable *T = makeReg(Ty);
2173         _movp(T, Src0);
2174         _pmull(T, Src0 == Src1 ? T : Src1);
2175         _movp(Dest, T);
2176       } else if (Ty == IceType_v4i32) {
2177         // Lowering sequence:
2178         // Note: The mask arguments have index 0 on the left.
2179         //
2180         // movups  T1, Src0
2181         // pshufd  T2, Src0, {1,0,3,0}
2182         // pshufd  T3, Src1, {1,0,3,0}
2183         // # T1 = {Src0[0] * Src1[0], Src0[2] * Src1[2]}
2184         // pmuludq T1, Src1
2185         // # T2 = {Src0[1] * Src1[1], Src0[3] * Src1[3]}
2186         // pmuludq T2, T3
2187         // # T1 = {lo(T1[0]), lo(T1[2]), lo(T2[0]), lo(T2[2])}
2188         // shufps  T1, T2, {0,2,0,2}
2189         // pshufd  T4, T1, {0,2,1,3}
2190         // movups  Dest, T4
2191 
2192         // Mask that directs pshufd to create a vector with entries
2193         // Src[1, 0, 3, 0]
2194         constexpr unsigned Constant1030 = 0x31;
2195         Constant *Mask1030 = Ctx->getConstantInt32(Constant1030);
2196         // Mask that directs shufps to create a vector with entries
2197         // Dest[0, 2], Src[0, 2]
2198         constexpr unsigned Mask0202 = 0x88;
2199         // Mask that directs pshufd to create a vector with entries
2200         // Src[0, 2, 1, 3]
2201         constexpr unsigned Mask0213 = 0xd8;
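        // Each pshufd/shufps immediate packs four 2-bit element selectors,
        // least-significant field first: e.g. 0x31 == 0b00110001 selects
        // elements {1, 0, 3, 0}.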
2202         Variable *T1 = makeReg(IceType_v4i32);
2203         Variable *T2 = makeReg(IceType_v4i32);
2204         Variable *T3 = makeReg(IceType_v4i32);
2205         Variable *T4 = makeReg(IceType_v4i32);
2206         _movp(T1, Src0);
2207         _pshufd(T2, Src0, Mask1030);
2208         _pshufd(T3, Src1, Mask1030);
2209         _pmuludq(T1, Src1);
2210         _pmuludq(T2, T3);
2211         _shufps(T1, T2, Ctx->getConstantInt32(Mask0202));
2212         _pshufd(T4, T1, Ctx->getConstantInt32(Mask0213));
2213         _movp(Dest, T4);
2214       } else if (Ty == IceType_v16i8) {
2215         llvm::report_fatal_error("Scalarized operation was expected");
2216       } else {
2217         llvm::report_fatal_error("Invalid vector multiply type");
2218       }
2219     } break;
2220     case InstArithmetic::Shl: {
2221       assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized");
2222       Variable *T = makeReg(Ty);
2223       _movp(T, Src0);
2224       _psll(T, Src1);
2225       _movp(Dest, T);
2226     } break;
2227     case InstArithmetic::Lshr: {
2228       assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized");
2229       Variable *T = makeReg(Ty);
2230       _movp(T, Src0);
2231       _psrl(T, Src1);
2232       _movp(Dest, T);
2233     } break;
2234     case InstArithmetic::Ashr: {
2235       assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized");
2236       Variable *T = makeReg(Ty);
2237       _movp(T, Src0);
2238       _psra(T, Src1);
2239       _movp(Dest, T);
2240     } break;
2241     case InstArithmetic::Udiv:
2242     case InstArithmetic::Urem:
2243     case InstArithmetic::Sdiv:
2244     case InstArithmetic::Srem:
2245       llvm::report_fatal_error("Scalarized operation was expected");
2246       break;
2247     case InstArithmetic::Fadd: {
2248       Variable *T = makeReg(Ty);
2249       _movp(T, Src0);
2250       _addps(T, Src1);
2251       _movp(Dest, T);
2252     } break;
2253     case InstArithmetic::Fsub: {
2254       Variable *T = makeReg(Ty);
2255       _movp(T, Src0);
2256       _subps(T, Src1);
2257       _movp(Dest, T);
2258     } break;
2259     case InstArithmetic::Fmul: {
2260       Variable *T = makeReg(Ty);
2261       _movp(T, Src0);
2262       _mulps(T, Src0 == Src1 ? T : Src1);
2263       _movp(Dest, T);
2264     } break;
2265     case InstArithmetic::Fdiv: {
2266       Variable *T = makeReg(Ty);
2267       _movp(T, Src0);
2268       _divps(T, Src1);
2269       _movp(Dest, T);
2270     } break;
2271     case InstArithmetic::Frem:
2272       llvm::report_fatal_error("Scalarized operation was expected");
2273       break;
2274     }
2275     return;
2276   }
2277   Variable *T_edx = nullptr;
2278   Variable *T = nullptr;
2279   switch (Instr->getOp()) {
2280   case InstArithmetic::_num:
2281     llvm_unreachable("Unknown arithmetic operator");
2282     break;
2283   case InstArithmetic::Add: {
2284     const bool ValidType =
2285         Ty == IceType_i32 || (Ty == IceType_i64 && Traits::Is64Bit);
2286     auto *Const = llvm::dyn_cast<Constant>(Instr->getSrc(1));
2287     const bool ValidKind =
2288         Const != nullptr && (llvm::isa<ConstantInteger32>(Const) ||
2289                              llvm::isa<ConstantRelocatable>(Const));
2290     if (getFlags().getAggressiveLea() && ValidType && ValidKind) {
2291       auto *Var = legalizeToReg(Src0);
2292       auto *Mem = Traits::X86OperandMem::create(Func, IceType_void, Var, Const);
2293       T = makeReg(Ty);
2294       _lea(T, _sandbox_mem_reference(Mem));
2295       _mov(Dest, T);
2296       break;
2297     }
2298     _mov(T, Src0);
2299     _add(T, Src1);
2300     _mov(Dest, T);
2301   } break;
2302   case InstArithmetic::And:
2303     _mov(T, Src0);
2304     _and(T, Src1);
2305     _mov(Dest, T);
2306     break;
2307   case InstArithmetic::Or:
2308     _mov(T, Src0);
2309     _or(T, Src1);
2310     _mov(Dest, T);
2311     break;
2312   case InstArithmetic::Xor:
2313     _mov(T, Src0);
2314     _xor(T, Src1);
2315     _mov(Dest, T);
2316     break;
2317   case InstArithmetic::Sub:
2318     _mov(T, Src0);
2319     _sub(T, Src1);
2320     _mov(Dest, T);
2321     break;
2322   case InstArithmetic::Mul:
2323     if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
2324       if (optimizeScalarMul(Dest, Src0, C->getValue()))
2325         return;
2326     }
2327     // The 8-bit version of imul only allows the form "imul r/m8" where T must
2328     // be in al.
2329     if (isByteSizedArithType(Ty)) {
2330       _mov(T, Src0, Traits::RegisterSet::Reg_al);
2331       Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
2332       _imul(T, Src0 == Src1 ? T : Src1);
2333       _mov(Dest, T);
2334     } else if (auto *ImmConst = llvm::dyn_cast<ConstantInteger32>(Src1)) {
2335       T = makeReg(Ty);
2336       Src0 = legalize(Src0, Legal_Reg | Legal_Mem);
2337       _imul_imm(T, Src0, ImmConst);
2338       _mov(Dest, T);
2339     } else {
2340       _mov(T, Src0);
2341       // No need to legalize Src1 to Reg | Mem because the Imm case is handled
2342       // already by the ConstantInteger32 case above.
2343       _imul(T, Src0 == Src1 ? T : Src1);
2344       _mov(Dest, T);
2345     }
2346     break;
2347   case InstArithmetic::Shl:
2348     _mov(T, Src0);
2349     if (!llvm::isa<ConstantInteger32>(Src1) &&
2350         !llvm::isa<ConstantInteger64>(Src1))
2351       Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl);
2352     _shl(T, Src1);
2353     _mov(Dest, T);
2354     break;
2355   case InstArithmetic::Lshr:
2356     _mov(T, Src0);
2357     if (!llvm::isa<ConstantInteger32>(Src1) &&
2358         !llvm::isa<ConstantInteger64>(Src1))
2359       Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl);
2360     _shr(T, Src1);
2361     _mov(Dest, T);
2362     break;
2363   case InstArithmetic::Ashr:
2364     _mov(T, Src0);
2365     if (!llvm::isa<ConstantInteger32>(Src1) &&
2366         !llvm::isa<ConstantInteger64>(Src1))
2367       Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl);
2368     _sar(T, Src1);
2369     _mov(Dest, T);
2370     break;
2371   case InstArithmetic::Udiv: {
2372     // div and idiv are the few arithmetic operators that do not allow
2373     // immediates as the operand.
2374     Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
2375     RegNumT Eax;
2376     RegNumT Edx;
2377     switch (Ty) {
2378     default:
2379       llvm::report_fatal_error("Bad type for udiv");
2380     case IceType_i64:
2381       Eax = Traits::getRaxOrDie();
2382       Edx = Traits::getRdxOrDie();
2383       break;
2384     case IceType_i32:
2385       Eax = Traits::RegisterSet::Reg_eax;
2386       Edx = Traits::RegisterSet::Reg_edx;
2387       break;
2388     case IceType_i16:
2389       Eax = Traits::RegisterSet::Reg_ax;
2390       Edx = Traits::RegisterSet::Reg_dx;
2391       break;
2392     case IceType_i8:
2393       Eax = Traits::RegisterSet::Reg_al;
2394       Edx = Traits::RegisterSet::Reg_ah;
2395       break;
2396     }
2397     T_edx = makeReg(Ty, Edx);
2398     _mov(T, Src0, Eax);
2399     _mov(T_edx, Ctx->getConstantZero(Ty));
2400     _div(T_edx, Src1, T);
2401     _redefined(Context.insert<InstFakeDef>(T, T_edx));
2402     _mov(Dest, T);
2403   } break;
2404   case InstArithmetic::Sdiv:
2405     // TODO(stichnot): Enable this after doing better performance and cross
2406     // testing.
2407     if (false && Func->getOptLevel() >= Opt_1) {
2408       // Optimize division by constant power of 2, but not for Om1 or O0, just
2409       // to keep things simple there.
2410       if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
2411         const int32_t Divisor = C->getValue();
2412         const uint32_t UDivisor = Divisor;
2413         if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
2414           uint32_t LogDiv = llvm::Log2_32(UDivisor);
2415           // LLVM does the following for dest=src/(1<<log):
2416           //   t=src
2417           //   sar t,typewidth-1 // -1 if src is negative, 0 if not
2418           //   shr t,typewidth-log
2419           //   add t,src
2420           //   sar t,log
2421           //   dest=t
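          // E.g. for i32 src/8 (log=3): "sar t,31" yields 0 or -1,
          // "shr t,29" then yields 0 or 7, and adding that bias to src makes
          // the final "sar t,3" round toward zero instead of toward -inf.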
2422           uint32_t TypeWidth = Traits::X86_CHAR_BIT * typeWidthInBytes(Ty);
2423           _mov(T, Src0);
2424           // If for some reason we are dividing by 1, just treat it like an
2425           // assignment.
2426           if (LogDiv > 0) {
2427             // The initial sar is unnecessary when dividing by 2.
2428             if (LogDiv > 1)
2429               _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1));
2430             _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv));
2431             _add(T, Src0);
2432             _sar(T, Ctx->getConstantInt(Ty, LogDiv));
2433           }
2434           _mov(Dest, T);
2435           return;
2436         }
2437       }
2438     }
2439     Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
2440     switch (Ty) {
2441     default:
2442       llvm::report_fatal_error("Bad type for sdiv");
2443     case IceType_i64:
2444       T_edx = makeReg(Ty, Traits::getRdxOrDie());
2445       _mov(T, Src0, Traits::getRaxOrDie());
2446       break;
2447     case IceType_i32:
2448       T_edx = makeReg(Ty, Traits::RegisterSet::Reg_edx);
2449       _mov(T, Src0, Traits::RegisterSet::Reg_eax);
2450       break;
2451     case IceType_i16:
2452       T_edx = makeReg(Ty, Traits::RegisterSet::Reg_dx);
2453       _mov(T, Src0, Traits::RegisterSet::Reg_ax);
2454       break;
2455     case IceType_i8:
2456       T_edx = makeReg(IceType_i16, Traits::RegisterSet::Reg_ax);
2457       _mov(T, Src0, Traits::RegisterSet::Reg_al);
2458       break;
2459     }
2460     _cbwdq(T_edx, T);
2461     _idiv(T_edx, Src1, T);
2462     _redefined(Context.insert<InstFakeDef>(T, T_edx));
2463     _mov(Dest, T);
2464     break;
2465   case InstArithmetic::Urem: {
2466     Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
2467     RegNumT Eax;
2468     RegNumT Edx;
2469     switch (Ty) {
2470     default:
2471       llvm::report_fatal_error("Bad type for urem");
2472     case IceType_i64:
2473       Eax = Traits::getRaxOrDie();
2474       Edx = Traits::getRdxOrDie();
2475       break;
2476     case IceType_i32:
2477       Eax = Traits::RegisterSet::Reg_eax;
2478       Edx = Traits::RegisterSet::Reg_edx;
2479       break;
2480     case IceType_i16:
2481       Eax = Traits::RegisterSet::Reg_ax;
2482       Edx = Traits::RegisterSet::Reg_dx;
2483       break;
2484     case IceType_i8:
2485       Eax = Traits::RegisterSet::Reg_al;
2486       Edx = Traits::RegisterSet::Reg_ah;
2487       break;
2488     }
2489     T_edx = makeReg(Ty, Edx);
2490     _mov(T_edx, Ctx->getConstantZero(Ty));
2491     _mov(T, Src0, Eax);
2492     _div(T, Src1, T_edx);
2493     _redefined(Context.insert<InstFakeDef>(T_edx, T));
2494     if (Ty == IceType_i8) {
2495       // Register ah must be moved into one of {al,bl,cl,dl} before it can be
2496       // moved into a general 8-bit register.
2497       auto *T_AhRcvr = makeReg(Ty);
2498       T_AhRcvr->setRegClass(RCX86_IsAhRcvr);
2499       _mov(T_AhRcvr, T_edx);
2500       T_edx = T_AhRcvr;
2501     }
2502     _mov(Dest, T_edx);
2503   } break;
2504   case InstArithmetic::Srem: {
2505     // TODO(stichnot): Enable this after doing better performance and cross
2506     // testing.
2507     if (false && Func->getOptLevel() >= Opt_1) {
2508       // Optimize mod by constant power of 2, but not for Om1 or O0, just to
2509       // keep things simple there.
2510       if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
2511         const int32_t Divisor = C->getValue();
2512         const uint32_t UDivisor = Divisor;
2513         if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
2514           uint32_t LogDiv = llvm::Log2_32(UDivisor);
2515           // LLVM does the following for dest=src%(1<<log):
2516           //   t=src
2517           //   sar t,typewidth-1 // -1 if src is negative, 0 if not
2518           //   shr t,typewidth-log
2519           //   add t,src
2520           //   and t, -(1<<log)
2521           //   sub t,src
2522           //   neg t
2523           //   dest=t
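          // After the "and", t holds trunc(src/2^log)*2^log, so the sub/neg
          // pair produces src - t: the remainder, with the sign of src.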
2524           uint32_t TypeWidth = Traits::X86_CHAR_BIT * typeWidthInBytes(Ty);
2525           // If for some reason we are dividing by 1, just assign 0.
2526           if (LogDiv == 0) {
2527             _mov(Dest, Ctx->getConstantZero(Ty));
2528             return;
2529           }
2530           _mov(T, Src0);
2531           // The initial sar is unnecessary when dividing by 2.
2532           if (LogDiv > 1)
2533             _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1));
2534           _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv));
2535           _add(T, Src0);
2536           _and(T, Ctx->getConstantInt(Ty, -(1 << LogDiv)));
2537           _sub(T, Src0);
2538           _neg(T);
2539           _mov(Dest, T);
2540           return;
2541         }
2542       }
2543     }
2544     Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
2545     RegNumT Eax;
2546     RegNumT Edx;
2547     switch (Ty) {
2548     default:
2549       llvm::report_fatal_error("Bad type for srem");
2550     case IceType_i64:
2551       Eax = Traits::getRaxOrDie();
2552       Edx = Traits::getRdxOrDie();
2553       break;
2554     case IceType_i32:
2555       Eax = Traits::RegisterSet::Reg_eax;
2556       Edx = Traits::RegisterSet::Reg_edx;
2557       break;
2558     case IceType_i16:
2559       Eax = Traits::RegisterSet::Reg_ax;
2560       Edx = Traits::RegisterSet::Reg_dx;
2561       break;
2562     case IceType_i8:
2563       Eax = Traits::RegisterSet::Reg_al;
2564       Edx = Traits::RegisterSet::Reg_ah;
2565       break;
2566     }
2567     T_edx = makeReg(Ty, Edx);
2568     _mov(T, Src0, Eax);
2569     _cbwdq(T_edx, T);
2570     _idiv(T, Src1, T_edx);
2571     _redefined(Context.insert<InstFakeDef>(T_edx, T));
2572     if (Ty == IceType_i8) {
2573       // Register ah must be moved into one of {al,bl,cl,dl} before it can be
2574       // moved into a general 8-bit register.
2575       auto *T_AhRcvr = makeReg(Ty);
2576       T_AhRcvr->setRegClass(RCX86_IsAhRcvr);
2577       _mov(T_AhRcvr, T_edx);
2578       T_edx = T_AhRcvr;
2579     }
2580     _mov(Dest, T_edx);
2581   } break;
2582   case InstArithmetic::Fadd:
2583     _mov(T, Src0);
2584     _addss(T, Src1);
2585     _mov(Dest, T);
2586     break;
2587   case InstArithmetic::Fsub:
2588     _mov(T, Src0);
2589     _subss(T, Src1);
2590     _mov(Dest, T);
2591     break;
2592   case InstArithmetic::Fmul:
2593     _mov(T, Src0);
2594     _mulss(T, Src0 == Src1 ? T : Src1);
2595     _mov(Dest, T);
2596     break;
2597   case InstArithmetic::Fdiv:
2598     _mov(T, Src0);
2599     _divss(T, Src1);
2600     _mov(Dest, T);
2601     break;
2602   case InstArithmetic::Frem:
2603     llvm::report_fatal_error("Helper call was expected");
2604     break;
2605   }
2606 }
2607 
2608 template <typename TraitsType>
2609 void TargetX86Base<TraitsType>::lowerAssign(const InstAssign *Instr) {
2610   Variable *Dest = Instr->getDest();
2611   if (Dest->isRematerializable()) {
2612     Context.insert<InstFakeDef>(Dest);
2613     return;
2614   }
2615   Operand *Src = Instr->getSrc(0);
2616   assert(Dest->getType() == Src->getType());
2617   lowerMove(Dest, Src, false);
2618 }
2619 
2620 template <typename TraitsType>
2621 void TargetX86Base<TraitsType>::lowerBr(const InstBr *Br) {
2622   if (Br->isUnconditional()) {
2623     _br(Br->getTargetUnconditional());
2624     return;
2625   }
2626   Operand *Cond = Br->getCondition();
2627 
2628   // Handle folding opportunities.
2629   if (const Inst *Producer = FoldingInfo.getProducerFor(Cond)) {
2630     assert(Producer->isDeleted());
2631     switch (BoolFolding<Traits>::getProducerKind(Producer)) {
2632     default:
2633       break;
2634     case BoolFolding<Traits>::PK_Icmp32:
2635     case BoolFolding<Traits>::PK_Icmp64: {
2636       lowerIcmpAndConsumer(llvm::cast<InstIcmp>(Producer), Br);
2637       return;
2638     }
2639     case BoolFolding<Traits>::PK_Fcmp: {
2640       lowerFcmpAndConsumer(llvm::cast<InstFcmp>(Producer), Br);
2641       return;
2642     }
2643     case BoolFolding<Traits>::PK_Arith: {
2644       lowerArithAndConsumer(llvm::cast<InstArithmetic>(Producer), Br);
2645       return;
2646     }
2647     }
2648   }
2649   Operand *Src0 = legalize(Cond, Legal_Reg | Legal_Mem);
2650   Constant *Zero = Ctx->getConstantZero(IceType_i32);
2651   _cmp(Src0, Zero);
2652   _br(Traits::Cond::Br_ne, Br->getTargetTrue(), Br->getTargetFalse());
2653 }
2654 
2655 // constexprMax returns a (constexpr) max(S0, S1), and it is used for defining
// OperandList in lowerCall. std::max() would be the natural choice, but it is
// not constexpr until C++14, so it can't be used here.
2657 inline constexpr SizeT constexprMax(SizeT S0, SizeT S1) {
2658   return S0 < S1 ? S1 : S0;
2659 }
2660 
2661 template <typename TraitsType>
2662 void TargetX86Base<TraitsType>::lowerCall(const InstCall *Instr) {
2663   // Common x86 calling convention lowering:
2664   //
2665   // * At the point before the call, the stack must be aligned to 16 bytes.
2666   //
2667   // * Non-register arguments are pushed onto the stack in right-to-left order,
2668   // such that the left-most argument ends up on the top of the stack at the
2669   // lowest memory address.
2670   //
2671   // * Stack arguments of vector type are aligned to start at the next highest
2672   // multiple of 16 bytes. Other stack arguments are aligned to the next word
2673   // size boundary (4 or 8 bytes, respectively).
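  //
  // For example, with a 16-byte stack alignment and no shadow store, stack
  // arguments (i32 a, v4f32 b, i32 c) would land at out-args offsets +0, +16
  // (the vector is aligned up), and +32 respectively.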
2674   RequiredStackAlignment = std::max<size_t>(RequiredStackAlignment,
2675                                             Traits::X86_STACK_ALIGNMENT_BYTES);
2676 
2677   constexpr SizeT MaxOperands =
2678       constexprMax(Traits::X86_MAX_XMM_ARGS, Traits::X86_MAX_GPR_ARGS);
2679   using OperandList = llvm::SmallVector<Operand *, MaxOperands>;
2680 
2681   OperandList XmmArgs;
2682   llvm::SmallVector<SizeT, MaxOperands> XmmArgIndices;
2683   CfgVector<std::pair<const Type, Operand *>> GprArgs;
2684   CfgVector<SizeT> GprArgIndices;
2685   OperandList StackArgs, StackArgLocations;
2686   uint32_t ParameterAreaSizeBytes = 0;
2687 
2688   ParameterAreaSizeBytes += getShadowStoreSize<Traits>();
2689 
2690   // Classify each argument operand according to the location where the argument
2691   // is passed.
2692   for (SizeT i = 0, NumArgs = Instr->getNumArgs(); i < NumArgs; ++i) {
2693     Operand *Arg = Instr->getArg(i);
2694     const Type Ty = Arg->getType();
2695     // The PNaCl ABI requires the width of arguments to be at least 32 bits.
2696     assert(typeWidthInBytes(Ty) >= 4);
2697     if (isVectorType(Ty) &&
2698         Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, XmmArgs.size()))
2699             .hasValue()) {
2700       XmmArgs.push_back(Arg);
2701       XmmArgIndices.push_back(i);
2702     } else if (isScalarFloatingType(Ty) && Traits::X86_PASS_SCALAR_FP_IN_XMM &&
2703                Traits::getRegisterForXmmArgNum(
2704                    Traits::getArgIndex(i, XmmArgs.size()))
2705                    .hasValue()) {
2706       XmmArgs.push_back(Arg);
2707       XmmArgIndices.push_back(i);
2708     } else if (isScalarIntegerType(Ty) &&
2709                Traits::getRegisterForGprArgNum(
2710                    Ty, Traits::getArgIndex(i, GprArgs.size()))
2711                    .hasValue()) {
2712       GprArgs.emplace_back(Ty, Arg);
2713       GprArgIndices.push_back(i);
2714     } else {
2715       // Place on stack.
2716       StackArgs.push_back(Arg);
2717       if (isVectorType(Arg->getType())) {
2718         ParameterAreaSizeBytes =
2719             Traits::applyStackAlignment(ParameterAreaSizeBytes);
2720       }
2721       Variable *esp = getPhysicalRegister(getStackReg(), Traits::WordType);
2722       Constant *Loc = Ctx->getConstantInt32(ParameterAreaSizeBytes);
2723       StackArgLocations.push_back(
2724           Traits::X86OperandMem::create(Func, Ty, esp, Loc));
2725       ParameterAreaSizeBytes += typeWidthInBytesOnStack(Arg->getType());
2726     }
2727   }
2728   // Ensure there is enough space for the fstp/movs for floating returns.
2729   Variable *Dest = Instr->getDest();
2730   const Type DestTy = Dest ? Dest->getType() : IceType_void;
2731   if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) {
2732     if (isScalarFloatingType(DestTy)) {
2733       ParameterAreaSizeBytes =
2734           std::max(static_cast<size_t>(ParameterAreaSizeBytes),
2735                    typeWidthInBytesOnStack(DestTy));
2736     }
2737   }
2738   // Adjust the parameter area so that the stack is aligned. It is assumed that
2739   // the stack is already aligned at the start of the calling sequence.
2740   ParameterAreaSizeBytes = Traits::applyStackAlignment(ParameterAreaSizeBytes);
2741   assert(ParameterAreaSizeBytes <= maxOutArgsSizeBytes());
2742   // Copy arguments that are passed on the stack to the appropriate stack
2743   // locations.  We make sure legalize() is called on each argument at this
2744   // point, to allow availabilityGet() to work.
2745   for (SizeT i = 0, NumStackArgs = StackArgs.size(); i < NumStackArgs; ++i) {
2746     lowerStore(
2747         InstStore::create(Func, legalize(StackArgs[i]), StackArgLocations[i]));
2748   }
2749   // Copy arguments to be passed in registers to the appropriate registers.
2750   for (SizeT i = 0, NumXmmArgs = XmmArgs.size(); i < NumXmmArgs; ++i) {
2751     XmmArgs[i] = legalizeToReg(legalize(XmmArgs[i]),
2752                                Traits::getRegisterForXmmArgNum(
2753                                    Traits::getArgIndex(XmmArgIndices[i], i)));
2754   }
2755   // Materialize moves for arguments passed in GPRs.
2756   for (SizeT i = 0, NumGprArgs = GprArgs.size(); i < NumGprArgs; ++i) {
2757     const Type SignatureTy = GprArgs[i].first;
2758     Operand *Arg =
2759         legalize(GprArgs[i].second, Legal_Default | Legal_Rematerializable);
2760     GprArgs[i].second = legalizeToReg(
2761         Arg, Traits::getRegisterForGprArgNum(
2762                  Arg->getType(), Traits::getArgIndex(GprArgIndices[i], i)));
2763     assert(SignatureTy == IceType_i64 || SignatureTy == IceType_i32);
2764     assert(SignatureTy == Arg->getType());
2765     (void)SignatureTy;
2766   }
2767   // Generate a FakeUse of register arguments so that they do not get dead code
2768   // eliminated as a result of the FakeKill of scratch registers after the call.
2769   // These need to be right before the call instruction.
2770   for (auto *Arg : XmmArgs) {
2771     Context.insert<InstFakeUse>(llvm::cast<Variable>(Arg));
2772   }
2773   for (auto &ArgPair : GprArgs) {
2774     Context.insert<InstFakeUse>(llvm::cast<Variable>(ArgPair.second));
2775   }
2776   // Generate the call instruction. Assign its result to a temporary with high
2777   // register allocation weight.
2778   // ReturnReg doubles as ReturnRegLo as necessary.
2779   Variable *ReturnReg = nullptr;
2780   Variable *ReturnRegHi = nullptr;
2781   if (Dest) {
2782     switch (DestTy) {
2783     case IceType_NUM:
2784     case IceType_void:
2785     case IceType_i1:
2786     case IceType_i8:
2787     case IceType_i16:
2788       llvm::report_fatal_error("Invalid Call dest type");
2789       break;
2790     case IceType_i32:
2791       ReturnReg = makeReg(DestTy, Traits::RegisterSet::Reg_eax);
2792       break;
2793     case IceType_i64:
2794       if (Traits::Is64Bit) {
2795         ReturnReg = makeReg(IceType_i64, Traits::getRaxOrDie());
2796       } else {
2797         ReturnReg = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
2798         ReturnRegHi = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
2799       }
2800       break;
2801     case IceType_f32:
2802     case IceType_f64:
2803       if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) {
2804         // Leave ReturnReg==ReturnRegHi==nullptr, and capture the result with
2805         // the fstp instruction.
2806         break;
2807       }
2808     // Fallthrough intended.
2809     case IceType_v4i1:
2810     case IceType_v8i1:
2811     case IceType_v16i1:
2812     case IceType_v16i8:
2813     case IceType_v8i16:
2814     case IceType_v4i32:
2815     case IceType_v4f32:
2816       ReturnReg = makeReg(DestTy, Traits::RegisterSet::Reg_xmm0);
2817       break;
2818     }
2819   }
2820   // Emit the call to the function.
2821   Operand *CallTarget =
2822       legalize(Instr->getCallTarget(), Legal_Reg | Legal_Imm | Legal_AddrAbs);
2823   size_t NumVariadicFpArgs = Instr->isVariadic() ? XmmArgs.size() : 0;
2824   Inst *NewCall = emitCallToTarget(CallTarget, ReturnReg, NumVariadicFpArgs);
  // Keep the upper return register live on 32-bit platforms.
2826   if (ReturnRegHi)
2827     Context.insert<InstFakeDef>(ReturnRegHi);
2828   // Mark the call as killing all the caller-save registers.
2829   Context.insert<InstFakeKill>(NewCall);
2830   // Handle x86-32 floating point returns.
2831   if (Dest != nullptr && isScalarFloatingType(DestTy) &&
2832       !Traits::X86_PASS_SCALAR_FP_IN_XMM) {
2833     // Special treatment for an FP function which returns its result in st(0).
2834     // If Dest ends up being a physical xmm register, the fstp emit code will
2835     // route st(0) through the space reserved in the function argument area
2836     // we allocated.
2837     _fstp(Dest);
2838     // Create a fake use of Dest in case it actually isn't used, because st(0)
2839     // still needs to be popped.
2840     Context.insert<InstFakeUse>(Dest);
2841   }
2842   // Generate a FakeUse to keep the call live if necessary.
2843   if (Instr->hasSideEffects() && ReturnReg) {
2844     Context.insert<InstFakeUse>(ReturnReg);
2845   }
2846   // Process the return value, if any.
2847   if (Dest == nullptr)
2848     return;
2849   // Assign the result of the call to Dest.  Route it through a temporary so
2850   // that the local register availability peephole can be subsequently used.
2851   Variable *Tmp = nullptr;
2852   if (isVectorType(DestTy)) {
2853     assert(ReturnReg && "Vector type requires a return register");
2854     Tmp = makeReg(DestTy);
2855     _movp(Tmp, ReturnReg);
2856     _movp(Dest, Tmp);
2857   } else if (isScalarFloatingType(DestTy)) {
2858     if (Traits::X86_PASS_SCALAR_FP_IN_XMM) {
2859       assert(ReturnReg && "FP type requires a return register");
2860       _mov(Tmp, ReturnReg);
2861       _mov(Dest, Tmp);
2862     }
2863   } else {
2864     assert(isScalarIntegerType(DestTy));
2865     assert(ReturnReg && "Integer type requires a return register");
2866     if (DestTy == IceType_i64 && !Traits::Is64Bit) {
2867       assert(ReturnRegHi && "64-bit type requires two return registers");
2868       auto *Dest64On32 = llvm::cast<Variable64On32>(Dest);
2869       Variable *DestLo = Dest64On32->getLo();
2870       Variable *DestHi = Dest64On32->getHi();
2871       _mov(Tmp, ReturnReg);
2872       _mov(DestLo, Tmp);
2873       Variable *TmpHi = nullptr;
2874       _mov(TmpHi, ReturnRegHi);
2875       _mov(DestHi, TmpHi);
2876     } else {
2877       _mov(Tmp, ReturnReg);
2878       _mov(Dest, Tmp);
2879     }
2880   }
2881 }
2882 
2883 template <typename TraitsType>
2884 void TargetX86Base<TraitsType>::lowerCast(const InstCast *Instr) {
2885   // a = cast(b) ==> t=cast(b); a=t; (link t->b, link a->t, no overlap)
2886   InstCast::OpKind CastKind = Instr->getCastKind();
2887   Variable *Dest = Instr->getDest();
2888   Type DestTy = Dest->getType();
2889   switch (CastKind) {
2890   default:
2891     Func->setError("Cast type not supported");
2892     return;
2893   case InstCast::Sext: {
2894     // Src0RM is the source operand legalized to physical register or memory,
2895     // but not immediate, since the relevant x86 native instructions don't
2896     // allow an immediate operand. If the operand is an immediate, we could
2897     // consider computing the strength-reduced result at translation time, but
2898     // we're unlikely to see something like that in the bitcode that the
2899     // optimizer wouldn't have already taken care of.
2900     Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
2901     if (isVectorType(DestTy)) {
2902       if (DestTy == IceType_v16i8) {
2903         // onemask = materialize(1,1,...); dst = (src & onemask) > 0
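        // SSE2 has no per-byte shifts (no psllb/psrab), so the shift-based
        // sign extension used for wider elements below is not available for
        // v16i8. Instead, isolate bit 0 of each byte and compare it against
        // zero; pcmpgtb yields 0xff for true lanes and 0x00 for false lanes.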
2904         Variable *OneMask = makeVectorOfOnes(DestTy);
2905         Variable *T = makeReg(DestTy);
2906         _movp(T, Src0RM);
2907         _pand(T, OneMask);
2908         Variable *Zeros = makeVectorOfZeros(DestTy);
2909         _pcmpgt(T, Zeros);
2910         _movp(Dest, T);
2911       } else {
        // width = width(elty) - 1; dest = (src << width) >> width
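        // E.g., for v8i16 each element is 16 bits wide, so ShiftAmount is 15;
        // shifting left then arithmetic-shifting right by 15 replicates each
        // element's bit 0 (the i1 value) across the whole element.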
2913         SizeT ShiftAmount =
2914             Traits::X86_CHAR_BIT * typeWidthInBytes(typeElementType(DestTy)) -
2915             1;
2916         Constant *ShiftConstant = Ctx->getConstantInt8(ShiftAmount);
2917         Variable *T = makeReg(DestTy);
2918         _movp(T, Src0RM);
2919         _psll(T, ShiftConstant);
2920         _psra(T, ShiftConstant);
2921         _movp(Dest, T);
2922       }
2923     } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
2924       // t1=movsx src; t2=t1; t2=sar t2, 31; dst.lo=t1; dst.hi=t2
2925       Constant *Shift = Ctx->getConstantInt32(31);
2926       auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
2927       auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
2928       Variable *T_Lo = makeReg(DestLo->getType());
2929       if (Src0RM->getType() == IceType_i32) {
2930         _mov(T_Lo, Src0RM);
2931       } else if (Src0RM->getType() == IceType_i1) {
2932         _movzx(T_Lo, Src0RM);
2933         _shl(T_Lo, Shift);
2934         _sar(T_Lo, Shift);
2935       } else {
2936         _movsx(T_Lo, Src0RM);
2937       }
2938       _mov(DestLo, T_Lo);
2939       Variable *T_Hi = nullptr;
2940       _mov(T_Hi, T_Lo);
2941       if (Src0RM->getType() != IceType_i1)
2942         // For i1, the sar instruction is already done above.
2943         _sar(T_Hi, Shift);
2944       _mov(DestHi, T_Hi);
2945     } else if (Src0RM->getType() == IceType_i1) {
2946       // t1 = src
2947       // shl t1, dst_bitwidth - 1
2948       // sar t1, dst_bitwidth - 1
2949       // dst = t1
2950       size_t DestBits = Traits::X86_CHAR_BIT * typeWidthInBytes(DestTy);
2951       Constant *ShiftAmount = Ctx->getConstantInt32(DestBits - 1);
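      // E.g., sign-extending i1 to i32 shifts left by 31 and then
      // arithmetic-shifts right by 31, mapping 1 to 0xffffffff (-1) and 0 to
      // 0.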
2952       Variable *T = makeReg(DestTy);
2953       if (typeWidthInBytes(DestTy) <= typeWidthInBytes(Src0RM->getType())) {
2954         _mov(T, Src0RM);
2955       } else {
2956         // Widen the source using movsx or movzx. (It doesn't matter which one,
2957         // since the following shl/sar overwrite the bits.)
2958         _movzx(T, Src0RM);
2959       }
2960       _shl(T, ShiftAmount);
2961       _sar(T, ShiftAmount);
2962       _mov(Dest, T);
2963     } else {
2964       // t1 = movsx src; dst = t1
2965       Variable *T = makeReg(DestTy);
2966       _movsx(T, Src0RM);
2967       _mov(Dest, T);
2968     }
2969     break;
2970   }
2971   case InstCast::Zext: {
2972     Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
2973     if (isVectorType(DestTy)) {
2974       // onemask = materialize(1,1,...); dest = onemask & src
2975       Variable *OneMask = makeVectorOfOnes(DestTy);
2976       Variable *T = makeReg(DestTy);
2977       _movp(T, Src0RM);
2978       _pand(T, OneMask);
2979       _movp(Dest, T);
2980     } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
2981       // t1=movzx src; dst.lo=t1; dst.hi=0
2982       Constant *Zero = Ctx->getConstantZero(IceType_i32);
2983       auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
2984       auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
2985       Variable *Tmp = makeReg(DestLo->getType());
2986       if (Src0RM->getType() == IceType_i32) {
2987         _mov(Tmp, Src0RM);
2988       } else {
2989         _movzx(Tmp, Src0RM);
2990       }
2991       _mov(DestLo, Tmp);
2992       _mov(DestHi, Zero);
2993     } else if (Src0RM->getType() == IceType_i1) {
2994       // t = Src0RM; Dest = t
2995       Variable *T = nullptr;
2996       if (DestTy == IceType_i8) {
2997         _mov(T, Src0RM);
2998       } else {
2999         assert(DestTy != IceType_i1);
3000         assert(Traits::Is64Bit || DestTy != IceType_i64);
3001         // Use 32-bit for both 16-bit and 32-bit, since 32-bit ops are shorter.
        // In x86-64 we need to widen T to 64 bits to ensure that T, if
        // written to the stack (i.e., in -Om1), will be fully zero-extended.
3004         T = makeReg(DestTy == IceType_i64 ? IceType_i64 : IceType_i32);
3005         _movzx(T, Src0RM);
3006       }
3007       _mov(Dest, T);
3008     } else {
3009       // t1 = movzx src; dst = t1
3010       Variable *T = makeReg(DestTy);
3011       _movzx(T, Src0RM);
3012       _mov(Dest, T);
3013     }
3014     break;
3015   }
3016   case InstCast::Trunc: {
3017     if (isVectorType(DestTy)) {
3018       // onemask = materialize(1,1,...); dst = src & onemask
3019       Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
3020       Type Src0Ty = Src0RM->getType();
3021       Variable *OneMask = makeVectorOfOnes(Src0Ty);
3022       Variable *T = makeReg(DestTy);
3023       _movp(T, Src0RM);
3024       _pand(T, OneMask);
3025       _movp(Dest, T);
3026     } else if (DestTy == IceType_i1 || DestTy == IceType_i8) {
3027       // Make sure we truncate from and into valid registers.
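      // (copyToReg8 below picks a register with an addressable 8-bit
      // subregister; e.g., esi/edi on x86-32 have no 8-bit form.)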
3028       Operand *Src0 = legalizeUndef(Instr->getSrc(0));
3029       if (!Traits::Is64Bit && Src0->getType() == IceType_i64)
3030         Src0 = loOperand(Src0);
3031       Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3032       Variable *T = copyToReg8(Src0RM);
3033       if (DestTy == IceType_i1)
3034         _and(T, Ctx->getConstantInt1(1));
3035       _mov(Dest, T);
3036     } else {
3037       Operand *Src0 = legalizeUndef(Instr->getSrc(0));
3038       if (!Traits::Is64Bit && Src0->getType() == IceType_i64)
3039         Src0 = loOperand(Src0);
3040       Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3041       // t1 = trunc Src0RM; Dest = t1
3042       Variable *T = makeReg(DestTy);
3043       _mov(T, Src0RM);
3044       _mov(Dest, T);
3045     }
3046     break;
3047   }
3048   case InstCast::Fptrunc:
3049   case InstCast::Fpext: {
3050     Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
3051     // t1 = cvt Src0RM; Dest = t1
3052     Variable *T = makeReg(DestTy);
3053     _cvt(T, Src0RM, Traits::Insts::Cvt::Float2float);
3054     _mov(Dest, T);
3055     break;
3056   }
3057   case InstCast::Fptosi:
3058     if (isVectorType(DestTy)) {
3059       assert(DestTy == IceType_v4i32);
3060       assert(Instr->getSrc(0)->getType() == IceType_v4f32);
3061       Operand *Src0R = legalizeToReg(Instr->getSrc(0));
3062       Variable *T = makeReg(DestTy);
3063       _cvt(T, Src0R, Traits::Insts::Cvt::Tps2dq);
3064       _movp(Dest, T);
3065     } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
3066       llvm::report_fatal_error("Helper call was expected");
3067     } else {
3068       Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
3069       // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
3070       Variable *T_1 = nullptr;
3071       if (Traits::Is64Bit && DestTy == IceType_i64) {
3072         T_1 = makeReg(IceType_i64);
3073       } else {
3074         assert(DestTy != IceType_i64);
3075         T_1 = makeReg(IceType_i32);
3076       }
3077       // cvt() requires its integer argument to be a GPR.
3078       Variable *T_2 = makeReg(DestTy);
3079       if (isByteSizedType(DestTy)) {
3080         assert(T_1->getType() == IceType_i32);
3081         T_1->setRegClass(RCX86_Is32To8);
3082         T_2->setRegClass(RCX86_IsTrunc8Rcvr);
3083       }
3084       _cvt(T_1, Src0RM, Traits::Insts::Cvt::Tss2si);
3085       _mov(T_2, T_1); // T_1 and T_2 may have different integer types
3086       if (DestTy == IceType_i1)
3087         _and(T_2, Ctx->getConstantInt1(1));
3088       _mov(Dest, T_2);
3089     }
3090     break;
3091   case InstCast::Fptoui:
3092     if (isVectorType(DestTy)) {
3093       llvm::report_fatal_error("Helper call was expected");
3094     } else if (DestTy == IceType_i64 ||
3095                (!Traits::Is64Bit && DestTy == IceType_i32)) {
3096       llvm::report_fatal_error("Helper call was expected");
3097     } else {
3098       Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
3099       // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
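      // On x86-64, an unsigned i32 result is produced by converting into a
      // 64-bit GPR and truncating: every u32 value fits in the non-negative
      // range of an i64, so the signed (truncating) conversion is exact.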
3100       assert(DestTy != IceType_i64);
3101       Variable *T_1 = nullptr;
3102       if (Traits::Is64Bit && DestTy == IceType_i32) {
3103         T_1 = makeReg(IceType_i64);
3104       } else {
3105         assert(DestTy != IceType_i32);
3106         T_1 = makeReg(IceType_i32);
3107       }
3108       Variable *T_2 = makeReg(DestTy);
3109       if (isByteSizedType(DestTy)) {
3110         assert(T_1->getType() == IceType_i32);
3111         T_1->setRegClass(RCX86_Is32To8);
3112         T_2->setRegClass(RCX86_IsTrunc8Rcvr);
3113       }
3114       _cvt(T_1, Src0RM, Traits::Insts::Cvt::Tss2si);
3115       _mov(T_2, T_1); // T_1 and T_2 may have different integer types
3116       if (DestTy == IceType_i1)
3117         _and(T_2, Ctx->getConstantInt1(1));
3118       _mov(Dest, T_2);
3119     }
3120     break;
3121   case InstCast::Sitofp:
3122     if (isVectorType(DestTy)) {
3123       assert(DestTy == IceType_v4f32);
3124       assert(Instr->getSrc(0)->getType() == IceType_v4i32);
3125       Operand *Src0R = legalizeToReg(Instr->getSrc(0));
3126       Variable *T = makeReg(DestTy);
3127       _cvt(T, Src0R, Traits::Insts::Cvt::Dq2ps);
3128       _movp(Dest, T);
3129     } else if (!Traits::Is64Bit && Instr->getSrc(0)->getType() == IceType_i64) {
3130       llvm::report_fatal_error("Helper call was expected");
3131     } else {
3132       Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
3133       // Sign-extend the operand.
3134       // t1.i32 = movsx Src0RM; t2 = Cvt t1.i32; Dest = t2
3135       Variable *T_1 = nullptr;
3136       if (Traits::Is64Bit && Src0RM->getType() == IceType_i64) {
3137         T_1 = makeReg(IceType_i64);
3138       } else {
3139         assert(Src0RM->getType() != IceType_i64);
3140         T_1 = makeReg(IceType_i32);
3141       }
3142       Variable *T_2 = makeReg(DestTy);
3143       if (Src0RM->getType() == T_1->getType())
3144         _mov(T_1, Src0RM);
3145       else
3146         _movsx(T_1, Src0RM);
3147       _cvt(T_2, T_1, Traits::Insts::Cvt::Si2ss);
3148       _mov(Dest, T_2);
3149     }
3150     break;
3151   case InstCast::Uitofp: {
3152     Operand *Src0 = Instr->getSrc(0);
3153     if (isVectorType(Src0->getType())) {
3154       llvm::report_fatal_error("Helper call was expected");
3155     } else if (Src0->getType() == IceType_i64 ||
3156                (!Traits::Is64Bit && Src0->getType() == IceType_i32)) {
3157       llvm::report_fatal_error("Helper call was expected");
3158     } else {
3159       Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3160       // Zero-extend the operand.
3161       // t1.i32 = movzx Src0RM; t2 = Cvt t1.i32; Dest = t2
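      // The cvtsi2ss/cvtsi2sd family interprets its GPR operand as signed, so
      // the source is first zero-extended into a wider register where it is
      // guaranteed non-negative (i32 into i64 when targeting x86-64).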
3162       Variable *T_1 = nullptr;
3163       if (Traits::Is64Bit && Src0RM->getType() == IceType_i32) {
3164         T_1 = makeReg(IceType_i64);
3165       } else {
3166         assert(Src0RM->getType() != IceType_i64);
3167         assert(Traits::Is64Bit || Src0RM->getType() != IceType_i32);
3168         T_1 = makeReg(IceType_i32);
3169       }
3170       Variable *T_2 = makeReg(DestTy);
3171       if (Src0RM->getType() == T_1->getType())
3172         _mov(T_1, Src0RM);
3173       else
3174         _movzx(T_1, Src0RM)->setMustKeep();
3175       _cvt(T_2, T_1, Traits::Insts::Cvt::Si2ss);
3176       _mov(Dest, T_2);
3177     }
3178     break;
3179   }
3180   case InstCast::Bitcast: {
3181     Operand *Src0 = Instr->getSrc(0);
3182     if (DestTy == Src0->getType()) {
3183       auto *Assign = InstAssign::create(Func, Dest, Src0);
3184       lowerAssign(Assign);
3185       return;
3186     }
3187     switch (DestTy) {
3188     default:
3189       llvm_unreachable("Unexpected Bitcast dest type");
3190     case IceType_i8: {
3191       llvm::report_fatal_error("Helper call was expected");
3192     } break;
3193     case IceType_i16: {
3194       llvm::report_fatal_error("Helper call was expected");
3195     } break;
3196     case IceType_i32:
3197     case IceType_f32: {
3198       Variable *Src0R = legalizeToReg(Src0);
3199       Variable *T = makeReg(DestTy);
3200       _movd(T, Src0R);
3201       _mov(Dest, T);
3202     } break;
3203     case IceType_i64: {
3204       assert(Src0->getType() == IceType_f64);
3205       if (Traits::Is64Bit) {
3206         Variable *Src0R = legalizeToReg(Src0);
3207         Variable *T = makeReg(IceType_i64);
3208         _movd(T, Src0R);
3209         _mov(Dest, T);
3210       } else {
3211         Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3212         // a.i64 = bitcast b.f64 ==>
3213         //   s.f64 = spill b.f64
3214         //   t_lo.i32 = lo(s.f64)
3215         //   a_lo.i32 = t_lo.i32
3216         //   t_hi.i32 = hi(s.f64)
3217         //   a_hi.i32 = t_hi.i32
3218         Operand *SpillLo, *SpillHi;
3219         if (auto *Src0Var = llvm::dyn_cast<Variable>(Src0RM)) {
3220           Variable *Spill = Func->makeVariable(IceType_f64);
3221           Spill->setLinkedTo(Src0Var);
3222           Spill->setMustNotHaveReg();
3223           _movq(Spill, Src0RM);
3224           SpillLo = Traits::VariableSplit::create(Func, Spill,
3225                                                   Traits::VariableSplit::Low);
3226           SpillHi = Traits::VariableSplit::create(Func, Spill,
3227                                                   Traits::VariableSplit::High);
3228         } else {
3229           SpillLo = loOperand(Src0RM);
3230           SpillHi = hiOperand(Src0RM);
3231         }
3232 
3233         auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
3234         auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
3235         Variable *T_Lo = makeReg(IceType_i32);
3236         Variable *T_Hi = makeReg(IceType_i32);
3237 
3238         _mov(T_Lo, SpillLo);
3239         _mov(DestLo, T_Lo);
3240         _mov(T_Hi, SpillHi);
3241         _mov(DestHi, T_Hi);
3242       }
3243     } break;
3244     case IceType_f64: {
3245       assert(Src0->getType() == IceType_i64);
3246       if (Traits::Is64Bit) {
3247         Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3248         Variable *T = makeReg(IceType_f64);
3249         _movd(T, Src0RM);
3250         _mov(Dest, T);
3251       } else {
3252         Src0 = legalize(Src0);
3253         if (llvm::isa<X86OperandMem>(Src0)) {
3254           Variable *T = makeReg(DestTy);
3255           _movq(T, Src0);
3256           _movq(Dest, T);
3257           break;
3258         }
3259         // a.f64 = bitcast b.i64 ==>
3260         //   t_lo.i32 = b_lo.i32
3261         //   FakeDef(s.f64)
3262         //   lo(s.f64) = t_lo.i32
3263         //   t_hi.i32 = b_hi.i32
3264         //   hi(s.f64) = t_hi.i32
3265         //   a.f64 = s.f64
3266         Variable *Spill = Func->makeVariable(IceType_f64);
3267         Spill->setLinkedTo(Dest);
3268         Spill->setMustNotHaveReg();
3269 
3270         Variable *T_Lo = nullptr, *T_Hi = nullptr;
3271         auto *SpillLo = Traits::VariableSplit::create(
3272             Func, Spill, Traits::VariableSplit::Low);
3273         auto *SpillHi = Traits::VariableSplit::create(
3274             Func, Spill, Traits::VariableSplit::High);
3275         _mov(T_Lo, loOperand(Src0));
3276         // Technically, the Spill is defined after the _store happens, but
3277         // SpillLo is considered a "use" of Spill so define Spill before it is
3278         // used.
3279         Context.insert<InstFakeDef>(Spill);
3280         _store(T_Lo, SpillLo);
3281         _mov(T_Hi, hiOperand(Src0));
3282         _store(T_Hi, SpillHi);
3283         _movq(Dest, Spill);
3284       }
3285     } break;
3286     case IceType_v8i1: {
3287       llvm::report_fatal_error("Helper call was expected");
3288     } break;
3289     case IceType_v16i1: {
3290       llvm::report_fatal_error("Helper call was expected");
3291     } break;
3292     case IceType_v8i16:
3293     case IceType_v16i8:
3294     case IceType_v4i32:
3295     case IceType_v4f32: {
3296       if (Src0->getType() == IceType_i32) {
3297         // Bitcast requires equal type sizes, which isn't strictly the case
3298         // between scalars and vectors, but to emulate v4i8 vectors one has to
3299         // use v16i8 vectors.
3300         assert(getFlags().getApplicationBinaryInterface() != ABI_PNaCl &&
3301                "PNaCl only supports real 128-bit vectors");
3302         Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3303         Variable *T = makeReg(DestTy);
3304         _movd(T, Src0RM);
3305         _mov(Dest, T);
3306       } else {
3307         _movp(Dest, legalizeToReg(Src0));
3308       }
3309     } break;
3310     }
3311     break;
3312   }
3313   }
3314 }
3315 
3316 template <typename TraitsType>
3317 void TargetX86Base<TraitsType>::lowerExtractElement(
3318     const InstExtractElement *Instr) {
3319   Operand *SourceVectNotLegalized = Instr->getSrc(0);
3320   auto *ElementIndex = llvm::dyn_cast<ConstantInteger32>(Instr->getSrc(1));
3321   // Only constant indices are allowed in PNaCl IR.
3322   assert(ElementIndex);
3323 
3324   unsigned Index = ElementIndex->getValue();
3325   Type Ty = SourceVectNotLegalized->getType();
3326   Type ElementTy = typeElementType(Ty);
3327   Type InVectorElementTy = Traits::getInVectorElementType(Ty);
3328 
3329   // TODO(wala): Determine the best lowering sequences for each type.
3330   bool CanUsePextr = Ty == IceType_v8i16 || Ty == IceType_v8i1 ||
3331                      (InstructionSet >= Traits::SSE4_1 && Ty != IceType_v4f32);
3332   Variable *ExtractedElementR =
3333       makeReg(CanUsePextr ? IceType_i32 : InVectorElementTy);
3334   if (CanUsePextr) {
3335     // Use pextrb, pextrw, or pextrd.  The "b" and "w" versions clear the upper
3336     // bits of the destination register, so we represent this by always
3337     // extracting into an i32 register.  The _mov into Dest below will do
3338     // truncation as necessary.
3339     Constant *Mask = Ctx->getConstantInt32(Index);
3340     Variable *SourceVectR = legalizeToReg(SourceVectNotLegalized);
3341     _pextr(ExtractedElementR, SourceVectR, Mask);
3342   } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
3343     // Use pshufd and movd/movss.
3344     Variable *T = nullptr;
3345     if (Index) {
3346       // The shuffle only needs to occur if the element to be extracted is not
3347       // at the lowest index.
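      // E.g., extracting element 2 of a v4i32: pshufd with immediate 2 routes
      // element 2 into lane 0, and the movd/movss below then reads lane 0.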
3348       Constant *Mask = Ctx->getConstantInt32(Index);
3349       T = makeReg(Ty);
3350       _pshufd(T, legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem), Mask);
3351     } else {
3352       T = legalizeToReg(SourceVectNotLegalized);
3353     }
3354 
3355     if (InVectorElementTy == IceType_i32) {
3356       _movd(ExtractedElementR, T);
3357     } else { // Ty == IceType_f32
3358       // TODO(wala): _movss is only used here because _mov does not allow a
3359       // vector source and a scalar destination.  _mov should be able to be
3360       // used here.
3361       // _movss is a binary instruction, so the FakeDef is needed to keep the
3362       // live range analysis consistent.
3363       Context.insert<InstFakeDef>(ExtractedElementR);
3364       _movss(ExtractedElementR, T);
3365     }
3366   } else {
3367     assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
3368     // Spill the value to a stack slot and do the extraction in memory.
3369     //
3370     // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when support
3371     // for legalizing to mem is implemented.
3372     Variable *Slot = Func->makeVariable(Ty);
3373     Slot->setMustNotHaveReg();
3374     _movp(Slot, legalizeToReg(SourceVectNotLegalized));
3375 
3376     // Compute the location of the element in memory.
3377     unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);
3378     X86OperandMem *Loc =
3379         getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset);
3380     _mov(ExtractedElementR, Loc);
3381   }
3382 
3383   if (ElementTy == IceType_i1) {
3384     // Truncate extracted integers to i1s if necessary.
3385     Variable *T = makeReg(IceType_i1);
3386     InstCast *Cast =
3387         InstCast::create(Func, InstCast::Trunc, T, ExtractedElementR);
3388     lowerCast(Cast);
3389     ExtractedElementR = T;
3390   }
3391 
3392   // Copy the element to the destination.
3393   Variable *Dest = Instr->getDest();
3394   _mov(Dest, ExtractedElementR);
3395 }
3396 
3397 template <typename TraitsType>
3398 void TargetX86Base<TraitsType>::lowerFcmp(const InstFcmp *Fcmp) {
3399   Variable *Dest = Fcmp->getDest();
3400 
3401   if (isVectorType(Dest->getType())) {
3402     lowerFcmpVector(Fcmp);
3403   } else {
3404     constexpr Inst *Consumer = nullptr;
3405     lowerFcmpAndConsumer(Fcmp, Consumer);
3406   }
3407 }
3408 
3409 template <typename TraitsType>
3410 void TargetX86Base<TraitsType>::lowerFcmpAndConsumer(const InstFcmp *Fcmp,
3411                                                      const Inst *Consumer) {
3412   Operand *Src0 = Fcmp->getSrc(0);
3413   Operand *Src1 = Fcmp->getSrc(1);
3414   Variable *Dest = Fcmp->getDest();
3415 
3416   if (Consumer != nullptr) {
3417     if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
3418       if (lowerOptimizeFcmpSelect(Fcmp, Select))
3419         return;
3420     }
3421   }
3422 
3423   if (isVectorType(Dest->getType())) {
3424     lowerFcmp(Fcmp);
3425     if (Consumer != nullptr)
3426       lowerSelectVector(llvm::cast<InstSelect>(Consumer));
3427     return;
3428   }
3429 
3430   // Lowering a = fcmp cond, b, c
3431   //   ucomiss b, c       /* only if C1 != Br_None */
3432   //                      /* but swap b,c order if SwapOperands==true */
3433   //   mov a, <default>
3434   //   j<C1> label        /* only if C1 != Br_None */
3435   //   j<C2> label        /* only if C2 != Br_None */
3436   //   FakeUse(a)         /* only if C1 != Br_None */
3437   //   mov a, !<default>  /* only if C1 != Br_None */
3438   //   label:             /* only if C1 != Br_None */
3439   //
3440   // setcc lowering when C1 != Br_None && C2 == Br_None:
3441   //   ucomiss b, c       /* but swap b,c order if SwapOperands==true */
3442   //   setcc a, C1
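  // E.g., assuming the usual table entries, 'a = fcmp ogt b, c' has only C1
  // (no C2), so with no consumer it lowers to roughly: ucomiss b, c; seta a,
  // where "above" holds exactly when the operands are ordered and b > c.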
3443   InstFcmp::FCond Condition = Fcmp->getCondition();
3444   assert(static_cast<size_t>(Condition) < Traits::TableFcmpSize);
3445   if (Traits::TableFcmp[Condition].SwapScalarOperands)
3446     std::swap(Src0, Src1);
3447   const bool HasC1 = (Traits::TableFcmp[Condition].C1 != Traits::Cond::Br_None);
3448   const bool HasC2 = (Traits::TableFcmp[Condition].C2 != Traits::Cond::Br_None);
3449   if (HasC1) {
3450     Src0 = legalize(Src0);
3451     Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
3452     Variable *T = nullptr;
3453     _mov(T, Src0);
3454     _ucomiss(T, Src1RM);
3455     if (!HasC2) {
3456       assert(Traits::TableFcmp[Condition].Default);
3457       setccOrConsumer(Traits::TableFcmp[Condition].C1, Dest, Consumer);
3458       return;
3459     }
3460   }
3461   int32_t IntDefault = Traits::TableFcmp[Condition].Default;
3462   if (Consumer == nullptr) {
3463     Constant *Default = Ctx->getConstantInt(Dest->getType(), IntDefault);
3464     _mov(Dest, Default);
3465     if (HasC1) {
3466       InstX86Label *Label = InstX86Label::create(Func, this);
3467       _br(Traits::TableFcmp[Condition].C1, Label);
3468       if (HasC2) {
3469         _br(Traits::TableFcmp[Condition].C2, Label);
3470       }
3471       Constant *NonDefault = Ctx->getConstantInt(Dest->getType(), !IntDefault);
3472       _redefined(_mov(Dest, NonDefault));
3473       Context.insert(Label);
3474     }
3475     return;
3476   }
3477   if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
3478     CfgNode *TrueSucc = Br->getTargetTrue();
3479     CfgNode *FalseSucc = Br->getTargetFalse();
3480     if (IntDefault != 0)
3481       std::swap(TrueSucc, FalseSucc);
3482     if (HasC1) {
3483       _br(Traits::TableFcmp[Condition].C1, FalseSucc);
3484       if (HasC2) {
3485         _br(Traits::TableFcmp[Condition].C2, FalseSucc);
3486       }
3487       _br(TrueSucc);
3488       return;
3489     }
3490     _br(FalseSucc);
3491     return;
3492   }
3493   if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
3494     Operand *SrcT = Select->getTrueOperand();
3495     Operand *SrcF = Select->getFalseOperand();
3496     Variable *SelectDest = Select->getDest();
3497     if (IntDefault != 0)
3498       std::swap(SrcT, SrcF);
3499     lowerMove(SelectDest, SrcF, false);
3500     if (HasC1) {
3501       InstX86Label *Label = InstX86Label::create(Func, this);
3502       _br(Traits::TableFcmp[Condition].C1, Label);
3503       if (HasC2) {
3504         _br(Traits::TableFcmp[Condition].C2, Label);
3505       }
3506       static constexpr bool IsRedefinition = true;
3507       lowerMove(SelectDest, SrcT, IsRedefinition);
3508       Context.insert(Label);
3509     }
3510     return;
3511   }
3512   llvm::report_fatal_error("Unexpected consumer type");
3513 }
3514 
3515 template <typename TraitsType>
3516 void TargetX86Base<TraitsType>::lowerFcmpVector(const InstFcmp *Fcmp) {
3517   Operand *Src0 = Fcmp->getSrc(0);
3518   Operand *Src1 = Fcmp->getSrc(1);
3519   Variable *Dest = Fcmp->getDest();
3520 
3521   if (!isVectorType(Dest->getType()))
3522     llvm::report_fatal_error("Expected vector compare");
3523 
3524   InstFcmp::FCond Condition = Fcmp->getCondition();
3525   assert(static_cast<size_t>(Condition) < Traits::TableFcmpSize);
3526 
3527   if (Traits::TableFcmp[Condition].SwapVectorOperands)
3528     std::swap(Src0, Src1);
3529 
3530   Variable *T = nullptr;
3531 
3532   if (Condition == InstFcmp::True) {
    // makeVectorOfMinusOnes() requires an integer vector type.
3534     T = makeVectorOfMinusOnes(IceType_v4i32);
3535   } else if (Condition == InstFcmp::False) {
3536     T = makeVectorOfZeros(Dest->getType());
3537   } else {
3538     Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3539     Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
3540     if (llvm::isa<X86OperandMem>(Src1RM))
3541       Src1RM = legalizeToReg(Src1RM);
3542 
3543     switch (Condition) {
3544     default: {
3545       const CmppsCond Predicate = Traits::TableFcmp[Condition].Predicate;
3546       assert(Predicate != Traits::Cond::Cmpps_Invalid);
3547       T = makeReg(Src0RM->getType());
3548       _movp(T, Src0RM);
3549       _cmpps(T, Src1RM, Predicate);
3550     } break;
3551     case InstFcmp::One: {
      // Check that the operands are both unequal and ordered.
3553       T = makeReg(Src0RM->getType());
3554       Variable *T2 = makeReg(Src0RM->getType());
3555       _movp(T, Src0RM);
3556       _cmpps(T, Src1RM, Traits::Cond::Cmpps_neq);
3557       _movp(T2, Src0RM);
3558       _cmpps(T2, Src1RM, Traits::Cond::Cmpps_ord);
3559       _pand(T, T2);
3560     } break;
3561     case InstFcmp::Ueq: {
      // Check whether the operands are either equal or unordered.
3563       T = makeReg(Src0RM->getType());
3564       Variable *T2 = makeReg(Src0RM->getType());
3565       _movp(T, Src0RM);
3566       _cmpps(T, Src1RM, Traits::Cond::Cmpps_eq);
3567       _movp(T2, Src0RM);
3568       _cmpps(T2, Src1RM, Traits::Cond::Cmpps_unord);
3569       _por(T, T2);
3570     } break;
3571     }
3572   }
3573 
3574   assert(T != nullptr);
3575   _movp(Dest, T);
3576   eliminateNextVectorSextInstruction(Dest);
3577 }
3578 
3579 inline bool isZero(const Operand *Opnd) {
3580   if (auto *C64 = llvm::dyn_cast<ConstantInteger64>(Opnd))
3581     return C64->getValue() == 0;
3582   if (auto *C32 = llvm::dyn_cast<ConstantInteger32>(Opnd))
3583     return C32->getValue() == 0;
3584   return false;
3585 }
3586 
3587 template <typename TraitsType>
3588 void TargetX86Base<TraitsType>::lowerIcmpAndConsumer(const InstIcmp *Icmp,
3589                                                      const Inst *Consumer) {
3590   Operand *Src0 = legalize(Icmp->getSrc(0));
3591   Operand *Src1 = legalize(Icmp->getSrc(1));
3592   Variable *Dest = Icmp->getDest();
3593 
3594   if (isVectorType(Dest->getType())) {
3595     lowerIcmp(Icmp);
3596     if (Consumer != nullptr)
3597       lowerSelectVector(llvm::cast<InstSelect>(Consumer));
3598     return;
3599   }
3600 
3601   if (!Traits::Is64Bit && Src0->getType() == IceType_i64) {
3602     lowerIcmp64(Icmp, Consumer);
3603     return;
3604   }
3605 
3606   // cmp b, c
3607   if (isZero(Src1)) {
3608     switch (Icmp->getCondition()) {
3609     default:
3610       break;
3611     case InstIcmp::Uge:
3612       movOrConsumer(true, Dest, Consumer);
3613       return;
3614     case InstIcmp::Ult:
3615       movOrConsumer(false, Dest, Consumer);
3616       return;
3617     }
3618   }
3619   Operand *Src0RM = legalizeSrc0ForCmp(Src0, Src1);
3620   _cmp(Src0RM, Src1);
3621   setccOrConsumer(Traits::getIcmp32Mapping(Icmp->getCondition()), Dest,
3622                   Consumer);
3623 }
3624 
3625 template <typename TraitsType>
3626 void TargetX86Base<TraitsType>::lowerIcmpVector(const InstIcmp *Icmp) {
3627   Operand *Src0 = legalize(Icmp->getSrc(0));
3628   Operand *Src1 = legalize(Icmp->getSrc(1));
3629   Variable *Dest = Icmp->getDest();
3630 
3631   if (!isVectorType(Dest->getType()))
3632     llvm::report_fatal_error("Expected a vector compare");
3633 
3634   Type Ty = Src0->getType();
3635   // Promote i1 vectors to 128 bit integer vector types.
3636   if (typeElementType(Ty) == IceType_i1) {
3637     Type NewTy = IceType_NUM;
3638     switch (Ty) {
3639     default:
3640       llvm::report_fatal_error("unexpected type");
3641       break;
3642     case IceType_v4i1:
3643       NewTy = IceType_v4i32;
3644       break;
3645     case IceType_v8i1:
3646       NewTy = IceType_v8i16;
3647       break;
3648     case IceType_v16i1:
3649       NewTy = IceType_v16i8;
3650       break;
3651     }
3652     Variable *NewSrc0 = Func->makeVariable(NewTy);
3653     Variable *NewSrc1 = Func->makeVariable(NewTy);
3654     lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc0, Src0));
3655     lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc1, Src1));
3656     Src0 = NewSrc0;
3657     Src1 = NewSrc1;
3658     Ty = NewTy;
3659   }
3660 
3661   InstIcmp::ICond Condition = Icmp->getCondition();
3662 
3663   Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3664   Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
3665 
3666   // SSE2 only has signed comparison operations. Transform unsigned inputs in
3667   // a manner that allows for the use of signed comparison operations by
3668   // flipping the high order bits.
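  // E.g., for v4i32 the high order bits vector is {0x80000000, ...}; xoring
  // it in maps the unsigned range onto the signed one (0 becomes INT_MIN), so
  // pcmpgtd then computes the unsigned ordering.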
3669   if (Condition == InstIcmp::Ugt || Condition == InstIcmp::Uge ||
3670       Condition == InstIcmp::Ult || Condition == InstIcmp::Ule) {
3671     Variable *T0 = makeReg(Ty);
3672     Variable *T1 = makeReg(Ty);
3673     Variable *HighOrderBits = makeVectorOfHighOrderBits(Ty);
3674     _movp(T0, Src0RM);
3675     _pxor(T0, HighOrderBits);
3676     _movp(T1, Src1RM);
3677     _pxor(T1, HighOrderBits);
3678     Src0RM = T0;
3679     Src1RM = T1;
3680   }
3681 
3682   Variable *T = makeReg(Ty);
3683   switch (Condition) {
3684   default:
3685     llvm_unreachable("unexpected condition");
3686     break;
3687   case InstIcmp::Eq: {
3688     if (llvm::isa<X86OperandMem>(Src1RM))
3689       Src1RM = legalizeToReg(Src1RM);
3690     _movp(T, Src0RM);
3691     _pcmpeq(T, Src1RM);
3692   } break;
3693   case InstIcmp::Ne: {
3694     if (llvm::isa<X86OperandMem>(Src1RM))
3695       Src1RM = legalizeToReg(Src1RM);
3696     _movp(T, Src0RM);
3697     _pcmpeq(T, Src1RM);
3698     Variable *MinusOne = makeVectorOfMinusOnes(Ty);
3699     _pxor(T, MinusOne);
3700   } break;
3701   case InstIcmp::Ugt:
3702   case InstIcmp::Sgt: {
3703     if (llvm::isa<X86OperandMem>(Src1RM))
3704       Src1RM = legalizeToReg(Src1RM);
3705     _movp(T, Src0RM);
3706     _pcmpgt(T, Src1RM);
3707   } break;
3708   case InstIcmp::Uge:
3709   case InstIcmp::Sge: {
3710     // !(Src1RM > Src0RM)
3711     if (llvm::isa<X86OperandMem>(Src0RM))
3712       Src0RM = legalizeToReg(Src0RM);
3713     _movp(T, Src1RM);
3714     _pcmpgt(T, Src0RM);
3715     Variable *MinusOne = makeVectorOfMinusOnes(Ty);
3716     _pxor(T, MinusOne);
3717   } break;
3718   case InstIcmp::Ult:
3719   case InstIcmp::Slt: {
3720     if (llvm::isa<X86OperandMem>(Src0RM))
3721       Src0RM = legalizeToReg(Src0RM);
3722     _movp(T, Src1RM);
3723     _pcmpgt(T, Src0RM);
3724   } break;
3725   case InstIcmp::Ule:
3726   case InstIcmp::Sle: {
3727     // !(Src0RM > Src1RM)
3728     if (llvm::isa<X86OperandMem>(Src1RM))
3729       Src1RM = legalizeToReg(Src1RM);
3730     _movp(T, Src0RM);
3731     _pcmpgt(T, Src1RM);
3732     Variable *MinusOne = makeVectorOfMinusOnes(Ty);
3733     _pxor(T, MinusOne);
3734   } break;
3735   }
3736 
3737   _movp(Dest, T);
3738   eliminateNextVectorSextInstruction(Dest);
3739 }
3740 
3741 template <typename TraitsType>
3742 template <typename T>
3743 typename std::enable_if<!T::Is64Bit, void>::type
3744 TargetX86Base<TraitsType>::lowerIcmp64(const InstIcmp *Icmp,
3745                                        const Inst *Consumer) {
3746   // a=icmp cond, b, c ==> cmp b,c; a=1; br cond,L1; FakeUse(a); a=0; L1:
3747   Operand *Src0 = legalize(Icmp->getSrc(0));
3748   Operand *Src1 = legalize(Icmp->getSrc(1));
3749   Variable *Dest = Icmp->getDest();
3750   InstIcmp::ICond Condition = Icmp->getCondition();
3751   assert(static_cast<size_t>(Condition) < Traits::TableIcmp64Size);
3752   Operand *Src0LoRM = nullptr;
3753   Operand *Src0HiRM = nullptr;
3754   // Legalize the portions of Src0 that are going to be needed.
3755   if (isZero(Src1)) {
3756     switch (Condition) {
3757     default:
3758       llvm_unreachable("unexpected condition");
3759       break;
3760     // These two are not optimized, so we fall through to the general case,
3761     // which needs the upper and lower halves legalized.
3762     case InstIcmp::Sgt:
3763     case InstIcmp::Sle:
3764     // These four compare after performing an "or" of the high and low half, so
3765     // they need the upper and lower halves legalized.
3766     case InstIcmp::Eq:
3767     case InstIcmp::Ule:
3768     case InstIcmp::Ne:
3769     case InstIcmp::Ugt:
3770       Src0LoRM = legalize(loOperand(Src0), Legal_Reg | Legal_Mem);
3771     // These two test only the high half's sign bit, so they need only
3772     // the upper half legalized.
3773     case InstIcmp::Sge:
3774     case InstIcmp::Slt:
3775       Src0HiRM = legalize(hiOperand(Src0), Legal_Reg | Legal_Mem);
3776       break;
3777 
3778     // These two move constants and hence need no legalization.
3779     case InstIcmp::Uge:
3780     case InstIcmp::Ult:
3781       break;
3782     }
3783   } else {
3784     Src0LoRM = legalize(loOperand(Src0), Legal_Reg | Legal_Mem);
3785     Src0HiRM = legalize(hiOperand(Src0), Legal_Reg | Legal_Mem);
3786   }
3787   // Optimize comparisons with zero.
3788   if (isZero(Src1)) {
3789     Constant *SignMask = Ctx->getConstantInt32(0x80000000);
3790     Variable *Temp = nullptr;
3791     switch (Condition) {
3792     default:
3793       llvm_unreachable("unexpected condition");
3794       break;
3795     case InstIcmp::Eq:
3796     case InstIcmp::Ule:
3797       // Mov Src0HiRM first, because it was legalized most recently, and will
3798       // sometimes avoid a move before the OR.
3799       _mov(Temp, Src0HiRM);
3800       _or(Temp, Src0LoRM);
3801       Context.insert<InstFakeUse>(Temp);
3802       setccOrConsumer(Traits::Cond::Br_e, Dest, Consumer);
3803       return;
3804     case InstIcmp::Ne:
3805     case InstIcmp::Ugt:
3806       // Mov Src0HiRM first, because it was legalized most recently, and will
3807       // sometimes avoid a move before the OR.
3808       _mov(Temp, Src0HiRM);
3809       _or(Temp, Src0LoRM);
3810       Context.insert<InstFakeUse>(Temp);
3811       setccOrConsumer(Traits::Cond::Br_ne, Dest, Consumer);
3812       return;
3813     case InstIcmp::Uge:
3814       movOrConsumer(true, Dest, Consumer);
3815       return;
3816     case InstIcmp::Ult:
3817       movOrConsumer(false, Dest, Consumer);
3818       return;
3819     case InstIcmp::Sgt:
3820       break;
3821     case InstIcmp::Sge:
3822       _test(Src0HiRM, SignMask);
3823       setccOrConsumer(Traits::Cond::Br_e, Dest, Consumer);
3824       return;
3825     case InstIcmp::Slt:
3826       _test(Src0HiRM, SignMask);
3827       setccOrConsumer(Traits::Cond::Br_ne, Dest, Consumer);
3828       return;
3829     case InstIcmp::Sle:
3830       break;
3831     }
3832   }
3833   // Handle general compares.
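  // The schema compares the high halves first: C1/C2 settle the cases where
  // the high halves already differ, and only when they are equal does the
  // low-half compare with C3 decide. E.g., for ult the table gives:
  //   cmp hi; jb true; ja false; cmp lo; jb true.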
3834   Operand *Src1LoRI = legalize(loOperand(Src1), Legal_Reg | Legal_Imm);
3835   Operand *Src1HiRI = legalize(hiOperand(Src1), Legal_Reg | Legal_Imm);
3836   if (Consumer == nullptr) {
3837     Constant *Zero = Ctx->getConstantInt(Dest->getType(), 0);
3838     Constant *One = Ctx->getConstantInt(Dest->getType(), 1);
3839     InstX86Label *LabelFalse = InstX86Label::create(Func, this);
3840     InstX86Label *LabelTrue = InstX86Label::create(Func, this);
3841     _mov(Dest, One);
3842     _cmp(Src0HiRM, Src1HiRI);
3843     if (Traits::TableIcmp64[Condition].C1 != Traits::Cond::Br_None)
3844       _br(Traits::TableIcmp64[Condition].C1, LabelTrue);
3845     if (Traits::TableIcmp64[Condition].C2 != Traits::Cond::Br_None)
3846       _br(Traits::TableIcmp64[Condition].C2, LabelFalse);
3847     _cmp(Src0LoRM, Src1LoRI);
3848     _br(Traits::TableIcmp64[Condition].C3, LabelTrue);
3849     Context.insert(LabelFalse);
3850     _redefined(_mov(Dest, Zero));
3851     Context.insert(LabelTrue);
3852     return;
3853   }
3854   if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
3855     _cmp(Src0HiRM, Src1HiRI);
3856     if (Traits::TableIcmp64[Condition].C1 != Traits::Cond::Br_None)
3857       _br(Traits::TableIcmp64[Condition].C1, Br->getTargetTrue());
3858     if (Traits::TableIcmp64[Condition].C2 != Traits::Cond::Br_None)
3859       _br(Traits::TableIcmp64[Condition].C2, Br->getTargetFalse());
3860     _cmp(Src0LoRM, Src1LoRI);
3861     _br(Traits::TableIcmp64[Condition].C3, Br->getTargetTrue(),
3862         Br->getTargetFalse());
3863     return;
3864   }
3865   if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
3866     Operand *SrcT = Select->getTrueOperand();
3867     Operand *SrcF = Select->getFalseOperand();
3868     Variable *SelectDest = Select->getDest();
3869     InstX86Label *LabelFalse = InstX86Label::create(Func, this);
3870     InstX86Label *LabelTrue = InstX86Label::create(Func, this);
3871     lowerMove(SelectDest, SrcT, false);
3872     _cmp(Src0HiRM, Src1HiRI);
3873     if (Traits::TableIcmp64[Condition].C1 != Traits::Cond::Br_None)
3874       _br(Traits::TableIcmp64[Condition].C1, LabelTrue);
3875     if (Traits::TableIcmp64[Condition].C2 != Traits::Cond::Br_None)
3876       _br(Traits::TableIcmp64[Condition].C2, LabelFalse);
3877     _cmp(Src0LoRM, Src1LoRI);
3878     _br(Traits::TableIcmp64[Condition].C3, LabelTrue);
3879     Context.insert(LabelFalse);
3880     static constexpr bool IsRedefinition = true;
3881     lowerMove(SelectDest, SrcF, IsRedefinition);
3882     Context.insert(LabelTrue);
3883     return;
3884   }
3885   llvm::report_fatal_error("Unexpected consumer type");
3886 }
3887 
3888 template <typename TraitsType>
3889 void TargetX86Base<TraitsType>::setccOrConsumer(BrCond Condition,
3890                                                 Variable *Dest,
3891                                                 const Inst *Consumer) {
3892   if (Consumer == nullptr) {
3893     _setcc(Dest, Condition);
3894     return;
3895   }
3896   if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
3897     _br(Condition, Br->getTargetTrue(), Br->getTargetFalse());
3898     return;
3899   }
3900   if (const auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
3901     Operand *SrcT = Select->getTrueOperand();
3902     Operand *SrcF = Select->getFalseOperand();
3903     Variable *SelectDest = Select->getDest();
3904     lowerSelectMove(SelectDest, Condition, SrcT, SrcF);
3905     return;
3906   }
3907   llvm::report_fatal_error("Unexpected consumer type");
3908 }
3909 
3910 template <typename TraitsType>
3911 void TargetX86Base<TraitsType>::movOrConsumer(bool IcmpResult, Variable *Dest,
3912                                               const Inst *Consumer) {
3913   if (Consumer == nullptr) {
3914     _mov(Dest, Ctx->getConstantInt(Dest->getType(), (IcmpResult ? 1 : 0)));
3915     return;
3916   }
3917   if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
3918     // TODO(sehr,stichnot): This could be done with a single unconditional
    // branch instruction, but Subzero doesn't currently know how to handle the
    // resulting control flow graph changes. Make it do so to eliminate the mov
    // and cmp.
3921     _mov(Dest, Ctx->getConstantInt(Dest->getType(), (IcmpResult ? 1 : 0)));
3922     _cmp(Dest, Ctx->getConstantInt(Dest->getType(), 0));
3923     _br(Traits::Cond::Br_ne, Br->getTargetTrue(), Br->getTargetFalse());
3924     return;
3925   }
3926   if (const auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
3927     Operand *Src = nullptr;
3928     if (IcmpResult) {
3929       Src = legalize(Select->getTrueOperand(), Legal_Reg | Legal_Imm);
3930     } else {
3931       Src = legalize(Select->getFalseOperand(), Legal_Reg | Legal_Imm);
3932     }
3933     Variable *SelectDest = Select->getDest();
3934     lowerMove(SelectDest, Src, false);
3935     return;
3936   }
3937   llvm::report_fatal_error("Unexpected consumer type");
3938 }
3939 
3940 template <typename TraitsType>
3941 void TargetX86Base<TraitsType>::lowerArithAndConsumer(
3942     const InstArithmetic *Arith, const Inst *Consumer) {
3943   Variable *T = nullptr;
3944   Operand *Src0 = legalize(Arith->getSrc(0));
3945   Operand *Src1 = legalize(Arith->getSrc(1));
3946   Variable *Dest = Arith->getDest();
3947   switch (Arith->getOp()) {
3948   default:
3949     llvm_unreachable("arithmetic operator not AND or OR");
3950     break;
3951   case InstArithmetic::And:
3952     _mov(T, Src0);
3953     // Test cannot have an address in the second position.  Since T is
3954     // guaranteed to be a register and Src1 could be a memory load, ensure
3955     // that the second argument is a register.
3956     if (llvm::isa<Constant>(Src1))
3957       _test(T, Src1);
3958     else
3959       _test(Src1, T);
3960     break;
3961   case InstArithmetic::Or:
3962     _mov(T, Src0);
3963     _or(T, Src1);
3964     break;
3965   }
3966 
3967   if (Consumer == nullptr) {
3968     llvm::report_fatal_error("Expected a consumer instruction");
3969   }
3970   if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
3971     Context.insert<InstFakeUse>(T);
3972     Context.insert<InstFakeDef>(Dest);
3973     _br(Traits::Cond::Br_ne, Br->getTargetTrue(), Br->getTargetFalse());
3974     return;
3975   }
3976   llvm::report_fatal_error("Unexpected consumer type");
3977 }
3978 
3979 template <typename TraitsType>
3980 void TargetX86Base<TraitsType>::lowerInsertElement(
3981     const InstInsertElement *Instr) {
3982   Operand *SourceVectNotLegalized = Instr->getSrc(0);
3983   Operand *ElementToInsertNotLegalized = Instr->getSrc(1);
3984   auto *ElementIndex = llvm::dyn_cast<ConstantInteger32>(Instr->getSrc(2));
3985   // Only constant indices are allowed in PNaCl IR.
3986   assert(ElementIndex);
3987   unsigned Index = ElementIndex->getValue();
3988   assert(Index < typeNumElements(SourceVectNotLegalized->getType()));
3989 
3990   Type Ty = SourceVectNotLegalized->getType();
3991   Type ElementTy = typeElementType(Ty);
3992   Type InVectorElementTy = Traits::getInVectorElementType(Ty);
3993 
3994   if (ElementTy == IceType_i1) {
3995     // Expand the element to the appropriate size for it to be inserted in the
3996     // vector.
3997     Variable *Expanded = Func->makeVariable(InVectorElementTy);
3998     auto *Cast = InstCast::create(Func, InstCast::Zext, Expanded,
3999                                   ElementToInsertNotLegalized);
4000     lowerCast(Cast);
4001     ElementToInsertNotLegalized = Expanded;
4002   }
4003 
4004   if (Ty == IceType_v8i16 || Ty == IceType_v8i1 ||
4005       InstructionSet >= Traits::SSE4_1) {
4006     // Use insertps, pinsrb, pinsrw, or pinsrd.
4007     Operand *ElementRM =
4008         legalize(ElementToInsertNotLegalized, Legal_Reg | Legal_Mem);
4009     Operand *SourceVectRM =
4010         legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem);
4011     Variable *T = makeReg(Ty);
4012     _movp(T, SourceVectRM);
4013     if (Ty == IceType_v4f32) {
4014       _insertps(T, ElementRM, Ctx->getConstantInt32(Index << 4));
4015     } else {
4016       // For the pinsrb and pinsrw instructions, when the source operand is a
4017       // register, it must be a full r32 register like eax, and not ax/al/ah.
      // For filetype=asm, InstX86Pinsr<TraitsType>::emit() compensates for the
      // use of r16 and r8 by converting them through getBaseReg(), while
      // emitIAS() validates that the original and base register encodings are
      // the same.
4022       if (ElementRM->getType() == IceType_i8 &&
4023           llvm::isa<Variable>(ElementRM)) {
4024         // Don't use ah/bh/ch/dh for pinsrb.
4025         ElementRM = copyToReg8(ElementRM);
4026       }
4027       _pinsr(T, ElementRM, Ctx->getConstantInt32(Index));
4028     }
4029     _movp(Instr->getDest(), T);
4030   } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
4031     // Use shufps or movss.
4032     Variable *ElementR = nullptr;
4033     Operand *SourceVectRM =
4034         legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem);
4035 
4036     if (InVectorElementTy == IceType_f32) {
4037       // ElementR will be in an XMM register since it is floating point.
4038       ElementR = legalizeToReg(ElementToInsertNotLegalized);
4039     } else {
4040       // Copy an integer to an XMM register.
4041       Operand *T = legalize(ElementToInsertNotLegalized, Legal_Reg | Legal_Mem);
4042       ElementR = makeReg(Ty);
4043       _movd(ElementR, T);
4044     }
4045 
4046     if (Index == 0) {
4047       Variable *T = makeReg(Ty);
4048       _movp(T, SourceVectRM);
4049       _movss(T, ElementR);
4050       _movp(Instr->getDest(), T);
4051       return;
4052     }
4053 
4054     // shufps treats the source and destination operands as vectors of four
4055     // doublewords. The destination's two high doublewords are selected from
4056     // the source operand and the two low doublewords are selected from the
    // (original value of the) destination operand. An insertelement operation
    // can be effected with a sequence of two shufps operations with
    // appropriate masks. In all cases below, Element[0] is being inserted into
    // SourceVectRM. Indices are ordered from left to right.
4061     //
4062     // insertelement into index 1 (result is stored in ElementR):
4063     //   ElementR := ElementR[0, 0] SourceVectRM[0, 0]
4064     //   ElementR := ElementR[3, 0] SourceVectRM[2, 3]
4065     //
4066     // insertelement into index 2 (result is stored in T):
4067     //   T := SourceVectRM
4068     //   ElementR := ElementR[0, 0] T[0, 3]
4069     //   T := T[0, 1] ElementR[0, 3]
4070     //
4071     // insertelement into index 3 (result is stored in T):
4072     //   T := SourceVectRM
4073     //   ElementR := ElementR[0, 0] T[0, 2]
4074     //   T := T[0, 1] ElementR[3, 0]
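    // Each shufps immediate packs four 2-bit lane selectors, lane 0 in the
    // low bits. E.g., for Index == 2, Mask1[1] == 192 == 0b11000000 selects
    // {ElementR[0], ElementR[0], T[0], T[3]}, and Mask2[1] == 196 ==
    // 0b11000100 selects {T[0], T[1], ElementR[0], ElementR[3]}.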
4075     const unsigned char Mask1[3] = {0, 192, 128};
4076     const unsigned char Mask2[3] = {227, 196, 52};
4077 
4078     Constant *Mask1Constant = Ctx->getConstantInt32(Mask1[Index - 1]);
4079     Constant *Mask2Constant = Ctx->getConstantInt32(Mask2[Index - 1]);
4080 
4081     if (Index == 1) {
4082       _shufps(ElementR, SourceVectRM, Mask1Constant);
4083       _shufps(ElementR, SourceVectRM, Mask2Constant);
4084       _movp(Instr->getDest(), ElementR);
4085     } else {
4086       Variable *T = makeReg(Ty);
4087       _movp(T, SourceVectRM);
4088       _shufps(ElementR, T, Mask1Constant);
4089       _shufps(T, ElementR, Mask2Constant);
4090       _movp(Instr->getDest(), T);
4091     }
4092   } else {
4093     assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
4094     // Spill the value to a stack slot and perform the insertion in memory.
4095     //
4096     // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when support
4097     // for legalizing to mem is implemented.
4098     Variable *Slot = Func->makeVariable(Ty);
4099     Slot->setMustNotHaveReg();
4100     _movp(Slot, legalizeToReg(SourceVectNotLegalized));
4101 
4102     // Compute the location of the position to insert in memory.
4103     unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);
4104     X86OperandMem *Loc =
4105         getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset);
4106     _store(legalizeToReg(ElementToInsertNotLegalized), Loc);
4107 
4108     Variable *T = makeReg(Ty);
4109     _movp(T, Slot);
4110     _movp(Instr->getDest(), T);
4111   }
4112 }
4113 
4114 template <typename TraitsType>
4115 void TargetX86Base<TraitsType>::lowerIntrinsic(const InstIntrinsic *Instr) {
4116   switch (Intrinsics::IntrinsicID ID = Instr->getIntrinsicID()) {
4117   case Intrinsics::AtomicCmpxchg: {
4118     if (!Intrinsics::isMemoryOrderValid(
4119             ID, getConstantMemoryOrder(Instr->getArg(3)),
4120             getConstantMemoryOrder(Instr->getArg(4)))) {
4121       Func->setError("Unexpected memory ordering for AtomicCmpxchg");
4122       return;
4123     }
4124     Variable *DestPrev = Instr->getDest();
4125     Operand *PtrToMem = legalize(Instr->getArg(0));
4126     Operand *Expected = legalize(Instr->getArg(1));
4127     Operand *Desired = legalize(Instr->getArg(2));
4128     if (tryOptimizedCmpxchgCmpBr(DestPrev, PtrToMem, Expected, Desired))
4129       return;
4130     lowerAtomicCmpxchg(DestPrev, PtrToMem, Expected, Desired);
4131     return;
4132   }
4133   case Intrinsics::AtomicFence:
4134     if (!Intrinsics::isMemoryOrderValid(
4135             ID, getConstantMemoryOrder(Instr->getArg(0)))) {
4136       Func->setError("Unexpected memory ordering for AtomicFence");
4137       return;
4138     }
4139     _mfence();
4140     return;
4141   case Intrinsics::AtomicFenceAll:
    // NOTE: FenceAll should prevent any load/store from being moved across the
4143     // fence (both atomic and non-atomic). The InstX8632Mfence instruction is
4144     // currently marked coarsely as "HasSideEffects".
4145     _mfence();
4146     return;
4147   case Intrinsics::AtomicIsLockFree: {
4148     // X86 is always lock free for 8/16/32/64 bit accesses.
4149     // TODO(jvoung): Since the result is constant when given a constant byte
4150     // size, this opens up DCE opportunities.
4151     Operand *ByteSize = Instr->getArg(0);
4152     Variable *Dest = Instr->getDest();
4153     if (auto *CI = llvm::dyn_cast<ConstantInteger32>(ByteSize)) {
4154       Constant *Result;
4155       switch (CI->getValue()) {
4156       default:
4157         // Some x86-64 processors support the cmpxchg16b instruction, which can
4158         // make 16-byte operations lock free (when used with the LOCK prefix).
4159         // However, that's not supported in 32-bit mode, so just return 0 even
4160         // for large sizes.
4161         Result = Ctx->getConstantZero(IceType_i32);
4162         break;
4163       case 1:
4164       case 2:
4165       case 4:
4166       case 8:
4167         Result = Ctx->getConstantInt32(1);
4168         break;
4169       }
4170       _mov(Dest, Result);
4171       return;
4172     }
4173     // The PNaCl ABI requires the byte size to be a compile-time constant.
4174     Func->setError("AtomicIsLockFree byte size should be compile-time const");
4175     return;
4176   }
4177   case Intrinsics::AtomicLoad: {
4178     // We require the memory address to be naturally aligned. Given that is the
    // case, normal loads are atomic.
4180     if (!Intrinsics::isMemoryOrderValid(
4181             ID, getConstantMemoryOrder(Instr->getArg(1)))) {
4182       Func->setError("Unexpected memory ordering for AtomicLoad");
4183       return;
4184     }
4185     Variable *Dest = Instr->getDest();
4186     if (!Traits::Is64Bit) {
4187       if (auto *Dest64On32 = llvm::dyn_cast<Variable64On32>(Dest)) {
4188         // Follow what GCC does and use a movq instead of what lowerLoad()
4189         // normally does (split the load into two). Thus, this skips
4190         // load/arithmetic op folding. Load/arithmetic folding can't happen
4191         // anyway, since this is x86-32 and integer arithmetic only happens on
4192         // 32-bit quantities.
4193         Variable *T = makeReg(IceType_f64);
4194         X86OperandMem *Addr = formMemoryOperand(Instr->getArg(0), IceType_f64);
4195         _movq(T, Addr);
4196         // Then cast the bits back out of the XMM register to the i64 Dest.
4197         auto *Cast = InstCast::create(Func, InstCast::Bitcast, Dest, T);
4198         lowerCast(Cast);
4199         // Make sure that the atomic load isn't elided when unused.
4200         Context.insert<InstFakeUse>(Dest64On32->getLo());
4201         Context.insert<InstFakeUse>(Dest64On32->getHi());
4202         return;
4203       }
4204     }
4205     auto *Load = InstLoad::create(Func, Dest, Instr->getArg(0));
4206     lowerLoad(Load);
4207     // Make sure the atomic load isn't elided when unused, by adding a FakeUse.
4208     // Since lowerLoad may fuse the load w/ an arithmetic instruction, insert
4209     // the FakeUse on the last-inserted instruction's dest.
4210     Context.insert<InstFakeUse>(Context.getLastInserted()->getDest());
4211     return;
4212   }
4213   case Intrinsics::AtomicRMW:
4214     if (!Intrinsics::isMemoryOrderValid(
4215             ID, getConstantMemoryOrder(Instr->getArg(3)))) {
4216       Func->setError("Unexpected memory ordering for AtomicRMW");
4217       return;
4218     }
4219     lowerAtomicRMW(
4220         Instr->getDest(),
4221         static_cast<uint32_t>(
4222             llvm::cast<ConstantInteger32>(Instr->getArg(0))->getValue()),
4223         Instr->getArg(1), Instr->getArg(2));
4224     return;
4225   case Intrinsics::AtomicStore: {
4226     if (!Intrinsics::isMemoryOrderValid(
4227             ID, getConstantMemoryOrder(Instr->getArg(2)))) {
4228       Func->setError("Unexpected memory ordering for AtomicStore");
4229       return;
4230     }
    // We require the memory address to be naturally aligned; given that,
    // normal stores are atomic. Add a fence after the store to make it
    // visible.
4234     Operand *Value = Instr->getArg(0);
4235     Operand *Ptr = Instr->getArg(1);
4236     if (!Traits::Is64Bit && Value->getType() == IceType_i64) {
      // Use a movq instead of what lowerStore() normally does (split the
      // store into two), following what GCC does. Cast the bits from the i64
      // into an XMM register first.
4240       Variable *T = makeReg(IceType_f64);
4241       auto *Cast = InstCast::create(Func, InstCast::Bitcast, T, Value);
4242       lowerCast(Cast);
4243       // Then store XMM w/ a movq.
4244       X86OperandMem *Addr = formMemoryOperand(Ptr, IceType_f64);
4245       _storeq(T, Addr);
4246       _mfence();
4247       return;
4248     }
4249     auto *Store = InstStore::create(Func, Value, Ptr);
4250     lowerStore(Store);
4251     _mfence();
4252     return;
4253   }
4254   case Intrinsics::Bswap: {
4255     Variable *Dest = Instr->getDest();
4256     Operand *Val = Instr->getArg(0);
4257     // In 32-bit mode, bswap only works on 32-bit arguments, and the argument
4258     // must be a register. Use rotate left for 16-bit bswap.
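    // (For i16, "rol reg, 8" swaps the two bytes, e.g. 0x1234 -> 0x3412.)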
4259     if (!Traits::Is64Bit && Val->getType() == IceType_i64) {
4260       Val = legalizeUndef(Val);
4261       Variable *T_Lo = legalizeToReg(loOperand(Val));
4262       Variable *T_Hi = legalizeToReg(hiOperand(Val));
4263       auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
4264       auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
4265       _bswap(T_Lo);
4266       _bswap(T_Hi);
4267       _mov(DestLo, T_Hi);
4268       _mov(DestHi, T_Lo);
4269     } else if ((Traits::Is64Bit && Val->getType() == IceType_i64) ||
4270                Val->getType() == IceType_i32) {
4271       Variable *T = legalizeToReg(Val);
4272       _bswap(T);
4273       _mov(Dest, T);
4274     } else {
4275       assert(Val->getType() == IceType_i16);
4276       Constant *Eight = Ctx->getConstantInt16(8);
4277       Variable *T = nullptr;
4278       Val = legalize(Val);
4279       _mov(T, Val);
4280       _rol(T, Eight);
4281       _mov(Dest, T);
4282     }
4283     return;
4284   }
4285   case Intrinsics::Ctpop: {
4286     Variable *Dest = Instr->getDest();
4287     Variable *T = nullptr;
4288     Operand *Val = Instr->getArg(0);
4289     Type ValTy = Val->getType();
4290     assert(ValTy == IceType_i32 || ValTy == IceType_i64);
4291 
4292     if (!Traits::Is64Bit) {
4293       T = Dest;
4294     } else {
4295       T = makeReg(IceType_i64);
4296       if (ValTy == IceType_i32) {
        // In x86-64, __popcountsi2 is not defined, so we cheat a bit by
        // converting it to a 64-bit value and using ctpop_i64. _movzx should
        // ensure we will not have any bits set on Val's upper 32 bits.
4300         Variable *V = makeReg(IceType_i64);
4301         Operand *ValRM = legalize(Val, Legal_Reg | Legal_Mem);
4302         _movzx(V, ValRM);
4303         Val = V;
4304       }
4305       ValTy = IceType_i64;
4306     }
4307 
4308     InstCall *Call =
4309         makeHelperCall(ValTy == IceType_i32 ? RuntimeHelper::H_call_ctpop_i32
4310                                             : RuntimeHelper::H_call_ctpop_i64,
4311                        T, 1);
4312     Call->addArg(Val);
4313     lowerCall(Call);
4314     // The popcount helpers always return 32-bit values, while the intrinsic's
4315     // signature matches the native POPCNT instruction and fills a 64-bit reg
4316     // (in 64-bit mode). Thus, clear the upper bits of the dest just in case
4317     // the user doesn't do that in the IR. If the user does that in the IR,
    // then this zeroing instruction is dead and gets optimized out.
4319     if (!Traits::Is64Bit) {
4320       assert(T == Dest);
4321       if (Val->getType() == IceType_i64) {
4322         auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
4323         Constant *Zero = Ctx->getConstantZero(IceType_i32);
4324         _mov(DestHi, Zero);
4325       }
4326     } else {
4327       assert(Val->getType() == IceType_i64);
4328       // T is 64 bit. It needs to be copied to dest. We need to:
4329       //
4330       // T_1.32 = trunc T.64 to i32
4331       // T_2.64 = zext T_1.32 to i64
4332       // Dest.<<right_size>> = T_2.<<right_size>>
4333       //
4334       // which ensures the upper 32 bits will always be cleared. Just doing a
4335       //
      // mov Dest.32 = trunc T.64 to i32
4337       //
4338       // is dangerous because there's a chance the compiler will optimize this
4339       // copy out. To use _movzx we need two new registers (one 32-, and
4340       // another 64-bit wide.)
4341       Variable *T_1 = makeReg(IceType_i32);
4342       _mov(T_1, T);
4343       Variable *T_2 = makeReg(IceType_i64);
4344       _movzx(T_2, T_1);
4345       _mov(Dest, T_2);
4346     }
4347     return;
4348   }
4349   case Intrinsics::Ctlz: {
4350     // The "is zero undef" parameter is ignored and we always return a
4351     // well-defined value.
4352     Operand *Val = legalize(Instr->getArg(0));
4353     Operand *FirstVal;
4354     Operand *SecondVal = nullptr;
4355     if (!Traits::Is64Bit && Val->getType() == IceType_i64) {
4356       FirstVal = loOperand(Val);
4357       SecondVal = hiOperand(Val);
4358     } else {
4359       FirstVal = Val;
4360     }
4361     constexpr bool IsCttz = false;
4362     lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal,
4363                     SecondVal);
4364     return;
4365   }
4366   case Intrinsics::Cttz: {
4367     // The "is zero undef" parameter is ignored and we always return a
4368     // well-defined value.
4369     Operand *Val = legalize(Instr->getArg(0));
4370     Operand *FirstVal;
4371     Operand *SecondVal = nullptr;
4372     if (!Traits::Is64Bit && Val->getType() == IceType_i64) {
4373       FirstVal = hiOperand(Val);
4374       SecondVal = loOperand(Val);
4375     } else {
4376       FirstVal = Val;
4377     }
4378     constexpr bool IsCttz = true;
4379     lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal,
4380                     SecondVal);
4381     return;
4382   }
4383   case Intrinsics::Fabs: {
4384     Operand *Src = legalize(Instr->getArg(0));
4385     Type Ty = Src->getType();
4386     Variable *Dest = Instr->getDest();
4387     Variable *T = makeVectorOfFabsMask(Ty);
4388     // The pand instruction operates on an m128 memory operand, so if Src is an
4389     // f32 or f64, we need to make sure it's in a register.
4390     if (isVectorType(Ty)) {
4391       if (llvm::isa<X86OperandMem>(Src))
4392         Src = legalizeToReg(Src);
4393     } else {
4394       Src = legalizeToReg(Src);
4395     }
4396     _pand(T, Src);
4397     if (isVectorType(Ty))
4398       _movp(Dest, T);
4399     else
4400       _mov(Dest, T);
4401     return;
4402   }
4403   case Intrinsics::Longjmp: {
4404     InstCall *Call = makeHelperCall(RuntimeHelper::H_call_longjmp, nullptr, 2);
4405     Call->addArg(Instr->getArg(0));
4406     Call->addArg(Instr->getArg(1));
4407     lowerCall(Call);
4408     return;
4409   }
4410   case Intrinsics::Memcpy: {
4411     lowerMemcpy(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
4412     return;
4413   }
4414   case Intrinsics::Memmove: {
4415     lowerMemmove(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
4416     return;
4417   }
4418   case Intrinsics::Memset: {
4419     lowerMemset(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
4420     return;
4421   }
4422   case Intrinsics::NaClReadTP: {
4423     if (NeedSandboxing) {
4424       Operand *Src =
4425           dispatchToConcrete(&ConcreteTarget::createNaClReadTPSrcOperand);
4426       Variable *Dest = Instr->getDest();
4427       Variable *T = nullptr;
4428       _mov(T, Src);
4429       _mov(Dest, T);
4430     } else {
4431       InstCall *Call =
4432           makeHelperCall(RuntimeHelper::H_call_read_tp, Instr->getDest(), 0);
4433       lowerCall(Call);
4434     }
4435     return;
4436   }
4437   case Intrinsics::Setjmp: {
4438     InstCall *Call =
4439         makeHelperCall(RuntimeHelper::H_call_setjmp, Instr->getDest(), 1);
4440     Call->addArg(Instr->getArg(0));
4441     lowerCall(Call);
4442     return;
4443   }
4444   case Intrinsics::Sqrt: {
4445     assert(isScalarFloatingType(Instr->getDest()->getType()) ||
4446            getFlags().getApplicationBinaryInterface() != ::Ice::ABI_PNaCl);
4447     Operand *Src = legalize(Instr->getArg(0));
4448     Variable *Dest = Instr->getDest();
4449     Variable *T = makeReg(Dest->getType());
4450     _sqrt(T, Src);
4451     if (isVectorType(Dest->getType())) {
4452       _movp(Dest, T);
4453     } else {
4454       _mov(Dest, T);
4455     }
4456     return;
4457   }
4458   case Intrinsics::Stacksave: {
4459     if (!Traits::Is64Bit || !NeedSandboxing) {
4460       Variable *esp = Func->getTarget()->getPhysicalRegister(getStackReg(),
4461                                                              Traits::WordType);
4462       Variable *Dest = Instr->getDest();
4463       _mov(Dest, esp);
4464       return;
4465     }
4466     Variable *esp = Func->getTarget()->getPhysicalRegister(
4467         Traits::RegisterSet::Reg_esp, IceType_i32);
4468     Variable *Dest = Instr->getDest();
4469     _mov(Dest, esp);
4470 
4471     return;
4472   }
4473   case Intrinsics::Stackrestore: {
4474     Operand *Src = Instr->getArg(0);
4475     _mov_sp(Src);
4476     return;
4477   }
4478 
4479   case Intrinsics::Trap:
4480     _ud2();
4481     return;
4482   case Intrinsics::LoadSubVector: {
4483     assert(llvm::isa<ConstantInteger32>(Instr->getArg(1)) &&
4484            "LoadSubVector second argument must be a constant");
4485     Variable *Dest = Instr->getDest();
4486     Type Ty = Dest->getType();
4487     auto *SubVectorSize = llvm::cast<ConstantInteger32>(Instr->getArg(1));
4488     Operand *Addr = Instr->getArg(0);
4489     X86OperandMem *Src = formMemoryOperand(Addr, Ty);
4490     doMockBoundsCheck(Src);
4491 
4492     if (Dest->isRematerializable()) {
4493       Context.insert<InstFakeDef>(Dest);
4494       return;
4495     }
4496 
4497     auto *T = makeReg(Ty);
4498     switch (SubVectorSize->getValue()) {
4499     case 4:
4500       _movd(T, Src);
4501       break;
4502     case 8:
4503       _movq(T, Src);
4504       break;
4505     default:
4506       Func->setError("Unexpected size for LoadSubVector");
4507       return;
4508     }
4509     _movp(Dest, T);
4510     return;
4511   }
4512   case Intrinsics::StoreSubVector: {
4513     assert(llvm::isa<ConstantInteger32>(Instr->getArg(2)) &&
4514            "StoreSubVector third argument must be a constant");
4515     auto *SubVectorSize = llvm::cast<ConstantInteger32>(Instr->getArg(2));
4516     Operand *Value = Instr->getArg(0);
4517     Operand *Addr = Instr->getArg(1);
4518     X86OperandMem *NewAddr = formMemoryOperand(Addr, Value->getType());
4519     doMockBoundsCheck(NewAddr);
4520 
4521     Value = legalizeToReg(Value);
4522 
4523     switch (SubVectorSize->getValue()) {
4524     case 4:
4525       _stored(Value, NewAddr);
4526       break;
4527     case 8:
4528       _storeq(Value, NewAddr);
4529       break;
4530     default:
4531       Func->setError("Unexpected size for StoreSubVector");
4532       return;
4533     }
4534     return;
4535   }
4536   case Intrinsics::VectorPackSigned: {
4537     Operand *Src0 = Instr->getArg(0);
4538     Operand *Src1 = Instr->getArg(1);
4539     Variable *Dest = Instr->getDest();
4540     auto *T = makeReg(Src0->getType());
4541     auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
4542     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
4543     _movp(T, Src0RM);
4544     _packss(T, Src1RM);
4545     _movp(Dest, T);
4546     return;
4547   }
4548   case Intrinsics::VectorPackUnsigned: {
4549     Operand *Src0 = Instr->getArg(0);
4550     Operand *Src1 = Instr->getArg(1);
4551     Variable *Dest = Instr->getDest();
4552     auto *T = makeReg(Src0->getType());
4553     auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
4554     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
4555     _movp(T, Src0RM);
4556     _packus(T, Src1RM);
4557     _movp(Dest, T);
4558     return;
4559   }
4560   case Intrinsics::SignMask: {
4561     Operand *SrcReg = legalizeToReg(Instr->getArg(0));
4562     Variable *Dest = Instr->getDest();
4563     Variable *T = makeReg(IceType_i32);
4564     if (SrcReg->getType() == IceType_v4f32 ||
4565         SrcReg->getType() == IceType_v4i32 ||
4566         SrcReg->getType() == IceType_v16i8) {
4567       _movmsk(T, SrcReg);
4568     } else {
4569       // TODO(capn): We could implement v8i16 sign mask using packsswb/pmovmskb
4570       llvm::report_fatal_error("Invalid type for SignMask intrinsic");
4571     }
4572     _mov(Dest, T);
4573     return;
4574   }
4575   case Intrinsics::MultiplyHighSigned: {
4576     Operand *Src0 = Instr->getArg(0);
4577     Operand *Src1 = Instr->getArg(1);
4578     Variable *Dest = Instr->getDest();
4579     auto *T = makeReg(Dest->getType());
4580     auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
4581     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
4582     _movp(T, Src0RM);
4583     _pmulhw(T, Src1RM);
4584     _movp(Dest, T);
4585     return;
4586   }
4587   case Intrinsics::MultiplyHighUnsigned: {
4588     Operand *Src0 = Instr->getArg(0);
4589     Operand *Src1 = Instr->getArg(1);
4590     Variable *Dest = Instr->getDest();
4591     auto *T = makeReg(Dest->getType());
4592     auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
4593     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
4594     _movp(T, Src0RM);
4595     _pmulhuw(T, Src1RM);
4596     _movp(Dest, T);
4597     return;
4598   }
4599   case Intrinsics::MultiplyAddPairs: {
4600     Operand *Src0 = Instr->getArg(0);
4601     Operand *Src1 = Instr->getArg(1);
4602     Variable *Dest = Instr->getDest();
4603     auto *T = makeReg(Dest->getType());
4604     auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
4605     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
4606     _movp(T, Src0RM);
4607     _pmaddwd(T, Src1RM);
4608     _movp(Dest, T);
4609     return;
4610   }
4611   case Intrinsics::AddSaturateSigned: {
4612     Operand *Src0 = Instr->getArg(0);
4613     Operand *Src1 = Instr->getArg(1);
4614     Variable *Dest = Instr->getDest();
4615     auto *T = makeReg(Dest->getType());
4616     auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
4617     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
4618     _movp(T, Src0RM);
4619     _padds(T, Src1RM);
4620     _movp(Dest, T);
4621     return;
4622   }
4623   case Intrinsics::SubtractSaturateSigned: {
4624     Operand *Src0 = Instr->getArg(0);
4625     Operand *Src1 = Instr->getArg(1);
4626     Variable *Dest = Instr->getDest();
4627     auto *T = makeReg(Dest->getType());
4628     auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
4629     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
4630     _movp(T, Src0RM);
4631     _psubs(T, Src1RM);
4632     _movp(Dest, T);
4633     return;
4634   }
4635   case Intrinsics::AddSaturateUnsigned: {
4636     Operand *Src0 = Instr->getArg(0);
4637     Operand *Src1 = Instr->getArg(1);
4638     Variable *Dest = Instr->getDest();
4639     auto *T = makeReg(Dest->getType());
4640     auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
4641     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
4642     _movp(T, Src0RM);
4643     _paddus(T, Src1RM);
4644     _movp(Dest, T);
4645     return;
4646   }
4647   case Intrinsics::SubtractSaturateUnsigned: {
4648     Operand *Src0 = Instr->getArg(0);
4649     Operand *Src1 = Instr->getArg(1);
4650     Variable *Dest = Instr->getDest();
4651     auto *T = makeReg(Dest->getType());
4652     auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
4653     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
4654     _movp(T, Src0RM);
4655     _psubus(T, Src1RM);
4656     _movp(Dest, T);
4657     return;
4658   }
4659   case Intrinsics::Nearbyint: {
4660     Operand *Src = Instr->getArg(0);
4661     Variable *Dest = Instr->getDest();
4662     Type DestTy = Dest->getType();
4663     if (isVectorType(DestTy)) {
4664       assert(DestTy == IceType_v4i32);
4665       assert(Src->getType() == IceType_v4f32);
4666       Operand *Src0R = legalizeToReg(Src);
4667       Variable *T = makeReg(DestTy);
4668       _cvt(T, Src0R, Traits::Insts::Cvt::Ps2dq);
4669       _movp(Dest, T);
4670     } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
4671       llvm::report_fatal_error("Helper call was expected");
4672     } else {
4673       Operand *Src0RM = legalize(Src, Legal_Reg | Legal_Mem);
4674       // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
4675       Variable *T_1 = nullptr;
4676       if (Traits::Is64Bit && DestTy == IceType_i64) {
4677         T_1 = makeReg(IceType_i64);
4678       } else {
4679         assert(DestTy != IceType_i64);
4680         T_1 = makeReg(IceType_i32);
4681       }
4682       // cvt() requires its integer argument to be a GPR.
4683       Variable *T_2 = makeReg(DestTy);
4684       if (isByteSizedType(DestTy)) {
4685         assert(T_1->getType() == IceType_i32);
4686         T_1->setRegClass(RCX86_Is32To8);
4687         T_2->setRegClass(RCX86_IsTrunc8Rcvr);
4688       }
4689       _cvt(T_1, Src0RM, Traits::Insts::Cvt::Ss2si);
4690       _mov(T_2, T_1); // T_1 and T_2 may have different integer types
4691       if (DestTy == IceType_i1)
4692         _and(T_2, Ctx->getConstantInt1(1));
4693       _mov(Dest, T_2);
4694     }
4695     return;
4696   }
4697   case Intrinsics::Round: {
4698     assert(InstructionSet >= Traits::SSE4_1);
4699     Variable *Dest = Instr->getDest();
4700     Operand *Src = Instr->getArg(0);
4701     Operand *Mode = Instr->getArg(1);
4702     assert(llvm::isa<ConstantInteger32>(Mode) &&
4703            "Round last argument must be a constant");
4704     auto *SrcRM = legalize(Src, Legal_Reg | Legal_Mem);
4705     int32_t Imm = llvm::cast<ConstantInteger32>(Mode)->getValue();
4706     (void)Imm;
4707     assert(Imm >= 0 && Imm < 4 && "Invalid rounding mode");
4708     auto *T = makeReg(Dest->getType());
4709     _round(T, SrcRM, Mode);
4710     _movp(Dest, T);
4711     return;
4712   }
4713   default: // UnknownIntrinsic
4714     Func->setError("Unexpected intrinsic");
4715     return;
4716   }
4717   return;
4718 }
4719 
4720 template <typename TraitsType>
4721 void TargetX86Base<TraitsType>::lowerAtomicCmpxchg(Variable *DestPrev,
4722                                                    Operand *Ptr,
4723                                                    Operand *Expected,
4724                                                    Operand *Desired) {
4725   Type Ty = Expected->getType();
4726   if (!Traits::Is64Bit && Ty == IceType_i64) {
4727     // Reserve the pre-colored registers first, before adding any more
4728     // infinite-weight variables from formMemoryOperand's legalization.
4729     Variable *T_edx = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
4730     Variable *T_eax = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
4731     Variable *T_ecx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ecx);
4732     Variable *T_ebx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ebx);
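    // cmpxchg8b compares edx:eax against the 64-bit memory operand and, on a
    // match, stores ecx:ebx into it; otherwise it loads the memory value into
    // edx:eax. Hence Expected goes to edx:eax and Desired to ecx:ebx below.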
4733     _mov(T_eax, loOperand(Expected));
4734     _mov(T_edx, hiOperand(Expected));
4735     _mov(T_ebx, loOperand(Desired));
4736     _mov(T_ecx, hiOperand(Desired));
4737     X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
4738     constexpr bool Locked = true;
4739     _cmpxchg8b(Addr, T_edx, T_eax, T_ecx, T_ebx, Locked);
4740     auto *DestLo = llvm::cast<Variable>(loOperand(DestPrev));
4741     auto *DestHi = llvm::cast<Variable>(hiOperand(DestPrev));
4742     _mov(DestLo, T_eax);
4743     _mov(DestHi, T_edx);
4744     return;
4745   }
4746   RegNumT Eax;
4747   switch (Ty) {
4748   default:
4749     llvm::report_fatal_error("Bad type for cmpxchg");
4750   case IceType_i64:
4751     Eax = Traits::getRaxOrDie();
4752     break;
4753   case IceType_i32:
4754     Eax = Traits::RegisterSet::Reg_eax;
4755     break;
4756   case IceType_i16:
4757     Eax = Traits::RegisterSet::Reg_ax;
4758     break;
4759   case IceType_i8:
4760     Eax = Traits::RegisterSet::Reg_al;
4761     break;
4762   }
4763   Variable *T_eax = makeReg(Ty, Eax);
4764   _mov(T_eax, Expected);
4765   X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
4766   Variable *DesiredReg = legalizeToReg(Desired);
4767   constexpr bool Locked = true;
4768   _cmpxchg(Addr, T_eax, DesiredReg, Locked);
4769   _mov(DestPrev, T_eax);
4770 }
4771 
4772 template <typename TraitsType>
4773 bool TargetX86Base<TraitsType>::tryOptimizedCmpxchgCmpBr(Variable *Dest,
4774                                                          Operand *PtrToMem,
4775                                                          Operand *Expected,
4776                                                          Operand *Desired) {
4777   if (Func->getOptLevel() == Opt_m1)
4778     return false;
4779   // Peek ahead a few instructions and see how Dest is used.
4780   // It's very common to have:
4781   //
4782   // %x = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* ptr, i32 %expected, ...)
4783   // [%y_phi = ...] // list of phi stores
4784   // %p = icmp eq i32 %x, %expected
4785   // br i1 %p, label %l1, label %l2
4786   //
4787   // which we can optimize into:
4788   //
4789   // %x = <cmpxchg code>
4790   // [%y_phi = ...] // list of phi stores
4791   // br eq, %l1, %l2
4792   InstList::iterator I = Context.getCur();
4793   // I is currently the InstIntrinsic. Peek past that.
4794   // This assumes that the atomic cmpxchg has not been lowered yet,
  // so that the instructions seen in the scan from "Cur" are simple.
4796   assert(llvm::isa<InstIntrinsic>(*I));
4797   Inst *NextInst = Context.getNextInst(I);
4798   if (!NextInst)
4799     return false;
4800   // There might be phi assignments right before the compare+branch, since this
4801   // could be a backward branch for a loop. This placement of assignments is
4802   // determined by placePhiStores().
4803   CfgVector<InstAssign *> PhiAssigns;
4804   while (auto *PhiAssign = llvm::dyn_cast<InstAssign>(NextInst)) {
4805     if (PhiAssign->getDest() == Dest)
4806       return false;
4807     PhiAssigns.push_back(PhiAssign);
4808     NextInst = Context.getNextInst(I);
4809     if (!NextInst)
4810       return false;
4811   }
4812   if (auto *NextCmp = llvm::dyn_cast<InstIcmp>(NextInst)) {
4813     if (!(NextCmp->getCondition() == InstIcmp::Eq &&
4814           ((NextCmp->getSrc(0) == Dest && NextCmp->getSrc(1) == Expected) ||
4815            (NextCmp->getSrc(1) == Dest && NextCmp->getSrc(0) == Expected)))) {
4816       return false;
4817     }
4818     NextInst = Context.getNextInst(I);
4819     if (!NextInst)
4820       return false;
4821     if (auto *NextBr = llvm::dyn_cast<InstBr>(NextInst)) {
4822       if (!NextBr->isUnconditional() &&
4823           NextCmp->getDest() == NextBr->getCondition() &&
4824           NextBr->isLastUse(NextCmp->getDest())) {
4825         lowerAtomicCmpxchg(Dest, PtrToMem, Expected, Desired);
4826         for (size_t i = 0; i < PhiAssigns.size(); ++i) {
4827           // Lower the phi assignments now, before the branch (same placement
4828           // as before).
4829           InstAssign *PhiAssign = PhiAssigns[i];
4830           PhiAssign->setDeleted();
4831           lowerAssign(PhiAssign);
4832           Context.advanceNext();
4833         }
4834         _br(Traits::Cond::Br_e, NextBr->getTargetTrue(),
4835             NextBr->getTargetFalse());
        // Skip over the old compare and branch by deleting them.
4837         NextCmp->setDeleted();
4838         NextBr->setDeleted();
4839         Context.advanceNext();
4840         Context.advanceNext();
4841         return true;
4842       }
4843     }
4844   }
4845   return false;
4846 }
4847 
4848 template <typename TraitsType>
4849 void TargetX86Base<TraitsType>::lowerAtomicRMW(Variable *Dest,
4850                                                uint32_t Operation, Operand *Ptr,
4851                                                Operand *Val) {
4852   bool NeedsCmpxchg = false;
4853   LowerBinOp Op_Lo = nullptr;
4854   LowerBinOp Op_Hi = nullptr;
4855   switch (Operation) {
4856   default:
4857     Func->setError("Unknown AtomicRMW operation");
4858     return;
4859   case Intrinsics::AtomicAdd: {
4860     if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
      // All the fall-through paths must set this to true; it is only used
      // for asserting.
4863       NeedsCmpxchg = true;
4864       Op_Lo = &TargetX86Base<TraitsType>::_add;
4865       Op_Hi = &TargetX86Base<TraitsType>::_adc;
4866       break;
4867     }
4868     X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
4869     constexpr bool Locked = true;
4870     Variable *T = nullptr;
4871     _mov(T, Val);
4872     _xadd(Addr, T, Locked);
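    // (xadd stores Addr+T back to memory and leaves the previous memory value
    // in T, which is exactly the old value that AtomicRMW must return.)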
4873     _mov(Dest, T);
4874     return;
4875   }
4876   case Intrinsics::AtomicSub: {
4877     if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
4878       NeedsCmpxchg = true;
4879       Op_Lo = &TargetX86Base<TraitsType>::_sub;
4880       Op_Hi = &TargetX86Base<TraitsType>::_sbb;
4881       break;
4882     }
4883     X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
4884     constexpr bool Locked = true;
4885     Variable *T = nullptr;
4886     _mov(T, Val);
4887     _neg(T);
4888     _xadd(Addr, T, Locked);
4889     _mov(Dest, T);
4890     return;
4891   }
4892   case Intrinsics::AtomicOr:
4893     // TODO(jvoung): If Dest is null or dead, then some of these
4894     // operations do not need an "exchange", but just a locked op.
4895     // That appears to be "worth" it for sub, or, and, and xor.
4896     // xadd is probably fine vs lock add for add, and xchg is fine
4897     // vs an atomic store.
4898     NeedsCmpxchg = true;
4899     Op_Lo = &TargetX86Base<TraitsType>::_or;
4900     Op_Hi = &TargetX86Base<TraitsType>::_or;
4901     break;
4902   case Intrinsics::AtomicAnd:
4903     NeedsCmpxchg = true;
4904     Op_Lo = &TargetX86Base<TraitsType>::_and;
4905     Op_Hi = &TargetX86Base<TraitsType>::_and;
4906     break;
4907   case Intrinsics::AtomicXor:
4908     NeedsCmpxchg = true;
4909     Op_Lo = &TargetX86Base<TraitsType>::_xor;
4910     Op_Hi = &TargetX86Base<TraitsType>::_xor;
4911     break;
4912   case Intrinsics::AtomicExchange:
4913     if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
4914       NeedsCmpxchg = true;
4915       // NeedsCmpxchg, but no real Op_Lo/Op_Hi need to be done. The values
4916       // just need to be moved to the ecx and ebx registers.
4917       Op_Lo = nullptr;
4918       Op_Hi = nullptr;
4919       break;
4920     }
4921     X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
4922     Variable *T = nullptr;
4923     _mov(T, Val);
4924     _xchg(Addr, T);
4925     _mov(Dest, T);
4926     return;
4927   }
4928   // Otherwise, we need a cmpxchg loop.
4929   (void)NeedsCmpxchg;
4930   assert(NeedsCmpxchg);
4931   expandAtomicRMWAsCmpxchg(Op_Lo, Op_Hi, Dest, Ptr, Val);
4932 }
4933 
4934 template <typename TraitsType>
4935 void TargetX86Base<TraitsType>::expandAtomicRMWAsCmpxchg(LowerBinOp Op_Lo,
4936                                                          LowerBinOp Op_Hi,
4937                                                          Variable *Dest,
4938                                                          Operand *Ptr,
4939                                                          Operand *Val) {
4940   // Expand a more complex RMW operation as a cmpxchg loop:
4941   // For 64-bit:
4942   //   mov     eax, [ptr]
4943   //   mov     edx, [ptr + 4]
4944   // .LABEL:
4945   //   mov     ebx, eax
4946   //   <Op_Lo> ebx, <desired_adj_lo>
4947   //   mov     ecx, edx
4948   //   <Op_Hi> ecx, <desired_adj_hi>
4949   //   lock cmpxchg8b [ptr]
4950   //   jne     .LABEL
4951   //   mov     <dest_lo>, eax
  //   mov     <dest_hi>, edx
4953   //
4954   // For 32-bit:
4955   //   mov     eax, [ptr]
4956   // .LABEL:
4957   //   mov     <reg>, eax
4958   //   op      <reg>, [desired_adj]
4959   //   lock cmpxchg [ptr], <reg>
4960   //   jne     .LABEL
4961   //   mov     <dest>, eax
4962   //
4963   // If Op_{Lo,Hi} are nullptr, then just copy the value.
4964   Val = legalize(Val);
4965   Type Ty = Val->getType();
4966   if (!Traits::Is64Bit && Ty == IceType_i64) {
4967     Variable *T_edx = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
4968     Variable *T_eax = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
4969     X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
4970     _mov(T_eax, loOperand(Addr));
4971     _mov(T_edx, hiOperand(Addr));
4972     Variable *T_ecx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ecx);
4973     Variable *T_ebx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ebx);
4974     InstX86Label *Label = InstX86Label::create(Func, this);
4975     const bool IsXchg8b = Op_Lo == nullptr && Op_Hi == nullptr;
4976     if (!IsXchg8b) {
4977       Context.insert(Label);
4978       _mov(T_ebx, T_eax);
4979       (this->*Op_Lo)(T_ebx, loOperand(Val));
4980       _mov(T_ecx, T_edx);
4981       (this->*Op_Hi)(T_ecx, hiOperand(Val));
4982     } else {
4983       // This is for xchg, which doesn't need an actual Op_Lo/Op_Hi.
4984       // It just needs the Val loaded into ebx and ecx.
4985       // That can also be done before the loop.
4986       _mov(T_ebx, loOperand(Val));
4987       _mov(T_ecx, hiOperand(Val));
4988       Context.insert(Label);
4989     }
4990     constexpr bool Locked = true;
4991     _cmpxchg8b(Addr, T_edx, T_eax, T_ecx, T_ebx, Locked);
4992     _br(Traits::Cond::Br_ne, Label);
4993     if (!IsXchg8b) {
4994       // If Val is a variable, model the extended live range of Val through
4995       // the end of the loop, since it will be re-used by the loop.
4996       if (auto *ValVar = llvm::dyn_cast<Variable>(Val)) {
4997         auto *ValLo = llvm::cast<Variable>(loOperand(ValVar));
4998         auto *ValHi = llvm::cast<Variable>(hiOperand(ValVar));
4999         Context.insert<InstFakeUse>(ValLo);
5000         Context.insert<InstFakeUse>(ValHi);
5001       }
5002     } else {
5003       // For xchg, the loop is slightly smaller and ebx/ecx are used.
5004       Context.insert<InstFakeUse>(T_ebx);
5005       Context.insert<InstFakeUse>(T_ecx);
5006     }
5007     // The address base (if any) is also reused in the loop.
5008     if (Variable *Base = Addr->getBase())
5009       Context.insert<InstFakeUse>(Base);
5010     auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
5011     auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
5012     _mov(DestLo, T_eax);
5013     _mov(DestHi, T_edx);
5014     return;
5015   }
5016   X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
5017   RegNumT Eax;
5018   switch (Ty) {
5019   default:
5020     llvm::report_fatal_error("Bad type for atomicRMW");
5021   case IceType_i64:
5022     Eax = Traits::getRaxOrDie();
5023     break;
5024   case IceType_i32:
5025     Eax = Traits::RegisterSet::Reg_eax;
5026     break;
5027   case IceType_i16:
5028     Eax = Traits::RegisterSet::Reg_ax;
5029     break;
5030   case IceType_i8:
5031     Eax = Traits::RegisterSet::Reg_al;
5032     break;
5033   }
5034   Variable *T_eax = makeReg(Ty, Eax);
5035   _mov(T_eax, Addr);
5036   auto *Label = Context.insert<InstX86Label>(this);
5037   // We want to pick a different register for T than Eax, so don't use
5038   // _mov(T == nullptr, T_eax).
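  // (_cmpxchg implicitly uses eax/ax/al for the compare value, so the desired
  // value T must live in a different register.)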
5039   Variable *T = makeReg(Ty);
5040   _mov(T, T_eax);
5041   (this->*Op_Lo)(T, Val);
5042   constexpr bool Locked = true;
5043   _cmpxchg(Addr, T_eax, T, Locked);
5044   _br(Traits::Cond::Br_ne, Label);
5045   // If Val is a variable, model the extended live range of Val through
5046   // the end of the loop, since it will be re-used by the loop.
5047   if (auto *ValVar = llvm::dyn_cast<Variable>(Val)) {
5048     Context.insert<InstFakeUse>(ValVar);
5049   }
5050   // The address base (if any) is also reused in the loop.
5051   if (Variable *Base = Addr->getBase())
5052     Context.insert<InstFakeUse>(Base);
5053   _mov(Dest, T_eax);
5054 }
5055 
5056 /// Lowers count {trailing, leading} zeros intrinsic.
5057 ///
5058 /// We could do constant folding here, but that should have
5059 /// been done by the front-end/middle-end optimizations.
5060 template <typename TraitsType>
5061 void TargetX86Base<TraitsType>::lowerCountZeros(bool Cttz, Type Ty,
5062                                                 Variable *Dest,
5063                                                 Operand *FirstVal,
5064                                                 Operand *SecondVal) {
5065   // TODO(jvoung): Determine if the user CPU supports LZCNT (BMI).
5066   // Then the instructions will handle the Val == 0 case much more simply
5067   // and won't require conversion from bit position to number of zeros.
5068   //
5069   // Otherwise:
5070   //   bsr IF_NOT_ZERO, Val
5071   //   mov T_DEST, ((Ty == i32) ? 63 : 127)
5072   //   cmovne T_DEST, IF_NOT_ZERO
5073   //   xor T_DEST, ((Ty == i32) ? 31 : 63)
5074   //   mov DEST, T_DEST
5075   //
5076   // NOTE: T_DEST must be a register because cmov requires its dest to be a
5077   // register. Also, bsf and bsr require their dest to be a register.
5078   //
5079   // The xor DEST, C(31|63) converts a bit position to # of leading zeroes.
5080   // E.g., for 000... 00001100, bsr will say that the most significant bit
5081   // set is at position 3, while the number of leading zeros is 28. Xor is
5082   // like (M - N) for N <= M, and converts 63 to 32, and 127 to 64 (for the
5083   // all-zeros case).
5084   //
5085   // X8632 only: Similar for 64-bit, but start w/ speculating that the upper 32
5086   // bits are all zero, and compute the result for that case (checking the
5087   // lower 32 bits). Then actually compute the result for the upper bits and
5088   // cmov in the result from the lower computation if the earlier speculation
5089   // was correct.
5090   //
  // Cttz is similar, but uses bsf instead, doesn't require the xor
  // bit-position conversion, and the speculation is reversed.
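  //
  // Illustrative Cttz case (i32): for Val = 0b1000, bsf yields 3 directly;
  // for Val == 0, bsf sets ZF and leaves its dest undefined, so the cmovne
  // keeps the speculatively loaded 32, giving cttz(0) == 32.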
5093 
5094   // TODO(jpp): refactor this method.
5095   assert(Ty == IceType_i32 || Ty == IceType_i64);
5096   const Type DestTy = Traits::Is64Bit ? Dest->getType() : IceType_i32;
5097   Variable *T = makeReg(DestTy);
5098   Operand *FirstValRM = legalize(FirstVal, Legal_Mem | Legal_Reg);
5099   if (Cttz) {
5100     _bsf(T, FirstValRM);
5101   } else {
5102     _bsr(T, FirstValRM);
5103   }
5104   Variable *T_Dest = makeReg(DestTy);
5105   Constant *_31 = Ctx->getConstantInt32(31);
5106   Constant *_32 = Ctx->getConstantInt(DestTy, 32);
5107   Constant *_63 = Ctx->getConstantInt(DestTy, 63);
5108   Constant *_64 = Ctx->getConstantInt(DestTy, 64);
5109   if (Cttz) {
5110     if (DestTy == IceType_i64) {
5111       _mov(T_Dest, _64);
5112     } else {
5113       _mov(T_Dest, _32);
5114     }
5115   } else {
5116     Constant *_127 = Ctx->getConstantInt(DestTy, 127);
5117     if (DestTy == IceType_i64) {
5118       _mov(T_Dest, _127);
5119     } else {
5120       _mov(T_Dest, _63);
5121     }
5122   }
5123   _cmov(T_Dest, T, Traits::Cond::Br_ne);
5124   if (!Cttz) {
5125     if (DestTy == IceType_i64) {
5126       // Even though there's a _63 available at this point, that constant might
5127       // not be an i32, which will cause the xor emission to fail.
5128       Constant *_63 = Ctx->getConstantInt32(63);
5129       _xor(T_Dest, _63);
5130     } else {
5131       _xor(T_Dest, _31);
5132     }
5133   }
5134   if (Traits::Is64Bit || Ty == IceType_i32) {
5135     _mov(Dest, T_Dest);
5136     return;
5137   }
5138   _add(T_Dest, _32);
5139   auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
5140   auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
5141   // Will be using "test" on this, so we need a registerized variable.
5142   Variable *SecondVar = legalizeToReg(SecondVal);
5143   Variable *T_Dest2 = makeReg(IceType_i32);
5144   if (Cttz) {
5145     _bsf(T_Dest2, SecondVar);
5146   } else {
5147     _bsr(T_Dest2, SecondVar);
5148     _xor(T_Dest2, _31);
5149   }
5150   _test(SecondVar, SecondVar);
5151   _cmov(T_Dest2, T_Dest, Traits::Cond::Br_e);
5152   _mov(DestLo, T_Dest2);
5153   _mov(DestHi, Ctx->getConstantZero(IceType_i32));
5154 }
5155 
5156 template <typename TraitsType>
5157 void TargetX86Base<TraitsType>::typedLoad(Type Ty, Variable *Dest,
5158                                           Variable *Base, Constant *Offset) {
5159   // If Offset is a ConstantRelocatable in Non-SFI mode, we will need to
5160   // legalize Mem properly.
5161   if (Offset)
5162     assert(!llvm::isa<ConstantRelocatable>(Offset));
5163 
5164   auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);
5165 
5166   if (isVectorType(Ty))
5167     _movp(Dest, Mem);
5168   else if (Ty == IceType_f64)
5169     _movq(Dest, Mem);
5170   else
5171     _mov(Dest, Mem);
5172 }
5173 
5174 template <typename TraitsType>
5175 void TargetX86Base<TraitsType>::typedStore(Type Ty, Variable *Value,
5176                                            Variable *Base, Constant *Offset) {
5177   // If Offset is a ConstantRelocatable in Non-SFI mode, we will need to
5178   // legalize Mem properly.
5179   if (Offset)
5180     assert(!llvm::isa<ConstantRelocatable>(Offset));
5181 
5182   auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);
5183 
5184   if (isVectorType(Ty))
5185     _storep(Value, Mem);
5186   else if (Ty == IceType_f64)
5187     _storeq(Value, Mem);
5188   else
5189     _store(Value, Mem);
5190 }
5191 
5192 template <typename TraitsType>
5193 void TargetX86Base<TraitsType>::copyMemory(Type Ty, Variable *Dest,
5194                                            Variable *Src, int32_t OffsetAmt) {
5195   Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
5196   // TODO(ascull): this or add nullptr test to _movp, _movq
5197   Variable *Data = makeReg(Ty);
5198 
5199   typedLoad(Ty, Data, Src, Offset);
5200   typedStore(Ty, Data, Dest, Offset);
5201 }
5202 
5203 template <typename TraitsType>
5204 void TargetX86Base<TraitsType>::lowerMemcpy(Operand *Dest, Operand *Src,
5205                                             Operand *Count) {
5206   // There is a load and store for each chunk in the unroll
5207   constexpr uint32_t BytesPerStorep = 16;
5208 
5209   // Check if the operands are constants
5210   const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
5211   const bool IsCountConst = CountConst != nullptr;
5212   const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
5213 
5214   if (shouldOptimizeMemIntrins() && IsCountConst &&
5215       CountValue <= BytesPerStorep * Traits::MEMCPY_UNROLL_LIMIT) {
5216     // Unlikely, but nothing to do if it does happen
5217     if (CountValue == 0)
5218       return;
5219 
5220     Variable *SrcBase = legalizeToReg(Src);
5221     Variable *DestBase = legalizeToReg(Dest);
5222 
5223     // Find the largest type that can be used and use it as much as possible in
5224     // reverse order. Then handle any remainder with overlapping copies. Since
5225     // the remainder will be at the end, there will be reduced pressure on the
5226     // memory unit as the accesses to the same memory are far apart.
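    // For example (illustrative): CountValue == 21 copies one 16-byte chunk
    // at offset 0, leaving 5 bytes; the leftover is widened to an 8-byte copy
    // at offset 13, which harmlessly rewrites bytes 13..15 with the same
    // data.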
5227     Type Ty = largestTypeInSize(CountValue);
5228     uint32_t TyWidth = typeWidthInBytes(Ty);
5229 
5230     uint32_t RemainingBytes = CountValue;
5231     int32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
5232     while (RemainingBytes >= TyWidth) {
5233       copyMemory(Ty, DestBase, SrcBase, Offset);
5234       RemainingBytes -= TyWidth;
5235       Offset -= TyWidth;
5236     }
5237 
5238     if (RemainingBytes == 0)
5239       return;
5240 
5241     // Lower the remaining bytes. Adjust to larger types in order to make use
5242     // of overlaps in the copies.
5243     Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes);
5244     Offset = CountValue - typeWidthInBytes(LeftOverTy);
5245     copyMemory(LeftOverTy, DestBase, SrcBase, Offset);
5246     return;
5247   }
5248 
5249   // Fall back on a function call
5250   InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memcpy, nullptr, 3);
5251   Call->addArg(Dest);
5252   Call->addArg(Src);
5253   Call->addArg(Count);
5254   lowerCall(Call);
5255 }
5256 
5257 template <typename TraitsType>
5258 void TargetX86Base<TraitsType>::lowerMemmove(Operand *Dest, Operand *Src,
5259                                              Operand *Count) {
5260   // There is a load and store for each chunk in the unroll
5261   constexpr uint32_t BytesPerStorep = 16;
5262 
5263   // Check if the operands are constants
5264   const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
5265   const bool IsCountConst = CountConst != nullptr;
5266   const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
5267 
5268   if (shouldOptimizeMemIntrins() && IsCountConst &&
5269       CountValue <= BytesPerStorep * Traits::MEMMOVE_UNROLL_LIMIT) {
5270     // Unlikely, but nothing to do if it does happen
5271     if (CountValue == 0)
5272       return;
5273 
5274     Variable *SrcBase = legalizeToReg(Src);
5275     Variable *DestBase = legalizeToReg(Dest);
5276 
5277     std::tuple<Type, Constant *, Variable *>
5278         Moves[Traits::MEMMOVE_UNROLL_LIMIT];
5279     Constant *Offset;
5280     Variable *Reg;
5281 
5282     // Copy the data into registers as the source and destination could overlap
5283     // so make sure not to clobber the memory. This also means overlapping
5284     // moves can be used as we are taking a safe snapshot of the memory.
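    // E.g. (illustrative) CountValue == 21 loads the chunks at offsets 0 and
    // 13 into registers first and only then stores both, so even if Src and
    // Dest overlap, no store can clobber a yet-unread source byte.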
5285     Type Ty = largestTypeInSize(CountValue);
5286     uint32_t TyWidth = typeWidthInBytes(Ty);
5287 
5288     uint32_t RemainingBytes = CountValue;
5289     int32_t OffsetAmt = (CountValue & ~(TyWidth - 1)) - TyWidth;
5290     size_t N = 0;
5291     while (RemainingBytes >= TyWidth) {
      assert(N < Traits::MEMMOVE_UNROLL_LIMIT);
5293       Offset = Ctx->getConstantInt32(OffsetAmt);
5294       Reg = makeReg(Ty);
5295       typedLoad(Ty, Reg, SrcBase, Offset);
5296       RemainingBytes -= TyWidth;
5297       OffsetAmt -= TyWidth;
5298       Moves[N++] = std::make_tuple(Ty, Offset, Reg);
5299     }
5300 
5301     if (RemainingBytes != 0) {
5302       // Lower the remaining bytes. Adjust to larger types in order to make use
5303       // of overlaps in the copies.
      assert(N < Traits::MEMMOVE_UNROLL_LIMIT);
5305       Ty = firstTypeThatFitsSize(RemainingBytes);
5306       Offset = Ctx->getConstantInt32(CountValue - typeWidthInBytes(Ty));
5307       Reg = makeReg(Ty);
5308       typedLoad(Ty, Reg, SrcBase, Offset);
5309       Moves[N++] = std::make_tuple(Ty, Offset, Reg);
5310     }
5311 
5312     // Copy the data out into the destination memory
5313     for (size_t i = 0; i < N; ++i) {
5314       std::tie(Ty, Offset, Reg) = Moves[i];
5315       typedStore(Ty, Reg, DestBase, Offset);
5316     }
5317 
5318     return;
5319   }
5320 
5321   // Fall back on a function call
5322   InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memmove, nullptr, 3);
5323   Call->addArg(Dest);
5324   Call->addArg(Src);
5325   Call->addArg(Count);
5326   lowerCall(Call);
5327 }
5328 
5329 template <typename TraitsType>
5330 void TargetX86Base<TraitsType>::lowerMemset(Operand *Dest, Operand *Val,
5331                                             Operand *Count) {
5332   constexpr uint32_t BytesPerStorep = 16;
5333   constexpr uint32_t BytesPerStoreq = 8;
5334   constexpr uint32_t BytesPerStorei32 = 4;
5335   assert(Val->getType() == IceType_i8);
5336 
5337   // Check if the operands are constants
5338   const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
5339   const auto *ValConst = llvm::dyn_cast<const ConstantInteger32>(Val);
5340   const bool IsCountConst = CountConst != nullptr;
5341   const bool IsValConst = ValConst != nullptr;
5342   const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
5343   const uint32_t ValValue = IsValConst ? ValConst->getValue() : 0;
5344 
5345   // Unlikely, but nothing to do if it does happen
5346   if (IsCountConst && CountValue == 0)
5347     return;
5348 
  // TODO(ascull): if the count is constant but val is not, it would be
  // possible to inline by spreading the value across 4 bytes and accessing
  // subregs, e.g. eax, ax and al.
5352   if (shouldOptimizeMemIntrins() && IsCountConst && IsValConst) {
5353     Variable *Base = nullptr;
5354     Variable *VecReg = nullptr;
5355     const uint32_t MaskValue = (ValValue & 0xff);
5356     const uint32_t SpreadValue =
5357         (MaskValue << 24) | (MaskValue << 16) | (MaskValue << 8) | MaskValue;
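    // E.g. ValValue == 0xAB gives SpreadValue == 0xABABABAB, so one 32-bit
    // store writes four memset bytes at once.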
5358 
5359     auto lowerSet = [this, &Base, SpreadValue, &VecReg](Type Ty,
5360                                                         uint32_t OffsetAmt) {
5361       assert(Base != nullptr);
5362       Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
5363 
5364       // TODO(ascull): is 64-bit better with vector or scalar movq?
5365       auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);
5366       if (isVectorType(Ty)) {
5367         assert(VecReg != nullptr);
5368         _storep(VecReg, Mem);
5369       } else if (Ty == IceType_f64) {
5370         assert(VecReg != nullptr);
5371         _storeq(VecReg, Mem);
5372       } else {
5373         assert(Ty != IceType_i64);
5374         _store(Ctx->getConstantInt(Ty, SpreadValue), Mem);
5375       }
5376     };
5377 
5378     // Find the largest type that can be used and use it as much as possible in
5379     // reverse order. Then handle any remainder with overlapping copies. Since
    // the remainder will be at the end, there will be reduced pressure on the
    // memory unit as the accesses to the same memory are far apart.
5382     Type Ty = IceType_void;
5383     if (ValValue == 0 && CountValue >= BytesPerStoreq &&
5384         CountValue <= BytesPerStorep * Traits::MEMSET_UNROLL_LIMIT) {
5385       // When the value is zero it can be loaded into a vector register cheaply
5386       // using the xor trick.
5387       Base = legalizeToReg(Dest);
5388       VecReg = makeVectorOfZeros(IceType_v16i8);
5389       Ty = largestTypeInSize(CountValue);
5390     } else if (CountValue <= BytesPerStorei32 * Traits::MEMSET_UNROLL_LIMIT) {
5391       // When the value is non-zero or the count is small we can't use vector
5392       // instructions so are limited to 32-bit stores.
5393       Base = legalizeToReg(Dest);
5394       constexpr uint32_t MaxSize = 4;
5395       Ty = largestTypeInSize(CountValue, MaxSize);
5396     }
5397 
5398     if (Base) {
5399       uint32_t TyWidth = typeWidthInBytes(Ty);
5400 
5401       uint32_t RemainingBytes = CountValue;
5402       uint32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
5403       while (RemainingBytes >= TyWidth) {
5404         lowerSet(Ty, Offset);
5405         RemainingBytes -= TyWidth;
5406         Offset -= TyWidth;
5407       }
5408 
5409       if (RemainingBytes == 0)
5410         return;
5411 
5412       // Lower the remaining bytes. Adjust to larger types in order to make use
5413       // of overlaps in the copies.
5414       Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes);
5415       Offset = CountValue - typeWidthInBytes(LeftOverTy);
5416       lowerSet(LeftOverTy, Offset);
5417       return;
5418     }
5419   }
5420 
5421   // Fall back on calling the memset function. The value operand needs to be
5422   // extended to a stack slot size because the PNaCl ABI requires arguments to
5423   // be at least 32 bits wide.
5424   Operand *ValExt;
5425   if (IsValConst) {
5426     ValExt = Ctx->getConstantInt(stackSlotType(), ValValue);
5427   } else {
5428     Variable *ValExtVar = Func->makeVariable(stackSlotType());
5429     lowerCast(InstCast::create(Func, InstCast::Zext, ValExtVar, Val));
5430     ValExt = ValExtVar;
5431   }
5432   InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memset, nullptr, 3);
5433   Call->addArg(Dest);
5434   Call->addArg(ValExt);
5435   Call->addArg(Count);
5436   lowerCall(Call);
5437 }
5438 
5439 class AddressOptimizer {
5440   AddressOptimizer() = delete;
5441   AddressOptimizer(const AddressOptimizer &) = delete;
5442   AddressOptimizer &operator=(const AddressOptimizer &) = delete;
5443 
5444 public:
5445   explicit AddressOptimizer(const Cfg *Func)
5446       : Func(Func), VMetadata(Func->getVMetadata()) {}
5447 
5448   inline void dumpAddressOpt(const ConstantRelocatable *const Relocatable,
5449                              int32_t Offset, const Variable *Base,
5450                              const Variable *Index, uint16_t Shift,
5451                              const Inst *Reason) const;
5452 
5453   inline const Inst *matchAssign(Variable **Var,
5454                                  ConstantRelocatable **Relocatable,
5455                                  int32_t *Offset);
5456 
5457   inline const Inst *matchCombinedBaseIndex(Variable **Base, Variable **Index,
5458                                             uint16_t *Shift);
5459 
5460   inline const Inst *matchShiftedIndex(Variable **Index, uint16_t *Shift);
5461 
5462   inline const Inst *matchOffsetIndexOrBase(Variable **IndexOrBase,
5463                                             const uint16_t Shift,
5464                                             ConstantRelocatable **Relocatable,
5465                                             int32_t *Offset);
5466 
5467 private:
5468   const Cfg *const Func;
5469   const VariablesMetadata *const VMetadata;
5470 
5471   static bool isAdd(const Inst *Instr) {
5472     if (auto *Arith = llvm::dyn_cast_or_null<const InstArithmetic>(Instr)) {
5473       return (Arith->getOp() == InstArithmetic::Add);
5474     }
5475     return false;
5476   }
5477 };
5478 
5479 void AddressOptimizer::dumpAddressOpt(
5480     const ConstantRelocatable *const Relocatable, int32_t Offset,
5481     const Variable *Base, const Variable *Index, uint16_t Shift,
5482     const Inst *Reason) const {
5483   if (!BuildDefs::dump())
5484     return;
5485   if (!Func->isVerbose(IceV_AddrOpt))
5486     return;
5487   OstreamLocker L(Func->getContext());
5488   Ostream &Str = Func->getContext()->getStrDump();
5489   Str << "Instruction: ";
5490   Reason->dumpDecorated(Func);
5491   Str << "  results in Base=";
5492   if (Base)
5493     Base->dump(Func);
5494   else
5495     Str << "<null>";
5496   Str << ", Index=";
5497   if (Index)
5498     Index->dump(Func);
5499   else
5500     Str << "<null>";
5501   Str << ", Shift=" << Shift << ", Offset=" << Offset
5502       << ", Relocatable=" << Relocatable << "\n";
5503 }
5504 
5505 const Inst *AddressOptimizer::matchAssign(Variable **Var,
5506                                           ConstantRelocatable **Relocatable,
5507                                           int32_t *Offset) {
5508   // Var originates from Var=SrcVar ==> set Var:=SrcVar
5509   if (*Var == nullptr)
5510     return nullptr;
5511   if (const Inst *VarAssign = VMetadata->getSingleDefinition(*Var)) {
5512     assert(!VMetadata->isMultiDef(*Var));
5513     if (llvm::isa<InstAssign>(VarAssign)) {
5514       Operand *SrcOp = VarAssign->getSrc(0);
5515       assert(SrcOp);
5516       if (auto *SrcVar = llvm::dyn_cast<Variable>(SrcOp)) {
5517         if (!VMetadata->isMultiDef(SrcVar) &&
5518             // TODO: ensure SrcVar stays single-BB
5519             true) {
5520           *Var = SrcVar;
5521           return VarAssign;
5522         }
5523       } else if (auto *Const = llvm::dyn_cast<ConstantInteger32>(SrcOp)) {
5524         int32_t MoreOffset = Const->getValue();
5525         if (Utils::WouldOverflowAdd(*Offset, MoreOffset))
5526           return nullptr;
5527         *Var = nullptr;
5528         *Offset += MoreOffset;
5529         return VarAssign;
5530       } else if (auto *AddReloc = llvm::dyn_cast<ConstantRelocatable>(SrcOp)) {
5531         if (*Relocatable == nullptr) {
5532           // It is always safe to fold a relocatable through assignment -- the
5533           // assignment frees a slot in the address operand that can be used to
5534           // hold the Sandbox Pointer -- if any.
5535           *Var = nullptr;
5536           *Relocatable = AddReloc;
5537           return VarAssign;
5538         }
5539       }
5540     }
5541   }
5542   return nullptr;
5543 }
5544 
5545 const Inst *AddressOptimizer::matchCombinedBaseIndex(Variable **Base,
5546                                                      Variable **Index,
5547                                                      uint16_t *Shift) {
5548   // Index==nullptr && Base is Base=Var1+Var2 ==>
5549   //   set Base=Var1, Index=Var2, Shift=0
5550   if (*Base == nullptr)
5551     return nullptr;
5552   if (*Index != nullptr)
5553     return nullptr;
5554   auto *BaseInst = VMetadata->getSingleDefinition(*Base);
5555   if (BaseInst == nullptr)
5556     return nullptr;
5557   assert(!VMetadata->isMultiDef(*Base));
5558   if (BaseInst->getSrcSize() < 2)
5559     return nullptr;
5560   if (auto *Var1 = llvm::dyn_cast<Variable>(BaseInst->getSrc(0))) {
5561     if (VMetadata->isMultiDef(Var1))
5562       return nullptr;
5563     if (auto *Var2 = llvm::dyn_cast<Variable>(BaseInst->getSrc(1))) {
5564       if (VMetadata->isMultiDef(Var2))
5565         return nullptr;
5566       if (isAdd(BaseInst) &&
5567           // TODO: ensure Var1 and Var2 stay single-BB
5568           true) {
5569         *Base = Var1;
5570         *Index = Var2;
5571         *Shift = 0; // should already have been 0
5572         return BaseInst;
5573       }
5574     }
5575   }
5576   return nullptr;
5577 }
5578 
5579 const Inst *AddressOptimizer::matchShiftedIndex(Variable **Index,
5580                                                 uint16_t *Shift) {
5581   // Index is Index=Var*Const && log2(Const)+Shift<=3 ==>
5582   //   Index=Var, Shift+=log2(Const)
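  // E.g. Index = Var*4 with Shift == 1 becomes Index = Var, Shift == 3,
  // i.e. the maximum x86 addressing scale of 8.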
5583   if (*Index == nullptr)
5584     return nullptr;
5585   auto *IndexInst = VMetadata->getSingleDefinition(*Index);
5586   if (IndexInst == nullptr)
5587     return nullptr;
5588   assert(!VMetadata->isMultiDef(*Index));
5589 
5590   // When using an unsigned 32-bit array index on x64, it gets zero-extended
5591   // before the shift & add. The explicit zero extension can be eliminated
5592   // because x86 32-bit operations automatically get zero-extended into the
5593   // corresponding 64-bit register.
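  // (E.g. "mov eax, ebx" on x86-64 already clears bits 63..32 of rax, so no
  // separate zero-extension instruction is needed for the index.)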
5594   if (auto *CastInst = llvm::dyn_cast<InstCast>(IndexInst)) {
5595     if (CastInst->getCastKind() == InstCast::Zext) {
5596       if (auto *Var = llvm::dyn_cast<Variable>(CastInst->getSrc(0))) {
5597         if (Var->getType() == IceType_i32 &&
5598             CastInst->getDest()->getType() == IceType_i64) {
5599           IndexInst = VMetadata->getSingleDefinition(Var);
5600         }
5601       }
5602     }
5603   }
5604 
5605   if (IndexInst->getSrcSize() < 2)
5606     return nullptr;
5607   if (auto *ArithInst = llvm::dyn_cast<InstArithmetic>(IndexInst)) {
5608     if (auto *Var = llvm::dyn_cast<Variable>(ArithInst->getSrc(0))) {
5609       if (auto *Const =
5610               llvm::dyn_cast<ConstantInteger32>(ArithInst->getSrc(1))) {
5611         if (VMetadata->isMultiDef(Var) || Const->getType() != IceType_i32)
5612           return nullptr;
5613         switch (ArithInst->getOp()) {
5614         default:
5615           return nullptr;
5616         case InstArithmetic::Mul: {
5617           uint32_t Mult = Const->getValue();
5618           uint32_t LogMult;
5619           switch (Mult) {
5620           case 1:
5621             LogMult = 0;
5622             break;
5623           case 2:
5624             LogMult = 1;
5625             break;
5626           case 4:
5627             LogMult = 2;
5628             break;
5629           case 8:
5630             LogMult = 3;
5631             break;
5632           default:
5633             return nullptr;
5634           }
5635           if (*Shift + LogMult <= 3) {
5636             *Index = Var;
5637             *Shift += LogMult;
5638             return IndexInst;
          }
          // Avoid falling through into the Shl case, where Mult would be
          // misinterpreted as a shift amount.
          break;
        }
        case InstArithmetic::Shl: {
5642           uint32_t ShiftAmount = Const->getValue();
5643           switch (ShiftAmount) {
5644           case 0:
5645           case 1:
5646           case 2:
5647           case 3:
5648             break;
5649           default:
5650             return nullptr;
5651           }
5652           if (*Shift + ShiftAmount <= 3) {
5653             *Index = Var;
5654             *Shift += ShiftAmount;
5655             return IndexInst;
5656           }
5657         }
5658         }
5659       }
5660     }
5661   }
5662   return nullptr;
5663 }
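
// An illustrative sketch of the rewrite above: for
//   t = v * 4   (or equivalently t = v << 2)
//   ... [base + t*1] ...
// the operand becomes Index=v, Shift=2, i.e. [base + v*4], provided the
// accumulated shift stays within the x86 maximum of 3 (scale factor 8).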
5664 
5665 const Inst *AddressOptimizer::matchOffsetIndexOrBase(
5666     Variable **IndexOrBase, const uint16_t Shift,
5667     ConstantRelocatable **Relocatable, int32_t *Offset) {
5668   // Base is Base=Var+Const || Base is Base=Const+Var ==>
5669   //   set Base=Var, Offset+=Const
5670   // Base is Base=Var-Const ==>
5671   //   set Base=Var, Offset-=Const
5672   // Index is Index=Var+Const ==>
5673   //   set Index=Var, Offset+=(Const<<Shift)
5674   // Index is Index=Const+Var ==>
5675   //   set Index=Var, Offset+=(Const<<Shift)
5676   // Index is Index=Var-Const ==>
5677   //   set Index=Var, Offset-=(Const<<Shift)
  // Treat Index=Var Or Const as Index=Var + Const
  //    when Var = Var' << N and Const < 2^N
  // or when Var = Var' * 2^N and Const < 2^N,
  // i.e. whenever the Or cannot carry into the nonzero bits of Var.
5681 
5682   if (*IndexOrBase == nullptr) {
5683     return nullptr;
5684   }
5685   const Inst *Definition = VMetadata->getSingleDefinition(*IndexOrBase);
5686   if (Definition == nullptr) {
5687     return nullptr;
5688   }
5689   assert(!VMetadata->isMultiDef(*IndexOrBase));
5690   if (auto *ArithInst = llvm::dyn_cast<const InstArithmetic>(Definition)) {
5691     switch (ArithInst->getOp()) {
5692     case InstArithmetic::Add:
5693     case InstArithmetic::Sub:
5694     case InstArithmetic::Or:
5695       break;
5696     default:
5697       return nullptr;
5698     }
5699 
5700     Operand *Src0 = ArithInst->getSrc(0);
5701     Operand *Src1 = ArithInst->getSrc(1);
5702     auto *Var0 = llvm::dyn_cast<Variable>(Src0);
5703     auto *Var1 = llvm::dyn_cast<Variable>(Src1);
5704     auto *Const0 = llvm::dyn_cast<ConstantInteger32>(Src0);
5705     auto *Const1 = llvm::dyn_cast<ConstantInteger32>(Src1);
5706     auto *Reloc0 = llvm::dyn_cast<ConstantRelocatable>(Src0);
5707     auto *Reloc1 = llvm::dyn_cast<ConstantRelocatable>(Src1);
5708 
5709     bool IsAdd = false;
5710     if (ArithInst->getOp() == InstArithmetic::Or) {
5711       Variable *Var = nullptr;
5712       ConstantInteger32 *Const = nullptr;
5713       if (Var0 && Const1) {
5714         Var = Var0;
5715         Const = Const1;
5716       } else if (Const0 && Var1) {
5717         Var = Var1;
5718         Const = Const0;
5719       } else {
5720         return nullptr;
5721       }
5722       auto *VarDef =
5723           llvm::dyn_cast<InstArithmetic>(VMetadata->getSingleDefinition(Var));
5724       if (VarDef == nullptr)
5725         return nullptr;
5726 
5727       SizeT ZeroesAvailable = 0;
5728       if (VarDef->getOp() == InstArithmetic::Shl) {
5729         if (auto *ConstInt =
5730                 llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(1))) {
5731           ZeroesAvailable = ConstInt->getValue();
5732         }
5733       } else if (VarDef->getOp() == InstArithmetic::Mul) {
5734         SizeT PowerOfTwo = 0;
5735         if (auto *MultConst =
5736                 llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(0))) {
5737           if (llvm::isPowerOf2_32(MultConst->getValue())) {
5738             PowerOfTwo += MultConst->getValue();
5739           }
5740         }
5741         if (auto *MultConst =
5742                 llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(1))) {
5743           if (llvm::isPowerOf2_32(MultConst->getValue())) {
5744             PowerOfTwo += MultConst->getValue();
5745           }
5746         }
        // A multiply by 2^N guarantees N low zero bits, the same as a shift
        // left by N. If neither operand was a power-of-2 constant
        // (PowerOfTwo == 0), leave ZeroesAvailable at 0 so the check below
        // rejects the fold.
        if (PowerOfTwo != 0)
          ZeroesAvailable = llvm::Log2_32(PowerOfTwo);
5748       }
5749       SizeT ZeroesNeeded = llvm::Log2_32(Const->getValue()) + 1;
5750       if (ZeroesNeeded == 0 || ZeroesNeeded > ZeroesAvailable)
5751         return nullptr;
5752       IsAdd = true; // treat it as an add if the above conditions hold
5753     } else {
5754       IsAdd = ArithInst->getOp() == InstArithmetic::Add;
5755     }
5756 
5757     Variable *NewIndexOrBase = nullptr;
5758     int32_t NewOffset = 0;
5759     ConstantRelocatable *NewRelocatable = *Relocatable;
5760     if (Var0 && Var1)
5761       // TODO(sehr): merge base/index splitting into here.
5762       return nullptr;
5763     if (!IsAdd && Var1)
5764       return nullptr;
5765     if (Var0)
5766       NewIndexOrBase = Var0;
5767     else if (Var1)
5768       NewIndexOrBase = Var1;
5769     // Don't know how to add/subtract two relocatables.
5770     if ((*Relocatable && (Reloc0 || Reloc1)) || (Reloc0 && Reloc1))
5771       return nullptr;
5772     // Don't know how to subtract a relocatable.
5773     if (!IsAdd && Reloc1)
5774       return nullptr;
5775     // Incorporate ConstantRelocatables.
5776     if (Reloc0)
5777       NewRelocatable = Reloc0;
5778     else if (Reloc1)
5779       NewRelocatable = Reloc1;
5780     // Compute the updated constant offset.
5781     if (Const0) {
5782       const int32_t MoreOffset =
5783           IsAdd ? Const0->getValue() : -Const0->getValue();
5784       if (Utils::WouldOverflowAdd(*Offset + NewOffset, MoreOffset))
5785         return nullptr;
5786       NewOffset += MoreOffset;
5787     }
5788     if (Const1) {
5789       const int32_t MoreOffset =
5790           IsAdd ? Const1->getValue() : -Const1->getValue();
5791       if (Utils::WouldOverflowAdd(*Offset + NewOffset, MoreOffset))
5792         return nullptr;
5793       NewOffset += MoreOffset;
5794     }
5795     if (Utils::WouldOverflowAdd(*Offset, NewOffset << Shift))
5796       return nullptr;
5797     *IndexOrBase = NewIndexOrBase;
5798     *Offset += (NewOffset << Shift);
5799     // Shift is always zero if this is called with the base
5800     *Relocatable = NewRelocatable;
5801     return Definition;
5802   }
5803   return nullptr;
5804 }
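
// Illustrative sketches of the rewrites above (not emitted code):
//   t = v + 12;  [t]  ==>  [v + 12]
//   t = v - 8;   [t]  ==>  [v - 8]
//   t = v | 3, where v = w << 2:  the Or cannot carry into v's nonzero
//   bits, so it is treated as t = v + 3 and folded to [v + 3].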
5805 
5806 template <typename TypeTraits>
5807 typename TargetX86Base<TypeTraits>::X86OperandMem *
5808 TargetX86Base<TypeTraits>::computeAddressOpt(const Inst *Instr, Type MemType,
5809                                              Operand *Addr) {
5810   Func->resetCurrentNode();
5811   if (Func->isVerbose(IceV_AddrOpt)) {
5812     OstreamLocker L(Func->getContext());
5813     Ostream &Str = Func->getContext()->getStrDump();
5814     Str << "\nStarting computeAddressOpt for instruction:\n  ";
5815     Instr->dumpDecorated(Func);
5816   }
5817 
5818   OptAddr NewAddr;
5819   NewAddr.Base = llvm::dyn_cast<Variable>(Addr);
5820   if (NewAddr.Base == nullptr)
5821     return nullptr;
5822 
5823   // If the Base has more than one use or is live across multiple blocks, then
5824   // don't go further. Alternatively (?), never consider a transformation that
5825   // would change a variable that is currently *not* live across basic block
5826   // boundaries into one that *is*.
5827   if (!getFlags().getLoopInvariantCodeMotion()) {
    // Multi-block address optimization is needed when LICM is enabled; it
    // might make sense to restrict it to the current node and the loop
    // header.
5830     if (Func->getVMetadata()->isMultiBlock(
5831             NewAddr.Base) /* || Base->getUseCount() > 1*/)
5832       return nullptr;
5833   }
5834   AddressOptimizer AddrOpt(Func);
5835   const bool MockBounds = getFlags().getMockBoundsCheck();
5836   const Inst *Reason = nullptr;
5837   bool AddressWasOptimized = false;
  // The following unnamed struct identifies the address mode formation steps
  // that could potentially create an invalid memory operand (i.e., no free
  // slots for RebasePtr). We add all those variables to this struct so that
  // we can use memset() to reset all members to false.
5842   struct {
5843     bool AssignBase = false;
5844     bool AssignIndex = false;
5845     bool OffsetFromBase = false;
5846     bool OffsetFromIndex = false;
5847     bool CombinedBaseIndex = false;
5848   } Skip;
5849   // This points to the boolean in Skip that represents the last folding
5850   // performed. This is used to disable a pattern match that generated an
5851   // invalid address. Without this, the algorithm would never finish.
5852   bool *SkipLastFolding = nullptr;
5853   // NewAddrCheckpoint is used to rollback the address being formed in case an
5854   // invalid address is formed.
5855   OptAddr NewAddrCheckpoint;
5856   Reason = Instr;
5857   do {
5858     if (SandboxingType != ST_None) {
5859       // When sandboxing, we defer the sandboxing of NewAddr to the Concrete
5860       // Target. If our optimization was overly aggressive, then we simply undo
5861       // what the previous iteration did, and set the previous pattern's skip
5862       // bit to true.
5863       if (!legalizeOptAddrForSandbox(&NewAddr)) {
5864         *SkipLastFolding = true;
5865         SkipLastFolding = nullptr;
5866         NewAddr = NewAddrCheckpoint;
5867         Reason = nullptr;
5868       }
5869     }
5870 
5871     if (Reason) {
5872       AddrOpt.dumpAddressOpt(NewAddr.Relocatable, NewAddr.Offset, NewAddr.Base,
5873                              NewAddr.Index, NewAddr.Shift, Reason);
5874       AddressWasOptimized = true;
5875       Reason = nullptr;
5876       SkipLastFolding = nullptr;
5877       memset(reinterpret_cast<void *>(&Skip), 0, sizeof(Skip));
5878     }
5879 
5880     NewAddrCheckpoint = NewAddr;
5881 
5882     // Update Base and Index to follow through assignments to definitions.
5883     if (!Skip.AssignBase &&
5884         (Reason = AddrOpt.matchAssign(&NewAddr.Base, &NewAddr.Relocatable,
5885                                       &NewAddr.Offset))) {
5886       SkipLastFolding = &Skip.AssignBase;
      // Assignments of Base from a Relocatable or ConstantInteger32 can
      // result in Base becoming nullptr. To avoid code duplication in this
      // loop we prefer that Base be non-nullptr if possible.
5890       if ((NewAddr.Base == nullptr) && (NewAddr.Index != nullptr) &&
5891           NewAddr.Shift == 0) {
5892         std::swap(NewAddr.Base, NewAddr.Index);
5893       }
5894       continue;
5895     }
    if (!Skip.AssignIndex &&
5897         (Reason = AddrOpt.matchAssign(&NewAddr.Index, &NewAddr.Relocatable,
5898                                       &NewAddr.Offset))) {
5899       SkipLastFolding = &Skip.AssignIndex;
5900       continue;
5901     }
5902 
5903     if (!MockBounds) {
5904       // Transition from:
5905       //   <Relocatable + Offset>(Base) to
5906       //   <Relocatable + Offset>(Base, Index)
5907       if (!Skip.CombinedBaseIndex &&
5908           (Reason = AddrOpt.matchCombinedBaseIndex(
5909                &NewAddr.Base, &NewAddr.Index, &NewAddr.Shift))) {
5910         SkipLastFolding = &Skip.CombinedBaseIndex;
5911         continue;
5912       }
5913 
5914       // Recognize multiply/shift and update Shift amount.
5915       // Index becomes Index=Var<<Const && Const+Shift<=3 ==>
5916       //   Index=Var, Shift+=Const
5917       // Index becomes Index=Const*Var && log2(Const)+Shift<=3 ==>
5918       //   Index=Var, Shift+=log2(Const)
5919       if ((Reason =
5920                AddrOpt.matchShiftedIndex(&NewAddr.Index, &NewAddr.Shift))) {
5921         continue;
5922       }
5923 
5924       // If Shift is zero, the choice of Base and Index was purely arbitrary.
5925       // Recognize multiply/shift and set Shift amount.
5926       // Shift==0 && Base is Base=Var*Const && log2(Const)+Shift<=3 ==>
5927       //   swap(Index,Base)
5928       // Similar for Base=Const*Var and Base=Var<<Const
5929       if (NewAddr.Shift == 0 &&
5930           (Reason = AddrOpt.matchShiftedIndex(&NewAddr.Base, &NewAddr.Shift))) {
5931         std::swap(NewAddr.Base, NewAddr.Index);
5932         continue;
5933       }
5934     }
5935 
5936     // Update Offset to reflect additions/subtractions with constants and
5937     // relocatables.
5938     // TODO: consider overflow issues with respect to Offset.
5939     if (!Skip.OffsetFromBase && (Reason = AddrOpt.matchOffsetIndexOrBase(
5940                                      &NewAddr.Base, /*Shift =*/0,
5941                                      &NewAddr.Relocatable, &NewAddr.Offset))) {
5942       SkipLastFolding = &Skip.OffsetFromBase;
5943       continue;
5944     }
5945     if (!Skip.OffsetFromIndex && (Reason = AddrOpt.matchOffsetIndexOrBase(
5946                                       &NewAddr.Index, NewAddr.Shift,
5947                                       &NewAddr.Relocatable, &NewAddr.Offset))) {
5948       SkipLastFolding = &Skip.OffsetFromIndex;
5949       continue;
5950     }
5951 
5952     break;
5953   } while (Reason);
5954 
5955   if (!AddressWasOptimized) {
5956     return nullptr;
5957   }
5958 
5959   // Undo any addition of RebasePtr.  It will be added back when the mem
5960   // operand is sandboxed.
5961   if (NewAddr.Base == RebasePtr) {
5962     NewAddr.Base = nullptr;
5963   }
5964 
5965   if (NewAddr.Index == RebasePtr) {
5966     NewAddr.Index = nullptr;
5967     NewAddr.Shift = 0;
5968   }
5969 
5970   Constant *OffsetOp = nullptr;
5971   if (NewAddr.Relocatable == nullptr) {
5972     OffsetOp = Ctx->getConstantInt32(NewAddr.Offset);
5973   } else {
5974     OffsetOp =
5975         Ctx->getConstantSym(NewAddr.Relocatable->getOffset() + NewAddr.Offset,
5976                             NewAddr.Relocatable->getName());
5977   }
5978   // Vanilla ICE load instructions should not use the segment registers, and
5979   // computeAddressOpt only works at the level of Variables and Constants, not
5980   // other X86OperandMem, so there should be no mention of segment
5981   // registers there either.
5982   static constexpr auto SegmentReg =
5983       X86OperandMem::SegmentRegisters::DefaultSegment;
5984 
5985   return X86OperandMem::create(Func, MemType, NewAddr.Base, OffsetOp,
5986                                NewAddr.Index, NewAddr.Shift, SegmentReg);
5987 }
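
// A sketch of the overall effect of computeAddressOpt (illustrative only):
// for the sequence
//   a = x + 8
//   i = y * 4
//   t = a + i
//   load [t]
// the loop above applies matchCombinedBaseIndex (Base=a, Index=i), then
// matchShiftedIndex (Index=y, Shift=2), then matchOffsetIndexOrBase
// (Base=x, Offset=8), producing the single operand [x + y*4 + 8].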
5988 
5989 /// Add a mock bounds check on the memory address before using it as a load or
5990 /// store operand.  The basic idea is that given a memory operand [reg], we
5991 /// would first add bounds-check code something like:
5992 ///
5993 ///   cmp reg, <lb>
5994 ///   jl out_of_line_error
5995 ///   cmp reg, <ub>
5996 ///   jg out_of_line_error
5997 ///
5998 /// In reality, the specific code will depend on how <lb> and <ub> are
5999 /// represented, e.g. an immediate, a global, or a function argument.
6000 ///
6001 /// As such, we need to enforce that the memory operand does not have the form
6002 /// [reg1+reg2], because then there is no simple cmp instruction that would
6003 /// suffice.  However, we consider [reg+offset] to be OK because the offset is
6004 /// usually small, and so <ub> could have a safety buffer built in and then we
6005 /// could instead branch to a custom out_of_line_error that does the precise
6006 /// check and jumps back if it turns out OK.
6007 ///
6008 /// For the purpose of mocking the bounds check, we'll do something like this:
6009 ///
6010 ///   cmp reg, 0
6011 ///   je label
6012 ///   cmp reg, 1
6013 ///   je label
6014 ///   label:
6015 ///
6016 /// Also note that we don't need to add a bounds check to a dereference of a
6017 /// simple global variable address.
6018 template <typename TraitsType>
6019 void TargetX86Base<TraitsType>::doMockBoundsCheck(Operand *Opnd) {
6020   if (!getFlags().getMockBoundsCheck())
6021     return;
6022   if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Opnd)) {
6023     if (Mem->getIndex()) {
6024       llvm::report_fatal_error("doMockBoundsCheck: Opnd contains index reg");
6025     }
6026     Opnd = Mem->getBase();
6027   }
  // At this point Opnd could be nullptr, or a Variable, or a Constant, or
  // perhaps something else. We only care if it is a Variable.
6030   auto *Var = llvm::dyn_cast_or_null<Variable>(Opnd);
6031   if (Var == nullptr)
6032     return;
6033   // We use lowerStore() to copy out-args onto the stack.  This creates a memory
6034   // operand with the stack pointer as the base register.  Don't do bounds
6035   // checks on that.
6036   if (Var->getRegNum() == getStackReg())
6037     return;
6038 
6039   auto *Label = InstX86Label::create(Func, this);
6040   _cmp(Opnd, Ctx->getConstantZero(IceType_i32));
6041   _br(Traits::Cond::Br_e, Label);
6042   _cmp(Opnd, Ctx->getConstantInt32(1));
6043   _br(Traits::Cond::Br_e, Label);
6044   Context.insert(Label);
6045 }
6046 
6047 template <typename TraitsType>
6048 void TargetX86Base<TraitsType>::lowerLoad(const InstLoad *Load) {
6049   // A Load instruction can be treated the same as an Assign instruction, after
6050   // the source operand is transformed into an X86OperandMem operand.  Note that
6051   // the address mode optimization already creates an X86OperandMem operand, so
6052   // it doesn't need another level of transformation.
6053   Variable *DestLoad = Load->getDest();
6054   Type Ty = DestLoad->getType();
6055   Operand *Src0 = formMemoryOperand(Load->getLoadAddress(), Ty);
6056   doMockBoundsCheck(Src0);
6057   auto *Assign = InstAssign::create(Func, DestLoad, Src0);
6058   lowerAssign(Assign);
6059 }
6060 
6061 template <typename TraitsType>
6062 void TargetX86Base<TraitsType>::doAddressOptOther() {
  // Inverts some Icmp instructions, which helps doAddressOptLoad later.
6064   // TODO(manasijm): Refactor to unify the conditions for Var0 and Var1
6065   Inst *Instr = iteratorToInst(Context.getCur());
6066   auto *VMetadata = Func->getVMetadata();
6067   if (auto *Icmp = llvm::dyn_cast<InstIcmp>(Instr)) {
6068     if (llvm::isa<Constant>(Icmp->getSrc(0)) ||
6069         llvm::isa<Constant>(Icmp->getSrc(1)))
6070       return;
6071     auto *Var0 = llvm::dyn_cast<Variable>(Icmp->getSrc(0));
6072     if (Var0 == nullptr)
6073       return;
6074     if (!VMetadata->isTracked(Var0))
6075       return;
6076     auto *Op0Def = VMetadata->getFirstDefinitionSingleBlock(Var0);
6077     if (Op0Def == nullptr || !llvm::isa<InstLoad>(Op0Def))
6078       return;
6079     if (VMetadata->getLocalUseNode(Var0) != Context.getNode())
6080       return;
6081 
6082     auto *Var1 = llvm::dyn_cast<Variable>(Icmp->getSrc(1));
6083     if (Var1 != nullptr && VMetadata->isTracked(Var1)) {
6084       auto *Op1Def = VMetadata->getFirstDefinitionSingleBlock(Var1);
6085       if (Op1Def != nullptr && !VMetadata->isMultiBlock(Var1) &&
6086           llvm::isa<InstLoad>(Op1Def)) {
6087         return; // Both are loads
6088       }
6089     }
6090     Icmp->reverseConditionAndOperands();
6091   }
6092 }
6093 
6094 template <typename TraitsType>
6095 void TargetX86Base<TraitsType>::doAddressOptLoad() {
6096   Inst *Instr = iteratorToInst(Context.getCur());
6097   Operand *Addr = Instr->getSrc(0);
6098   Variable *Dest = Instr->getDest();
6099   if (auto *OptAddr = computeAddressOpt(Instr, Dest->getType(), Addr)) {
6100     Instr->setDeleted();
6101     Context.insert<InstLoad>(Dest, OptAddr);
6102   }
6103 }
6104 
6105 template <typename TraitsType>
6106 void TargetX86Base<TraitsType>::doAddressOptLoadSubVector() {
6107   auto *Intrinsic = llvm::cast<InstIntrinsic>(Context.getCur());
6108   Operand *Addr = Intrinsic->getArg(0);
6109   Variable *Dest = Intrinsic->getDest();
6110   if (auto *OptAddr = computeAddressOpt(Intrinsic, Dest->getType(), Addr)) {
6111     Intrinsic->setDeleted();
6112     const Ice::Intrinsics::IntrinsicInfo Info = {
6113         Ice::Intrinsics::LoadSubVector, Ice::Intrinsics::SideEffects_F,
6114         Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
6115     auto *NewLoad = Context.insert<InstIntrinsic>(2, Dest, Info);
6116     NewLoad->addArg(OptAddr);
6117     NewLoad->addArg(Intrinsic->getArg(1));
6118   }
6119 }
6120 
6121 template <typename TraitsType>
6122 void TargetX86Base<TraitsType>::lowerPhi(const InstPhi * /*Instr*/) {
6123   Func->setError("Phi found in regular instruction list");
6124 }
6125 
6126 template <typename TraitsType>
6127 void TargetX86Base<TraitsType>::lowerRet(const InstRet *Instr) {
6128   Variable *Reg = nullptr;
6129   if (Instr->hasRetValue()) {
6130     Operand *RetValue = legalize(Instr->getRetValue());
6131     const Type ReturnType = RetValue->getType();
6132     assert(isVectorType(ReturnType) || isScalarFloatingType(ReturnType) ||
6133            (ReturnType == IceType_i32) || (ReturnType == IceType_i64));
6134     Reg = moveReturnValueToRegister(RetValue, ReturnType);
6135   }
6136   // Add a ret instruction even if sandboxing is enabled, because addEpilog
6137   // explicitly looks for a ret instruction as a marker for where to insert the
6138   // frame removal instructions.
6139   _ret(Reg);
6140   // Add a fake use of esp to make sure esp stays alive for the entire
6141   // function. Otherwise post-call esp adjustments get dead-code eliminated.
6142   keepEspLiveAtExit();
6143 }
6144 
6145 inline uint32_t makePshufdMask(SizeT Index0, SizeT Index1, SizeT Index2,
6146                                SizeT Index3) {
6147   const SizeT Mask = (Index0 & 0x3) | ((Index1 & 0x3) << 2) |
6148                      ((Index2 & 0x3) << 4) | ((Index3 & 0x3) << 6);
6149   assert(Mask < 256);
6150   return Mask;
6151 }
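
// For example, makePshufdMask(0, 1, 2, 3) yields 0xE4 (binary 11'10'01'00),
// the identity shuffle, while makePshufdMask(0, 0, 0, 0) yields 0x00, a
// broadcast of element 0.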
6152 
6153 template <typename TraitsType>
6154 Variable *TargetX86Base<TraitsType>::lowerShuffleVector_AllFromSameSrc(
6155     Operand *Src, SizeT Index0, SizeT Index1, SizeT Index2, SizeT Index3) {
6156   constexpr SizeT SrcBit = 1 << 2;
6157   assert((Index0 & SrcBit) == (Index1 & SrcBit));
6158   assert((Index0 & SrcBit) == (Index2 & SrcBit));
6159   assert((Index0 & SrcBit) == (Index3 & SrcBit));
6160   (void)SrcBit;
6161 
6162   const Type SrcTy = Src->getType();
6163   auto *T = makeReg(SrcTy);
6164   auto *SrcRM = legalize(Src, Legal_Reg | Legal_Mem);
6165   auto *Mask =
6166       Ctx->getConstantInt32(makePshufdMask(Index0, Index1, Index2, Index3));
6167   _pshufd(T, SrcRM, Mask);
6168   return T;
6169 }
6170 
6171 template <typename TraitsType>
6172 Variable *TargetX86Base<TraitsType>::lowerShuffleVector_TwoFromSameSrc(
6173     Operand *Src0, SizeT Index0, SizeT Index1, Operand *Src1, SizeT Index2,
6174     SizeT Index3) {
6175   constexpr SizeT SrcBit = 1 << 2;
6176   assert((Index0 & SrcBit) == (Index1 & SrcBit) || (Index1 == IGNORE_INDEX));
6177   assert((Index2 & SrcBit) == (Index3 & SrcBit) || (Index3 == IGNORE_INDEX));
6178   (void)SrcBit;
6179 
6180   const Type SrcTy = Src0->getType();
6181   assert(Src1->getType() == SrcTy);
6182   auto *T = makeReg(SrcTy);
6183   auto *Src0R = legalizeToReg(Src0);
6184   auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
6185   auto *Mask =
6186       Ctx->getConstantInt32(makePshufdMask(Index0, Index1, Index2, Index3));
6187   _movp(T, Src0R);
6188   _shufps(T, Src1RM, Mask);
6189   return T;
6190 }
6191 
6192 template <typename TraitsType>
6193 Variable *TargetX86Base<TraitsType>::lowerShuffleVector_UnifyFromDifferentSrcs(
6194     Operand *Src0, SizeT Index0, Operand *Src1, SizeT Index1) {
6195   return lowerShuffleVector_TwoFromSameSrc(Src0, Index0, IGNORE_INDEX, Src1,
6196                                            Index1, IGNORE_INDEX);
6197 }
6198 
6199 inline SizeT makeSrcSwitchMask(SizeT Index0, SizeT Index1, SizeT Index2,
6200                                SizeT Index3) {
6201   constexpr SizeT SrcBit = 1 << 2;
6202   const SizeT Index0Bits = ((Index0 & SrcBit) == 0) ? 0 : (1 << 0);
6203   const SizeT Index1Bits = ((Index1 & SrcBit) == 0) ? 0 : (1 << 1);
6204   const SizeT Index2Bits = ((Index2 & SrcBit) == 0) ? 0 : (1 << 2);
6205   const SizeT Index3Bits = ((Index3 & SrcBit) == 0) ? 0 : (1 << 3);
6206   return Index0Bits | Index1Bits | Index2Bits | Index3Bits;
6207 }
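
// For example, for a v4 shuffle with indexes (0, 1, 4, 5), elements 2 and 3
// come from Src1 (their SrcBit is set), so the mask is 0b1100, selecting the
// CASE_SRCS_IN(0, 0, 1, 1) arm in lowerShuffleVector below.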
6208 
6209 template <typename TraitsType>
6210 GlobalString TargetX86Base<TraitsType>::lowerShuffleVector_NewMaskName() {
6211   GlobalString FuncName = Func->getFunctionName();
6212   const SizeT Id = PshufbMaskCount++;
6213   if (!BuildDefs::dump() || !FuncName.hasStdString()) {
6214     return GlobalString::createWithString(
6215         Ctx,
6216         "$PS" + std::to_string(FuncName.getID()) + "_" + std::to_string(Id));
6217   }
6218   return GlobalString::createWithString(
6219       Ctx, "Pshufb$" + Func->getFunctionName() + "$" + std::to_string(Id));
6220 }
6221 
6222 template <typename TraitsType>
6223 ConstantRelocatable *
6224 TargetX86Base<TraitsType>::lowerShuffleVector_CreatePshufbMask(
6225     int8_t Idx0, int8_t Idx1, int8_t Idx2, int8_t Idx3, int8_t Idx4,
6226     int8_t Idx5, int8_t Idx6, int8_t Idx7, int8_t Idx8, int8_t Idx9,
6227     int8_t Idx10, int8_t Idx11, int8_t Idx12, int8_t Idx13, int8_t Idx14,
6228     int8_t Idx15) {
6229   static constexpr uint8_t NumElements = 16;
6230   const char Initializer[NumElements] = {
6231       Idx0, Idx1, Idx2,  Idx3,  Idx4,  Idx5,  Idx6,  Idx7,
6232       Idx8, Idx9, Idx10, Idx11, Idx12, Idx13, Idx14, Idx15,
6233   };
6234 
6235   static constexpr Type V4VectorType = IceType_v4i32;
6236   const uint32_t MaskAlignment = typeWidthInBytesOnStack(V4VectorType);
6237   auto *Mask = VariableDeclaration::create(Func->getGlobalPool());
6238   GlobalString MaskName = lowerShuffleVector_NewMaskName();
6239   Mask->setIsConstant(true);
6240   Mask->addInitializer(VariableDeclaration::DataInitializer::create(
6241       Func->getGlobalPool(), Initializer, NumElements));
6242   Mask->setName(MaskName);
6243   // Mask needs to be 16-byte aligned, or pshufb will seg fault.
6244   Mask->setAlignment(MaskAlignment);
6245   Func->addGlobal(Mask);
6246 
6247   constexpr RelocOffsetT Offset = 0;
6248   return llvm::cast<ConstantRelocatable>(Ctx->getConstantSym(Offset, MaskName));
6249 }
6250 
6251 template <typename TraitsType>
6252 void TargetX86Base<TraitsType>::lowerShuffleVector_UsingPshufb(
6253     Variable *Dest, Operand *Src0, Operand *Src1, int8_t Idx0, int8_t Idx1,
6254     int8_t Idx2, int8_t Idx3, int8_t Idx4, int8_t Idx5, int8_t Idx6,
6255     int8_t Idx7, int8_t Idx8, int8_t Idx9, int8_t Idx10, int8_t Idx11,
6256     int8_t Idx12, int8_t Idx13, int8_t Idx14, int8_t Idx15) {
6257   const Type DestTy = Dest->getType();
6258   static constexpr bool NotRebased = false;
6259   static constexpr Variable *NoBase = nullptr;
  // We use void for the memory operand instead of DestTy because using the
  // latter causes a validation failure: the X86 Inst layer complains that
  // vector mem operands could be underaligned. Thus, using void we avoid the
  // validation error. Note that the mask global declaration is aligned, so
  // it can be used as an XMM mem operand.
6265   static constexpr Type MaskType = IceType_void;
6266 #define IDX_IN_SRC(N, S)                                                       \
6267   ((((N) & (1 << 4)) == (S << 4)) ? ((N)&0xf) : CLEAR_ALL_BITS)
6268   auto *Mask0M = X86OperandMem::create(
6269       Func, MaskType, NoBase,
6270       lowerShuffleVector_CreatePshufbMask(
6271           IDX_IN_SRC(Idx0, 0), IDX_IN_SRC(Idx1, 0), IDX_IN_SRC(Idx2, 0),
6272           IDX_IN_SRC(Idx3, 0), IDX_IN_SRC(Idx4, 0), IDX_IN_SRC(Idx5, 0),
6273           IDX_IN_SRC(Idx6, 0), IDX_IN_SRC(Idx7, 0), IDX_IN_SRC(Idx8, 0),
6274           IDX_IN_SRC(Idx9, 0), IDX_IN_SRC(Idx10, 0), IDX_IN_SRC(Idx11, 0),
6275           IDX_IN_SRC(Idx12, 0), IDX_IN_SRC(Idx13, 0), IDX_IN_SRC(Idx14, 0),
6276           IDX_IN_SRC(Idx15, 0)),
6277       NotRebased);
6278 
6279   auto *T0 = makeReg(DestTy);
6280   auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
6281   _movp(T0, Src0RM);
6282 
6283   _pshufb(T0, Mask0M);
6284 
6285   if (Idx0 >= 16 || Idx1 >= 16 || Idx2 >= 16 || Idx3 >= 16 || Idx4 >= 16 ||
6286       Idx5 >= 16 || Idx6 >= 16 || Idx7 >= 16 || Idx8 >= 16 || Idx9 >= 16 ||
6287       Idx10 >= 16 || Idx11 >= 16 || Idx12 >= 16 || Idx13 >= 16 || Idx14 >= 16 ||
6288       Idx15 >= 16) {
6289     auto *Mask1M = X86OperandMem::create(
6290         Func, MaskType, NoBase,
6291         lowerShuffleVector_CreatePshufbMask(
6292             IDX_IN_SRC(Idx0, 1), IDX_IN_SRC(Idx1, 1), IDX_IN_SRC(Idx2, 1),
6293             IDX_IN_SRC(Idx3, 1), IDX_IN_SRC(Idx4, 1), IDX_IN_SRC(Idx5, 1),
6294             IDX_IN_SRC(Idx6, 1), IDX_IN_SRC(Idx7, 1), IDX_IN_SRC(Idx8, 1),
6295             IDX_IN_SRC(Idx9, 1), IDX_IN_SRC(Idx10, 1), IDX_IN_SRC(Idx11, 1),
6296             IDX_IN_SRC(Idx12, 1), IDX_IN_SRC(Idx13, 1), IDX_IN_SRC(Idx14, 1),
6297             IDX_IN_SRC(Idx15, 1)),
6298         NotRebased);
6299 #undef IDX_IN_SRC
6300     auto *T1 = makeReg(DestTy);
6301     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
6302     _movp(T1, Src1RM);
6303     _pshufb(T1, Mask1M);
6304     _por(T0, T1);
6305   }
6306 
6307   _movp(Dest, T0);
6308 }
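
// A sketch of the two-source combine above (illustrative only): for the
// index vector {0, 16, 1, 17, ...}, Mask0M selects bytes (0, _, 1, _, ...)
// from Src0 and Mask1M selects bytes (_, 0, _, 1, ...) from Src1, where _
// marks a lane whose mask byte has the high bit set and is therefore zeroed
// by pshufb; the final por then merges the two halves without overlap.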
6309 
6310 template <typename TraitsType>
6311 void TargetX86Base<TraitsType>::lowerShuffleVector(
6312     const InstShuffleVector *Instr) {
6313   auto *Dest = Instr->getDest();
6314   const Type DestTy = Dest->getType();
6315   auto *Src0 = Instr->getSrc(0);
6316   auto *Src1 = Instr->getSrc(1);
6317   const SizeT NumElements = typeNumElements(DestTy);
6318 
6319   auto *T = makeReg(DestTy);
6320 
6321   switch (DestTy) {
6322   default:
6323     llvm::report_fatal_error("Unexpected vector type.");
6324   case IceType_v16i1:
6325   case IceType_v16i8: {
6326     static constexpr SizeT ExpectedNumElements = 16;
6327     assert(ExpectedNumElements == Instr->getNumIndexes());
6328     (void)ExpectedNumElements;
6329 
6330     if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7)) {
6331       auto *T = makeReg(DestTy);
6332       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
6333       _movp(T, Src0RM);
6334       _punpckl(T, Src0RM);
6335       _movp(Dest, T);
6336       return;
6337     }
6338 
6339     if (Instr->indexesAre(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7,
6340                           23)) {
6341       auto *T = makeReg(DestTy);
6342       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
6343       auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
6344       _movp(T, Src0RM);
6345       _punpckl(T, Src1RM);
6346       _movp(Dest, T);
6347       return;
6348     }
6349 
6350     if (Instr->indexesAre(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14,
6351                           15, 15)) {
6352       auto *T = makeReg(DestTy);
6353       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
6354       _movp(T, Src0RM);
6355       _punpckh(T, Src0RM);
6356       _movp(Dest, T);
6357       return;
6358     }
6359 
6360     if (Instr->indexesAre(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30,
6361                           15, 31)) {
6362       auto *T = makeReg(DestTy);
6363       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
6364       auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
6365       _movp(T, Src0RM);
6366       _punpckh(T, Src1RM);
6367       _movp(Dest, T);
6368       return;
6369     }
6370 
6371     if (InstructionSet < Traits::SSE4_1) {
6372       // TODO(jpp): figure out how to lower with sse2.
6373       break;
6374     }
6375 
6376     const SizeT Index0 = Instr->getIndexValue(0);
6377     const SizeT Index1 = Instr->getIndexValue(1);
6378     const SizeT Index2 = Instr->getIndexValue(2);
6379     const SizeT Index3 = Instr->getIndexValue(3);
6380     const SizeT Index4 = Instr->getIndexValue(4);
6381     const SizeT Index5 = Instr->getIndexValue(5);
6382     const SizeT Index6 = Instr->getIndexValue(6);
6383     const SizeT Index7 = Instr->getIndexValue(7);
6384     const SizeT Index8 = Instr->getIndexValue(8);
6385     const SizeT Index9 = Instr->getIndexValue(9);
6386     const SizeT Index10 = Instr->getIndexValue(10);
6387     const SizeT Index11 = Instr->getIndexValue(11);
6388     const SizeT Index12 = Instr->getIndexValue(12);
6389     const SizeT Index13 = Instr->getIndexValue(13);
6390     const SizeT Index14 = Instr->getIndexValue(14);
6391     const SizeT Index15 = Instr->getIndexValue(15);
6392 
6393     lowerShuffleVector_UsingPshufb(Dest, Src0, Src1, Index0, Index1, Index2,
6394                                    Index3, Index4, Index5, Index6, Index7,
6395                                    Index8, Index9, Index10, Index11, Index12,
6396                                    Index13, Index14, Index15);
6397     return;
6398   }
6399   case IceType_v8i1:
6400   case IceType_v8i16: {
6401     static constexpr SizeT ExpectedNumElements = 8;
6402     assert(ExpectedNumElements == Instr->getNumIndexes());
6403     (void)ExpectedNumElements;
6404 
6405     if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3)) {
6406       auto *T = makeReg(DestTy);
6407       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
6408       _movp(T, Src0RM);
6409       _punpckl(T, Src0RM);
6410       _movp(Dest, T);
6411       return;
6412     }
6413 
6414     if (Instr->indexesAre(0, 8, 1, 9, 2, 10, 3, 11)) {
6415       auto *T = makeReg(DestTy);
6416       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
6417       auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
6418       _movp(T, Src0RM);
6419       _punpckl(T, Src1RM);
6420       _movp(Dest, T);
6421       return;
6422     }
6423 
6424     if (Instr->indexesAre(4, 4, 5, 5, 6, 6, 7, 7)) {
6425       auto *T = makeReg(DestTy);
6426       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
6427       _movp(T, Src0RM);
6428       _punpckh(T, Src0RM);
6429       _movp(Dest, T);
6430       return;
6431     }
6432 
6433     if (Instr->indexesAre(4, 12, 5, 13, 6, 14, 7, 15)) {
6434       auto *T = makeReg(DestTy);
6435       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
6436       auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
6437       _movp(T, Src0RM);
6438       _punpckh(T, Src1RM);
6439       _movp(Dest, T);
6440       return;
6441     }
6442 
6443     if (InstructionSet < Traits::SSE4_1) {
6444       // TODO(jpp): figure out how to lower with sse2.
6445       break;
6446     }
6447 
6448     const SizeT Index0 = Instr->getIndexValue(0);
6449     const SizeT Index1 = Instr->getIndexValue(1);
6450     const SizeT Index2 = Instr->getIndexValue(2);
6451     const SizeT Index3 = Instr->getIndexValue(3);
6452     const SizeT Index4 = Instr->getIndexValue(4);
6453     const SizeT Index5 = Instr->getIndexValue(5);
6454     const SizeT Index6 = Instr->getIndexValue(6);
6455     const SizeT Index7 = Instr->getIndexValue(7);
6456 
6457 #define TO_BYTE_INDEX(I) ((I) << 1)
6458     lowerShuffleVector_UsingPshufb(
6459         Dest, Src0, Src1, TO_BYTE_INDEX(Index0), TO_BYTE_INDEX(Index0) + 1,
6460         TO_BYTE_INDEX(Index1), TO_BYTE_INDEX(Index1) + 1, TO_BYTE_INDEX(Index2),
6461         TO_BYTE_INDEX(Index2) + 1, TO_BYTE_INDEX(Index3),
6462         TO_BYTE_INDEX(Index3) + 1, TO_BYTE_INDEX(Index4),
6463         TO_BYTE_INDEX(Index4) + 1, TO_BYTE_INDEX(Index5),
6464         TO_BYTE_INDEX(Index5) + 1, TO_BYTE_INDEX(Index6),
6465         TO_BYTE_INDEX(Index6) + 1, TO_BYTE_INDEX(Index7),
6466         TO_BYTE_INDEX(Index7) + 1);
6467 #undef TO_BYTE_INDEX
6468     return;
6469   }
6470   case IceType_v4i1:
6471   case IceType_v4i32:
6472   case IceType_v4f32: {
6473     static constexpr SizeT ExpectedNumElements = 4;
6474     assert(ExpectedNumElements == Instr->getNumIndexes());
6475     const SizeT Index0 = Instr->getIndexValue(0);
6476     const SizeT Index1 = Instr->getIndexValue(1);
6477     const SizeT Index2 = Instr->getIndexValue(2);
6478     const SizeT Index3 = Instr->getIndexValue(3);
6479     Variable *T = nullptr;
6480     switch (makeSrcSwitchMask(Index0, Index1, Index2, Index3)) {
6481 #define CASE_SRCS_IN(S0, S1, S2, S3)                                           \
6482   case (((S0) << 0) | ((S1) << 1) | ((S2) << 2) | ((S3) << 3))
6483       CASE_SRCS_IN(0, 0, 0, 0) : {
6484         T = lowerShuffleVector_AllFromSameSrc(Src0, Index0, Index1, Index2,
6485                                               Index3);
6486       }
6487       break;
6488       CASE_SRCS_IN(0, 0, 0, 1) : {
6489         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index2,
6490                                                                   Src1, Index3);
6491         T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Unified,
6492                                               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6493       }
6494       break;
6495       CASE_SRCS_IN(0, 0, 1, 0) : {
6496         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index2,
6497                                                                   Src0, Index3);
6498         T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Unified,
6499                                               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6500       }
6501       break;
6502       CASE_SRCS_IN(0, 0, 1, 1) : {
6503         T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Src1,
6504                                               Index2, Index3);
6505       }
6506       break;
6507       CASE_SRCS_IN(0, 1, 0, 0) : {
6508         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index0,
6509                                                                   Src1, Index1);
6510         T = lowerShuffleVector_TwoFromSameSrc(
6511             Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src0, Index2, Index3);
6512       }
6513       break;
6514       CASE_SRCS_IN(0, 1, 0, 1) : {
6515         if (Index0 == 0 && (Index1 - ExpectedNumElements) == 0 && Index2 == 1 &&
6516             (Index3 - ExpectedNumElements) == 1) {
6517           auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
6518           auto *Src0R = legalizeToReg(Src0);
6519           T = makeReg(DestTy);
6520           _movp(T, Src0R);
6521           _punpckl(T, Src1RM);
6522         } else if (Index0 == Index2 && Index1 == Index3) {
6523           auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
6524               Src0, Index0, Src1, Index1);
6525           T = lowerShuffleVector_AllFromSameSrc(
6526               Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_0,
6527               UNIFIED_INDEX_1);
6528         } else {
6529           auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
6530               Src0, Index0, Src1, Index1);
6531           auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
6532               Src0, Index2, Src1, Index3);
6533           T = lowerShuffleVector_TwoFromSameSrc(
6534               Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
6535               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6536         }
6537       }
6538       break;
6539       CASE_SRCS_IN(0, 1, 1, 0) : {
6540         if (Index0 == Index3 && Index1 == Index2) {
6541           auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
6542               Src0, Index0, Src1, Index1);
6543           T = lowerShuffleVector_AllFromSameSrc(
6544               Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_1,
6545               UNIFIED_INDEX_0);
6546         } else {
6547           auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
6548               Src0, Index0, Src1, Index1);
6549           auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
6550               Src1, Index2, Src0, Index3);
6551           T = lowerShuffleVector_TwoFromSameSrc(
6552               Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
6553               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6554         }
6555       }
6556       break;
6557       CASE_SRCS_IN(0, 1, 1, 1) : {
6558         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index0,
6559                                                                   Src1, Index1);
6560         T = lowerShuffleVector_TwoFromSameSrc(
6561             Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src1, Index2, Index3);
6562       }
6563       break;
6564       CASE_SRCS_IN(1, 0, 0, 0) : {
6565         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index0,
6566                                                                   Src0, Index1);
6567         T = lowerShuffleVector_TwoFromSameSrc(
6568             Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src0, Index2, Index3);
6569       }
6570       break;
6571       CASE_SRCS_IN(1, 0, 0, 1) : {
6572         if (Index0 == Index3 && Index1 == Index2) {
6573           auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
6574               Src1, Index0, Src0, Index1);
6575           T = lowerShuffleVector_AllFromSameSrc(
6576               Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_1,
6577               UNIFIED_INDEX_0);
6578         } else {
6579           auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
6580               Src1, Index0, Src0, Index1);
6581           auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
6582               Src0, Index2, Src1, Index3);
6583           T = lowerShuffleVector_TwoFromSameSrc(
6584               Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
6585               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6586         }
6587       }
6588       break;
6589       CASE_SRCS_IN(1, 0, 1, 0) : {
6590         if ((Index0 - ExpectedNumElements) == 0 && Index1 == 0 &&
6591             (Index2 - ExpectedNumElements) == 1 && Index3 == 1) {
6592           auto *Src1RM = legalize(Src0, Legal_Reg | Legal_Mem);
6593           auto *Src0R = legalizeToReg(Src1);
6594           T = makeReg(DestTy);
6595           _movp(T, Src0R);
6596           _punpckl(T, Src1RM);
6597         } else if (Index0 == Index2 && Index1 == Index3) {
6598           auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
6599               Src1, Index0, Src0, Index1);
6600           T = lowerShuffleVector_AllFromSameSrc(
6601               Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_0,
6602               UNIFIED_INDEX_1);
6603         } else {
6604           auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
6605               Src1, Index0, Src0, Index1);
6606           auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
6607               Src1, Index2, Src0, Index3);
6608           T = lowerShuffleVector_TwoFromSameSrc(
6609               Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
6610               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6611         }
6612       }
6613       break;
6614       CASE_SRCS_IN(1, 0, 1, 1) : {
6615         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index0,
6616                                                                   Src0, Index1);
6617         T = lowerShuffleVector_TwoFromSameSrc(
6618             Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src1, Index2, Index3);
6619       }
6620       break;
6621       CASE_SRCS_IN(1, 1, 0, 0) : {
6622         T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Src0,
6623                                               Index2, Index3);
6624       }
6625       break;
6626       CASE_SRCS_IN(1, 1, 0, 1) : {
6627         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index2,
6628                                                                   Src1, Index3);
6629         T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Unified,
6630                                               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6631       }
6632       break;
6633       CASE_SRCS_IN(1, 1, 1, 0) : {
6634         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index2,
6635                                                                   Src0, Index3);
6636         T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Unified,
6637                                               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6638       }
6639       break;
6640       CASE_SRCS_IN(1, 1, 1, 1) : {
6641         T = lowerShuffleVector_AllFromSameSrc(Src1, Index0, Index1, Index2,
6642                                               Index3);
6643       }
6644       break;
6645 #undef CASE_SRCS_IN
6646     }
6647 
6648     assert(T != nullptr);
6649     assert(T->getType() == DestTy);
6650     _movp(Dest, T);
6651     return;
  }
6653   }
6654 
6655   // Unoptimized shuffle. Perform a series of inserts and extracts.
6656   Context.insert<InstFakeDef>(T);
6657   const Type ElementType = typeElementType(DestTy);
6658   for (SizeT I = 0; I < Instr->getNumIndexes(); ++I) {
6659     auto *Index = Instr->getIndex(I);
6660     const SizeT Elem = Index->getValue();
6661     auto *ExtElmt = makeReg(ElementType);
6662     if (Elem < NumElements) {
6663       lowerExtractElement(
6664           InstExtractElement::create(Func, ExtElmt, Src0, Index));
6665     } else {
6666       lowerExtractElement(InstExtractElement::create(
6667           Func, ExtElmt, Src1, Ctx->getConstantInt32(Elem - NumElements)));
6668     }
6669     auto *NewT = makeReg(DestTy);
6670     lowerInsertElement(InstInsertElement::create(Func, NewT, T, ExtElmt,
6671                                                  Ctx->getConstantInt32(I)));
6672     T = NewT;
6673   }
6674   _movp(Dest, T);
6675 }
6676 
6677 template <typename TraitsType>
6678 void TargetX86Base<TraitsType>::lowerSelect(const InstSelect *Select) {
6679   Variable *Dest = Select->getDest();
6680 
6681   Operand *Condition = Select->getCondition();
6682   // Handle folding opportunities.
6683   if (const Inst *Producer = FoldingInfo.getProducerFor(Condition)) {
6684     assert(Producer->isDeleted());
6685     switch (BoolFolding<Traits>::getProducerKind(Producer)) {
6686     default:
6687       break;
6688     case BoolFolding<Traits>::PK_Icmp32:
6689     case BoolFolding<Traits>::PK_Icmp64: {
6690       lowerIcmpAndConsumer(llvm::cast<InstIcmp>(Producer), Select);
6691       return;
6692     }
6693     case BoolFolding<Traits>::PK_Fcmp: {
6694       lowerFcmpAndConsumer(llvm::cast<InstFcmp>(Producer), Select);
6695       return;
6696     }
6697     }
6698   }
6699 
6700   if (isVectorType(Dest->getType())) {
6701     lowerSelectVector(Select);
6702     return;
6703   }
6704 
6705   Operand *CmpResult = legalize(Condition, Legal_Reg | Legal_Mem);
6706   Operand *Zero = Ctx->getConstantZero(IceType_i32);
6707   _cmp(CmpResult, Zero);
6708   Operand *SrcT = Select->getTrueOperand();
6709   Operand *SrcF = Select->getFalseOperand();
6710   const BrCond Cond = Traits::Cond::Br_ne;
6711   lowerSelectMove(Dest, Cond, SrcT, SrcF);
6712 }
6713 
6714 template <typename TraitsType>
6715 void TargetX86Base<TraitsType>::lowerSelectMove(Variable *Dest, BrCond Cond,
6716                                                 Operand *SrcT, Operand *SrcF) {
6717   Type DestTy = Dest->getType();
6718   if (typeWidthInBytes(DestTy) == 1 || isFloatingType(DestTy)) {
6719     // The cmov instruction doesn't allow 8-bit or FP operands, so we need
6720     // explicit control flow.
6721     // d=cmp e,f; a=d?b:c ==> cmp e,f; a=b; jne L1; a=c; L1:
6722     auto *Label = InstX86Label::create(Func, this);
6723     SrcT = legalize(SrcT, Legal_Reg | Legal_Imm);
6724     _mov(Dest, SrcT);
6725     _br(Cond, Label);
6726     SrcF = legalize(SrcF, Legal_Reg | Legal_Imm);
6727     _redefined(_mov(Dest, SrcF));
6728     Context.insert(Label);
6729     return;
6730   }
6731   // mov t, SrcF; cmov_cond t, SrcT; mov dest, t
6732   // But if SrcT is immediate, we might be able to do better, as the cmov
6733   // instruction doesn't allow an immediate operand:
6734   // mov t, SrcT; cmov_!cond t, SrcF; mov dest, t
6735   if (llvm::isa<Constant>(SrcT) && !llvm::isa<Constant>(SrcF)) {
6736     std::swap(SrcT, SrcF);
6737     Cond = InstImpl<TraitsType>::InstX86Base::getOppositeCondition(Cond);
6738   }
6739   if (!Traits::Is64Bit && DestTy == IceType_i64) {
6740     SrcT = legalizeUndef(SrcT);
6741     SrcF = legalizeUndef(SrcF);
6742     // Set the low portion.
6743     auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
6744     lowerSelectIntMove(DestLo, Cond, loOperand(SrcT), loOperand(SrcF));
6745     // Set the high portion.
6746     auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
6747     lowerSelectIntMove(DestHi, Cond, hiOperand(SrcT), hiOperand(SrcF));
6748     return;
6749   }
6750 
6751   assert(DestTy == IceType_i16 || DestTy == IceType_i32 ||
6752          (Traits::Is64Bit && DestTy == IceType_i64));
6753   lowerSelectIntMove(Dest, Cond, SrcT, SrcF);
6754 }
6755 
6756 template <typename TraitsType>
6757 void TargetX86Base<TraitsType>::lowerSelectIntMove(Variable *Dest, BrCond Cond,
6758                                                    Operand *SrcT,
6759                                                    Operand *SrcF) {
6760   Variable *T = nullptr;
6761   SrcF = legalize(SrcF);
6762   _mov(T, SrcF);
6763   SrcT = legalize(SrcT, Legal_Reg | Legal_Mem);
6764   _cmov(T, SrcT, Cond);
6765   _mov(Dest, T);
6766 }
6767 
6768 template <typename TraitsType>
6769 void TargetX86Base<TraitsType>::lowerMove(Variable *Dest, Operand *Src,
6770                                           bool IsRedefinition) {
6771   assert(Dest->getType() == Src->getType());
6772   assert(!Dest->isRematerializable());
6773   if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
6774     Src = legalize(Src);
6775     Operand *SrcLo = loOperand(Src);
6776     Operand *SrcHi = hiOperand(Src);
6777     auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
6778     auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
6779     Variable *T_Lo = nullptr, *T_Hi = nullptr;
6780     _mov(T_Lo, SrcLo);
6781     _redefined(_mov(DestLo, T_Lo), IsRedefinition);
6782     _mov(T_Hi, SrcHi);
6783     _redefined(_mov(DestHi, T_Hi), IsRedefinition);
6784   } else {
6785     Operand *SrcLegal;
6786     if (Dest->hasReg()) {
6787       // If Dest already has a physical register, then only basic legalization
6788       // is needed, as the source operand can be a register, immediate, or
6789       // memory.
6790       SrcLegal = legalize(Src, Legal_Reg, Dest->getRegNum());
6791     } else {
6792       // If Dest could be a stack operand, then RI must be a physical register
6793       // or a scalar integer immediate.
6794       SrcLegal = legalize(Src, Legal_Reg | Legal_Imm);
6795     }
6796     if (isVectorType(Dest->getType())) {
6797       _redefined(_movp(Dest, SrcLegal), IsRedefinition);
6798     } else {
6799       _redefined(_mov(Dest, SrcLegal), IsRedefinition);
6800     }
6801   }
6802 }
6803 
6804 template <typename TraitsType>
6805 bool TargetX86Base<TraitsType>::lowerOptimizeFcmpSelect(
6806     const InstFcmp *Fcmp, const InstSelect *Select) {
6807   Operand *CmpSrc0 = Fcmp->getSrc(0);
6808   Operand *CmpSrc1 = Fcmp->getSrc(1);
6809   Operand *SelectSrcT = Select->getTrueOperand();
6810   Operand *SelectSrcF = Select->getFalseOperand();
6811   Variable *SelectDest = Select->getDest();
6812 
6813   // TODO(capn): also handle swapped compare/select operand order.
6814   if (CmpSrc0 != SelectSrcT || CmpSrc1 != SelectSrcF)
6815     return false;
6816 
6817   // TODO(sehr, stichnot): fcmp/select patterns (e.g., minsd/maxss) go here.
6818   InstFcmp::FCond Condition = Fcmp->getCondition();
6819   switch (Condition) {
6820   default:
6821     return false;
6822   case InstFcmp::True:
6823     break;
6824   case InstFcmp::False:
6825     break;
6826   case InstFcmp::Ogt: {
6827     Variable *T = makeReg(SelectDest->getType());
6828     if (isScalarFloatingType(SelectSrcT->getType())) {
6829       _mov(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
6830       _maxss(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
6831       _mov(SelectDest, T);
6832     } else {
6833       _movp(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
6834       _maxps(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
6835       _movp(SelectDest, T);
6836     }
6837     return true;
  }
6839   case InstFcmp::Olt: {
6840     Variable *T = makeReg(SelectSrcT->getType());
6841     if (isScalarFloatingType(SelectSrcT->getType())) {
6842       _mov(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
6843       _minss(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
6844       _mov(SelectDest, T);
6845     } else {
6846       _movp(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
6847       _minps(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
6848       _movp(SelectDest, T);
6849     }
6850     return true;
  }
6852   }
6853   return false;
6854 }
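
// An illustrative example of the pattern above: the sequence
//   c = fcmp ogt a, b
//   d = select c, a, b
// lowers to a single maxss/maxps of a and b (and Olt to minss/minps). This
// matches SSE semantics exactly, including NaN handling: maxss returns its
// second operand when the comparison is unordered, just as the select picks
// the false operand.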
6855 
6856 template <typename TraitsType>
6857 void TargetX86Base<TraitsType>::lowerIcmp(const InstIcmp *Icmp) {
6858   Variable *Dest = Icmp->getDest();
6859   if (isVectorType(Dest->getType())) {
6860     lowerIcmpVector(Icmp);
6861   } else {
6862     constexpr Inst *Consumer = nullptr;
6863     lowerIcmpAndConsumer(Icmp, Consumer);
6864   }
6865 }
6866 
6867 template <typename TraitsType>
6868 void TargetX86Base<TraitsType>::lowerSelectVector(const InstSelect *Instr) {
6869   Variable *Dest = Instr->getDest();
6870   Type DestTy = Dest->getType();
6871   Operand *SrcT = Instr->getTrueOperand();
6872   Operand *SrcF = Instr->getFalseOperand();
6873   Operand *Condition = Instr->getCondition();
6874 
6875   if (!isVectorType(DestTy))
6876     llvm::report_fatal_error("Expected a vector select");
6877 
6878   Type SrcTy = SrcT->getType();
6879   Variable *T = makeReg(SrcTy);
6880   Operand *SrcTRM = legalize(SrcT, Legal_Reg | Legal_Mem);
6881   Operand *SrcFRM = legalize(SrcF, Legal_Reg | Legal_Mem);
6882 
6883   if (InstructionSet >= Traits::SSE4_1) {
6884     // TODO(wala): If the condition operand is a constant, use blendps or
6885     // pblendw.
6886     //
6887     // Use blendvps or pblendvb to implement select.
6888     if (SrcTy == IceType_v4i1 || SrcTy == IceType_v4i32 ||
6889         SrcTy == IceType_v4f32) {
6890       Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem);
6891       Variable *xmm0 = makeReg(IceType_v4i32, Traits::RegisterSet::Reg_xmm0);
6892       _movp(xmm0, ConditionRM);
6893       _psll(xmm0, Ctx->getConstantInt8(31));
6894       _movp(T, SrcFRM);
6895       _blendvps(T, SrcTRM, xmm0);
6896       _movp(Dest, T);
6897     } else {
6898       assert(typeNumElements(SrcTy) == 8 || typeNumElements(SrcTy) == 16);
6899       Type SignExtTy =
6900           Condition->getType() == IceType_v8i1 ? IceType_v8i16 : IceType_v16i8;
6901       Variable *xmm0 = makeReg(SignExtTy, Traits::RegisterSet::Reg_xmm0);
6902       lowerCast(InstCast::create(Func, InstCast::Sext, xmm0, Condition));
6903       _movp(T, SrcFRM);
6904       _pblendvb(T, SrcTRM, xmm0);
6905       _movp(Dest, T);
6906     }
6907     return;
6908   }
6909   // Lower select without Traits::SSE4.1:
6910   // a=d?b:c ==>
6911   //   if elementtype(d) != i1:
6912   //      d=sext(d);
6913   //   a=(b&d)|(c&~d);
6914   Variable *T2 = makeReg(SrcTy);
6915   // Sign extend the condition operand if applicable.
6916   if (SrcTy == IceType_v4f32) {
6917     // The sext operation takes only integer arguments.
6918     Variable *T3 = Func->makeVariable(IceType_v4i32);
6919     lowerCast(InstCast::create(Func, InstCast::Sext, T3, Condition));
6920     _movp(T, T3);
6921   } else if (typeElementType(SrcTy) != IceType_i1) {
6922     lowerCast(InstCast::create(Func, InstCast::Sext, T, Condition));
6923   } else {
6924     Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem);
6925     _movp(T, ConditionRM);
6926   }
6927   _movp(T2, T);
6928   _pand(T, SrcTRM);
6929   _pandn(T2, SrcFRM);
6930   _por(T, T2);
6931   _movp(Dest, T);
6932 
6933   return;
6934 }
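
// An illustrative SSE4.1 sequence for a v4i1 condition on v4f32 operands, as
// produced by the blendvps path above:
//   movups   xmm0, cond
//   pslld    xmm0, 31       ; move each i1 into its lane's sign bit
//   movups   t, srcF
//   blendvps t, srcT        ; lanes whose sign bit is set take srcT
//   movups   dest, t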
6935 
6936 template <typename TraitsType>
6937 void TargetX86Base<TraitsType>::lowerStore(const InstStore *Instr) {
6938   Operand *Value = Instr->getData();
6939   Operand *Addr = Instr->getStoreAddress();
6940   X86OperandMem *NewAddr = formMemoryOperand(Addr, Value->getType());
6941   doMockBoundsCheck(NewAddr);
6942   Type Ty = NewAddr->getType();
6943 
6944   if (!Traits::Is64Bit && Ty == IceType_i64) {
6945     Value = legalizeUndef(Value);
6946     Operand *ValueHi = legalize(hiOperand(Value), Legal_Reg | Legal_Imm);
6947     _store(ValueHi, llvm::cast<X86OperandMem>(hiOperand(NewAddr)));
6948     Operand *ValueLo = legalize(loOperand(Value), Legal_Reg | Legal_Imm);
6949     _store(ValueLo, llvm::cast<X86OperandMem>(loOperand(NewAddr)));
6950   } else if (isVectorType(Ty)) {
6951     _storep(legalizeToReg(Value), NewAddr);
6952   } else {
6953     Value = legalize(Value, Legal_Reg | Legal_Imm);
6954     _store(Value, NewAddr);
6955   }
6956 }
6957 
6958 template <typename TraitsType>
6959 void TargetX86Base<TraitsType>::doAddressOptStore() {
6960   auto *Instr = llvm::cast<InstStore>(Context.getCur());
6961   Operand *Addr = Instr->getStoreAddress();
6962   Operand *Data = Instr->getData();
6963   if (auto *OptAddr = computeAddressOpt(Instr, Data->getType(), Addr)) {
6964     Instr->setDeleted();
6965     auto *NewStore = Context.insert<InstStore>(Data, OptAddr);
6966     if (Instr->getDest())
6967       NewStore->setRmwBeacon(Instr->getRmwBeacon());
6968   }
6969 }
6970 
6971 template <typename TraitsType>
6972 void TargetX86Base<TraitsType>::doAddressOptStoreSubVector() {
6973   auto *Intrinsic = llvm::cast<InstIntrinsic>(Context.getCur());
6974   Operand *Addr = Intrinsic->getArg(1);
6975   Operand *Data = Intrinsic->getArg(0);
6976   if (auto *OptAddr = computeAddressOpt(Intrinsic, Data->getType(), Addr)) {
6977     Intrinsic->setDeleted();
6978     const Ice::Intrinsics::IntrinsicInfo Info = {
6979         Ice::Intrinsics::StoreSubVector, Ice::Intrinsics::SideEffects_T,
6980         Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_T};
6981     auto *NewStore = Context.insert<InstIntrinsic>(3, nullptr, Info);
6982     NewStore->addArg(Data);
6983     NewStore->addArg(OptAddr);
6984     NewStore->addArg(Intrinsic->getArg(2));
6985   }
6986 }
6987 
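/// Lower a range check Min <= Comparison <= Max into a single unsigned
/// comparison via the subtract-and-compare idiom. A minimal sketch for
/// checking x against the range [5, 20] (illustrative only):
///   mov t, x      ; copy so the original comparison value is preserved
///   sub t, 5      ; bias the range down to [0, 15]
///   cmp t, 15     ; one unsigned compare now covers both bounds
/// Callers then branch "above" for out-of-range (see lowerCaseCluster) or
/// "below-or-equal" for in-range.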
6988 template <typename TraitsType>
6989 Operand *TargetX86Base<TraitsType>::lowerCmpRange(Operand *Comparison,
6990                                                   uint64_t Min, uint64_t Max) {
6991   // TODO(ascull): 64-bit should not reach here but only because it is not
6992   // implemented yet. This should be able to handle the 64-bit case.
6993   assert(Traits::Is64Bit || Comparison->getType() != IceType_i64);
6994   // Subtracting 0 is a nop so don't do it
6995   if (Min != 0) {
6996     // Avoid clobbering the comparison by copying it
6997     Variable *T = nullptr;
6998     _mov(T, Comparison);
6999     _sub(T, Ctx->getConstantInt32(Min));
7000     Comparison = T;
7001   }
7002 
7003   _cmp(Comparison, Ctx->getConstantInt32(Max - Min));
7004 
7005   return Comparison;
7006 }
7007 
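/// A sketch of the jump-table dispatch emitted by the JumpTable case below,
/// for a 64-bit pointer type (register names illustrative):
///   cmp  index, high-low      ; via lowerCmpRange
///   ja   default_or_skip      ; out of range
///   lea  base, [jumptable]
///   mov  target, [base + index*8]
///   jmp  *target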
7008 template <typename TraitsType>
7009 void TargetX86Base<TraitsType>::lowerCaseCluster(const CaseCluster &Case,
7010                                                  Operand *Comparison,
7011                                                  bool DoneCmp,
7012                                                  CfgNode *DefaultTarget) {
7013   switch (Case.getKind()) {
7014   case CaseCluster::JumpTable: {
    InstX86Label *SkipJumpTable = nullptr;
7016 
7017     Operand *RangeIndex =
7018         lowerCmpRange(Comparison, Case.getLow(), Case.getHigh());
7019     if (DefaultTarget == nullptr) {
7020       // Skip over jump table logic if comparison not in range and no default
7021       SkipJumpTable = InstX86Label::create(Func, this);
7022       _br(Traits::Cond::Br_a, SkipJumpTable);
7023     } else {
7024       _br(Traits::Cond::Br_a, DefaultTarget);
7025     }
7026 
7027     InstJumpTable *JumpTable = Case.getJumpTable();
7028     Context.insert(JumpTable);
7029 
7030     // Make sure the index is a register of the same width as the base
7031     Variable *Index;
7032     const Type PointerType = getPointerType();
7033     if (RangeIndex->getType() != PointerType) {
7034       Index = makeReg(PointerType);
7035       if (RangeIndex->getType() == IceType_i64) {
7036         assert(Traits::Is64Bit);
7037         _mov(Index, RangeIndex); // trunc
7038       } else {
7039         Operand *RangeIndexRM = legalize(RangeIndex, Legal_Reg | Legal_Mem);
7040         _movzx(Index, RangeIndexRM);
7041       }
7042     } else {
7043       Index = legalizeToReg(RangeIndex);
7044     }
7045 
7046     constexpr RelocOffsetT RelocOffset = 0;
7047     constexpr Variable *NoBase = nullptr;
7048     constexpr Constant *NoOffset = nullptr;
7049     auto JTName = GlobalString::createWithString(Ctx, JumpTable->getName());
7050     Constant *Offset = Ctx->getConstantSym(RelocOffset, JTName);
7051     uint16_t Shift = typeWidthInBytesLog2(PointerType);
7052     constexpr auto Segment = X86OperandMem::SegmentRegisters::DefaultSegment;
7053 
7054     Variable *Target = nullptr;
7055     if (Traits::Is64Bit && NeedSandboxing) {
7056       assert(Index != nullptr && Index->getType() == IceType_i32);
7057     }
7058 
7059     if (PointerType == IceType_i32) {
7060       _mov(Target, X86OperandMem::create(Func, PointerType, NoBase, Offset,
7061                                          Index, Shift, Segment));
7062     } else {
7063       auto *Base = makeReg(IceType_i64);
7064       _lea(Base, X86OperandMem::create(Func, IceType_void, NoBase, Offset));
7065       _mov(Target, X86OperandMem::create(Func, PointerType, Base, NoOffset,
7066                                          Index, Shift, Segment));
7067     }
7068 
7069     lowerIndirectJump(Target);
7070 
7071     if (DefaultTarget == nullptr)
7072       Context.insert(SkipJumpTable);
7073     return;
7074   }
7075   case CaseCluster::Range: {
7076     if (Case.isUnitRange()) {
7077       // Single item
7078       if (!DoneCmp) {
7079         Constant *Value = Ctx->getConstantInt32(Case.getLow());
7080         _cmp(Comparison, Value);
7081       }
7082       _br(Traits::Cond::Br_e, Case.getTarget());
7083     } else if (DoneCmp && Case.isPairRange()) {
      // Range of two items with first item already compared against
7085       _br(Traits::Cond::Br_e, Case.getTarget());
7086       Constant *Value = Ctx->getConstantInt32(Case.getHigh());
7087       _cmp(Comparison, Value);
7088       _br(Traits::Cond::Br_e, Case.getTarget());
7089     } else {
7090       // Range
7091       lowerCmpRange(Comparison, Case.getLow(), Case.getHigh());
7092       _br(Traits::Cond::Br_be, Case.getTarget());
7093     }
7094     if (DefaultTarget != nullptr)
7095       _br(DefaultTarget);
7096     return;
7097   }
7098   }
7099 }
7100 
7101 template <typename TraitsType>
7102 void TargetX86Base<TraitsType>::lowerSwitch(const InstSwitch *Instr) {
7103   // Group cases together and navigate through them with a binary search
7104   CaseClusterArray CaseClusters = CaseCluster::clusterizeSwitch(Func, Instr);
7105   Operand *Src0 = Instr->getComparison();
7106   CfgNode *DefaultTarget = Instr->getLabelDefault();
7107 
  assert(!CaseClusters.empty()); // Should always be at least one
7109 
7110   if (!Traits::Is64Bit && Src0->getType() == IceType_i64) {
7111     Src0 = legalize(Src0); // get Base/Index into physical registers
7112     Operand *Src0Lo = loOperand(Src0);
7113     Operand *Src0Hi = hiOperand(Src0);
7114     if (CaseClusters.back().getHigh() > UINT32_MAX) {
7115       // TODO(ascull): handle 64-bit case properly (currently naive version)
7116       // This might be handled by a higher level lowering of switches.
7117       SizeT NumCases = Instr->getNumCases();
7118       if (NumCases >= 2) {
7119         Src0Lo = legalizeToReg(Src0Lo);
7120         Src0Hi = legalizeToReg(Src0Hi);
7121       } else {
7122         Src0Lo = legalize(Src0Lo, Legal_Reg | Legal_Mem);
7123         Src0Hi = legalize(Src0Hi, Legal_Reg | Legal_Mem);
7124       }
7125       for (SizeT I = 0; I < NumCases; ++I) {
7126         Constant *ValueLo = Ctx->getConstantInt32(Instr->getValue(I));
7127         Constant *ValueHi = Ctx->getConstantInt32(Instr->getValue(I) >> 32);
7128         InstX86Label *Label = InstX86Label::create(Func, this);
7129         _cmp(Src0Lo, ValueLo);
7130         _br(Traits::Cond::Br_ne, Label);
7131         _cmp(Src0Hi, ValueHi);
7132         _br(Traits::Cond::Br_e, Instr->getLabel(I));
7133         Context.insert(Label);
7134       }
7135       _br(Instr->getLabelDefault());
7136       return;
7137     } else {
7138       // All the values are 32-bit so just check the operand is too and then
7139       // fall through to the 32-bit implementation. This is a common case.
7140       Src0Hi = legalize(Src0Hi, Legal_Reg | Legal_Mem);
7141       Constant *Zero = Ctx->getConstantInt32(0);
7142       _cmp(Src0Hi, Zero);
7143       _br(Traits::Cond::Br_ne, DefaultTarget);
7144       Src0 = Src0Lo;
7145     }
7146   }
7147 
7148   // 32-bit lowering
7149 
7150   if (CaseClusters.size() == 1) {
7151     // Jump straight to default if needed. Currently a common case as jump
7152     // tables occur on their own.
7153     constexpr bool DoneCmp = false;
7154     lowerCaseCluster(CaseClusters.front(), Src0, DoneCmp, DefaultTarget);
7155     return;
7156   }
7157 
7158   // Going to be using multiple times so get it in a register early
7159   Variable *Comparison = legalizeToReg(Src0);
7160 
  // A SearchSpan is a contiguous range of case clusters, plus an optional
  // label to bind where the span's search code begins.
7162   struct SearchSpan {
7163     SearchSpan(SizeT Begin, SizeT Size, InstX86Label *Label)
7164         : Begin(Begin), Size(Size), Label(Label) {}
7165 
7166     SizeT Begin;
7167     SizeT Size;
7168     InstX86Label *Label;
7169   };
7170   // The stack will only grow to the height of the tree so 12 should be plenty
7171   std::stack<SearchSpan, llvm::SmallVector<SearchSpan, 12>> SearchSpanStack;
7172   SearchSpanStack.emplace(0, CaseClusters.size(), nullptr);
7173   bool DoneCmp = false;
7174 
7175   while (!SearchSpanStack.empty()) {
7176     SearchSpan Span = SearchSpanStack.top();
7177     SearchSpanStack.pop();
7178 
7179     if (Span.Label != nullptr)
7180       Context.insert(Span.Label);
7181 
7182     switch (Span.Size) {
7183     case 0:
7184       llvm::report_fatal_error("Invalid SearchSpan size");
7185       break;
7186 
7187     case 1:
7188       lowerCaseCluster(CaseClusters[Span.Begin], Comparison, DoneCmp,
7189                        SearchSpanStack.empty() ? nullptr : DefaultTarget);
7190       DoneCmp = false;
7191       break;
7192 
7193     case 2: {
7194       const CaseCluster *CaseA = &CaseClusters[Span.Begin];
7195       const CaseCluster *CaseB = &CaseClusters[Span.Begin + 1];
7196 
7197       // Placing a range last may allow register clobbering during the range
7198       // test. That means there is no need to clone the register. If it is a
7199       // unit range the comparison may have already been done in the binary
7200       // search (DoneCmp) and so it should be placed first. If this is a range
7201       // of two items and the comparison with the low value has already been
7202       // done, comparing with the other element is cheaper than a range test.
7203       // If the low end of the range is zero then there is no subtraction and
7204       // nothing to be gained.
7205       if (!CaseA->isUnitRange() &&
7206           !(CaseA->getLow() == 0 || (DoneCmp && CaseA->isPairRange()))) {
7207         std::swap(CaseA, CaseB);
7208         DoneCmp = false;
7209       }
7210 
7211       lowerCaseCluster(*CaseA, Comparison, DoneCmp);
7212       DoneCmp = false;
7213       lowerCaseCluster(*CaseB, Comparison, DoneCmp,
7214                        SearchSpanStack.empty() ? nullptr : DefaultTarget);
7215     } break;
7216 
7217     default:
7218       // Pick the middle item and branch b or ae
7219       SizeT PivotIndex = Span.Begin + (Span.Size / 2);
7220       const CaseCluster &Pivot = CaseClusters[PivotIndex];
7221       Constant *Value = Ctx->getConstantInt32(Pivot.getLow());
7222       InstX86Label *Label = InstX86Label::create(Func, this);
7223       _cmp(Comparison, Value);
      // TODO(ascull): does it always have to be far?
7225       _br(Traits::Cond::Br_b, Label, InstX86Br::Far);
7226       // Lower the left and (pivot+right) sides, falling through to the right
7227       SearchSpanStack.emplace(Span.Begin, Span.Size / 2, Label);
7228       SearchSpanStack.emplace(PivotIndex, Span.Size - (Span.Size / 2), nullptr);
7229       DoneCmp = true;
7230       break;
7231     }
7232   }
7233 
7234   _br(DefaultTarget);
7235 }
7236 
7237 /// The following pattern occurs often in lowered C and C++ code:
7238 ///
7239 ///   %cmp     = fcmp/icmp pred <n x ty> %src0, %src1
7240 ///   %cmp.ext = sext <n x i1> %cmp to <n x ty>
7241 ///
7242 /// We can eliminate the sext operation by copying the result of pcmpeqd,
7243 /// pcmpgtd, or cmpps (which produce sign extended results) to the result of the
7244 /// sext operation.
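///
/// For example (illustrative), pcmpgtd leaves each lane all-zeros or all-ones
/// (0 or -1), which is already the sign-extended form of the i1 result, so
/// the sext reduces to a plain register copy.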
7245 template <typename TraitsType>
7246 void TargetX86Base<TraitsType>::eliminateNextVectorSextInstruction(
7247     Variable *SignExtendedResult) {
7248   if (auto *NextCast =
7249           llvm::dyn_cast_or_null<InstCast>(Context.getNextInst())) {
7250     if (NextCast->getCastKind() == InstCast::Sext &&
7251         NextCast->getSrc(0) == SignExtendedResult) {
7252       NextCast->setDeleted();
7253       _movp(NextCast->getDest(), legalizeToReg(SignExtendedResult));
7254       // Skip over the instruction.
7255       Context.advanceNext();
7256     }
7257   }
7258 }
7259 
7260 template <typename TraitsType>
7261 void TargetX86Base<TraitsType>::lowerUnreachable(
7262     const InstUnreachable * /*Instr*/) {
7263   _ud2();
7264   // Add a fake use of esp to make sure esp adjustments after the unreachable
7265   // do not get dead-code eliminated.
7266   keepEspLiveAtExit();
7267 }
7268 
7269 template <typename TraitsType>
7270 void TargetX86Base<TraitsType>::lowerBreakpoint(
7271     const InstBreakpoint * /*Instr*/) {
7272   _int3();
7273 }
7274 
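/// A sketch of the pattern behind InstX86FakeRMW (illustrative only): the
/// sequence
///   %v = load %addr
///   %w = add %v, %data
///   store %w, %addr
/// is recognized earlier and lowered here into a single read-modify-write
/// instruction with a memory destination, e.g.:
///   add [addr], data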
7275 template <typename TraitsType>
7276 void TargetX86Base<TraitsType>::lowerRMW(const InstX86FakeRMW *RMW) {
7277   // If the beacon variable's live range does not end in this instruction, then
7278   // it must end in the modified Store instruction that follows. This means
7279   // that the original Store instruction is still there, either because the
7280   // value being stored is used beyond the Store instruction, or because dead
7281   // code elimination did not happen. In either case, we cancel RMW lowering
7282   // (and the caller deletes the RMW instruction).
7283   if (!RMW->isLastUse(RMW->getBeacon()))
7284     return;
7285   Operand *Src = RMW->getData();
7286   Type Ty = Src->getType();
7287   X86OperandMem *Addr = formMemoryOperand(RMW->getAddr(), Ty);
7288   doMockBoundsCheck(Addr);
7289   if (!Traits::Is64Bit && Ty == IceType_i64) {
7290     Src = legalizeUndef(Src);
7291     Operand *SrcLo = legalize(loOperand(Src), Legal_Reg | Legal_Imm);
7292     Operand *SrcHi = legalize(hiOperand(Src), Legal_Reg | Legal_Imm);
7293     auto *AddrLo = llvm::cast<X86OperandMem>(loOperand(Addr));
7294     auto *AddrHi = llvm::cast<X86OperandMem>(hiOperand(Addr));
7295     switch (RMW->getOp()) {
7296     default:
7297       // TODO(stichnot): Implement other arithmetic operators.
7298       break;
7299     case InstArithmetic::Add:
7300       _add_rmw(AddrLo, SrcLo);
7301       _adc_rmw(AddrHi, SrcHi);
7302       return;
7303     case InstArithmetic::Sub:
7304       _sub_rmw(AddrLo, SrcLo);
7305       _sbb_rmw(AddrHi, SrcHi);
7306       return;
7307     case InstArithmetic::And:
7308       _and_rmw(AddrLo, SrcLo);
7309       _and_rmw(AddrHi, SrcHi);
7310       return;
7311     case InstArithmetic::Or:
7312       _or_rmw(AddrLo, SrcLo);
7313       _or_rmw(AddrHi, SrcHi);
7314       return;
7315     case InstArithmetic::Xor:
7316       _xor_rmw(AddrLo, SrcLo);
7317       _xor_rmw(AddrHi, SrcHi);
7318       return;
7319     }
7320   } else {
7321     // x86-32: i8, i16, i32
7322     // x86-64: i8, i16, i32, i64
7323     switch (RMW->getOp()) {
7324     default:
7325       // TODO(stichnot): Implement other arithmetic operators.
7326       break;
7327     case InstArithmetic::Add:
7328       Src = legalize(Src, Legal_Reg | Legal_Imm);
7329       _add_rmw(Addr, Src);
7330       return;
7331     case InstArithmetic::Sub:
7332       Src = legalize(Src, Legal_Reg | Legal_Imm);
7333       _sub_rmw(Addr, Src);
7334       return;
7335     case InstArithmetic::And:
7336       Src = legalize(Src, Legal_Reg | Legal_Imm);
7337       _and_rmw(Addr, Src);
7338       return;
7339     case InstArithmetic::Or:
7340       Src = legalize(Src, Legal_Reg | Legal_Imm);
7341       _or_rmw(Addr, Src);
7342       return;
7343     case InstArithmetic::Xor:
7344       Src = legalize(Src, Legal_Reg | Legal_Imm);
7345       _xor_rmw(Addr, Src);
7346       return;
7347     }
7348   }
7349   llvm::report_fatal_error("Couldn't lower RMW instruction");
7350 }
7351 
7352 template <typename TraitsType>
7353 void TargetX86Base<TraitsType>::lowerOther(const Inst *Instr) {
7354   if (const auto *RMW = llvm::dyn_cast<InstX86FakeRMW>(Instr)) {
7355     lowerRMW(RMW);
7356   } else {
7357     TargetLowering::lowerOther(Instr);
7358   }
7359 }
7360 
7361 /// Turn an i64 Phi instruction into a pair of i32 Phi instructions, to preserve
7362 /// integrity of liveness analysis. Undef values are also turned into zeroes,
7363 /// since loOperand() and hiOperand() don't expect Undef input.  Also, in
7364 /// Non-SFI mode, add a FakeUse(RebasePtr) for every pooled constant operand.
7365 template <typename TraitsType> void TargetX86Base<TraitsType>::prelowerPhis() {
7366   if (getFlags().getUseNonsfi()) {
7367     assert(RebasePtr);
7368     CfgNode *Node = Context.getNode();
7369     uint32_t RebasePtrUseCount = 0;
7370     for (Inst &I : Node->getPhis()) {
      auto *Phi = llvm::cast<InstPhi>(&I);
      if (Phi->isDeleted())
        continue;
      for (SizeT Idx = 0; Idx < Phi->getSrcSize(); ++Idx) {
        Operand *Src = Phi->getSrc(Idx);
7376         // TODO(stichnot): This over-counts for +0.0, and under-counts for other
7377         // kinds of pooling.
7378         if (llvm::isa<ConstantRelocatable>(Src) ||
7379             llvm::isa<ConstantFloat>(Src) || llvm::isa<ConstantDouble>(Src)) {
7380           ++RebasePtrUseCount;
7381         }
7382       }
7383     }
7384     if (RebasePtrUseCount) {
7385       Node->getInsts().push_front(InstFakeUse::create(Func, RebasePtr));
7386     }
7387   }
7388   if (Traits::Is64Bit) {
    // On x86-64 we don't need to prelower phis -- the architecture can handle
    // 64-bit integers natively.
7391     return;
7392   }
7393 
7394   PhiLowering::prelowerPhis32Bit<TargetX86Base<TraitsType>>(
7395       this, Context.getNode(), Func);
7396 }
7397 
7398 template <typename TraitsType>
7399 void TargetX86Base<TraitsType>::genTargetHelperCallFor(Inst *Instr) {
7400   uint32_t StackArgumentsSize = 0;
7401   if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Instr)) {
7402     RuntimeHelper HelperID = RuntimeHelper::H_Num;
7403     Variable *Dest = Arith->getDest();
7404     Type DestTy = Dest->getType();
7405     if (!Traits::Is64Bit && DestTy == IceType_i64) {
7406       switch (Arith->getOp()) {
7407       default:
7408         return;
7409       case InstArithmetic::Udiv:
7410         HelperID = RuntimeHelper::H_udiv_i64;
7411         break;
7412       case InstArithmetic::Sdiv:
7413         HelperID = RuntimeHelper::H_sdiv_i64;
7414         break;
7415       case InstArithmetic::Urem:
7416         HelperID = RuntimeHelper::H_urem_i64;
7417         break;
7418       case InstArithmetic::Srem:
7419         HelperID = RuntimeHelper::H_srem_i64;
7420         break;
7421       }
    } else if (isVectorType(DestTy)) {
      Operand *Src0 = Arith->getSrc(0);
      Operand *Src1 = Arith->getSrc(1);
7426       switch (Arith->getOp()) {
7427       default:
7428         return;
7429       case InstArithmetic::Mul:
7430         if (DestTy == IceType_v16i8) {
7431           scalarizeArithmetic(Arith->getOp(), Dest, Src0, Src1);
7432           Arith->setDeleted();
7433         }
7434         return;
7435       case InstArithmetic::Shl:
7436       case InstArithmetic::Lshr:
7437       case InstArithmetic::Ashr:
7438         if (llvm::isa<Constant>(Src1)) {
7439           return;
7440         }
7441       case InstArithmetic::Udiv:
7442       case InstArithmetic::Urem:
7443       case InstArithmetic::Sdiv:
7444       case InstArithmetic::Srem:
7445       case InstArithmetic::Frem:
7446         scalarizeArithmetic(Arith->getOp(), Dest, Src0, Src1);
7447         Arith->setDeleted();
7448         return;
7449       }
7450     } else {
7451       switch (Arith->getOp()) {
7452       default:
7453         return;
7454       case InstArithmetic::Frem:
7455         if (isFloat32Asserting32Or64(DestTy))
7456           HelperID = RuntimeHelper::H_frem_f32;
7457         else
7458           HelperID = RuntimeHelper::H_frem_f64;
7459       }
7460     }
7461     constexpr SizeT MaxSrcs = 2;
7462     InstCall *Call = makeHelperCall(HelperID, Dest, MaxSrcs);
7463     Call->addArg(Arith->getSrc(0));
7464     Call->addArg(Arith->getSrc(1));
7465     StackArgumentsSize = getCallStackArgumentsSizeBytes(Call);
7466     Context.insert(Call);
7467     Arith->setDeleted();
7468   } else if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
7469     InstCast::OpKind CastKind = Cast->getCastKind();
7470     Operand *Src0 = Cast->getSrc(0);
7471     const Type SrcType = Src0->getType();
7472     Variable *Dest = Cast->getDest();
7473     const Type DestTy = Dest->getType();
7474     RuntimeHelper HelperID = RuntimeHelper::H_Num;
7475     Variable *CallDest = Dest;
7476     switch (CastKind) {
7477     default:
7478       return;
7479     case InstCast::Fptosi:
7480       if (!Traits::Is64Bit && DestTy == IceType_i64) {
7481         HelperID = isFloat32Asserting32Or64(SrcType)
7482                        ? RuntimeHelper::H_fptosi_f32_i64
7483                        : RuntimeHelper::H_fptosi_f64_i64;
7484       } else {
7485         return;
7486       }
7487       break;
7488     case InstCast::Fptoui:
7489       if (isVectorType(DestTy)) {
7490         assert(DestTy == IceType_v4i32);
7491         assert(SrcType == IceType_v4f32);
7492         HelperID = RuntimeHelper::H_fptoui_4xi32_f32;
7493       } else if (DestTy == IceType_i64 ||
7494                  (!Traits::Is64Bit && DestTy == IceType_i32)) {
7495         if (Traits::Is64Bit) {
7496           HelperID = isFloat32Asserting32Or64(SrcType)
7497                          ? RuntimeHelper::H_fptoui_f32_i64
7498                          : RuntimeHelper::H_fptoui_f64_i64;
7499         } else if (isInt32Asserting32Or64(DestTy)) {
7500           HelperID = isFloat32Asserting32Or64(SrcType)
7501                          ? RuntimeHelper::H_fptoui_f32_i32
7502                          : RuntimeHelper::H_fptoui_f64_i32;
7503         } else {
7504           HelperID = isFloat32Asserting32Or64(SrcType)
7505                          ? RuntimeHelper::H_fptoui_f32_i64
7506                          : RuntimeHelper::H_fptoui_f64_i64;
7507         }
7508       } else {
7509         return;
7510       }
7511       break;
7512     case InstCast::Sitofp:
7513       if (!Traits::Is64Bit && SrcType == IceType_i64) {
7514         HelperID = isFloat32Asserting32Or64(DestTy)
7515                        ? RuntimeHelper::H_sitofp_i64_f32
7516                        : RuntimeHelper::H_sitofp_i64_f64;
7517       } else {
7518         return;
7519       }
7520       break;
7521     case InstCast::Uitofp:
7522       if (isVectorType(SrcType)) {
7523         assert(DestTy == IceType_v4f32);
7524         assert(SrcType == IceType_v4i32);
7525         HelperID = RuntimeHelper::H_uitofp_4xi32_4xf32;
7526       } else if (SrcType == IceType_i64 ||
7527                  (!Traits::Is64Bit && SrcType == IceType_i32)) {
7528         if (isInt32Asserting32Or64(SrcType)) {
7529           HelperID = isFloat32Asserting32Or64(DestTy)
7530                          ? RuntimeHelper::H_uitofp_i32_f32
7531                          : RuntimeHelper::H_uitofp_i32_f64;
7532         } else {
7533           HelperID = isFloat32Asserting32Or64(DestTy)
7534                          ? RuntimeHelper::H_uitofp_i64_f32
7535                          : RuntimeHelper::H_uitofp_i64_f64;
7536         }
7537       } else {
7538         return;
7539       }
7540       break;
7541     case InstCast::Bitcast: {
7542       if (DestTy == Src0->getType())
7543         return;
7544       switch (DestTy) {
7545       default:
7546         return;
7547       case IceType_i8:
7548         assert(Src0->getType() == IceType_v8i1);
7549         HelperID = RuntimeHelper::H_bitcast_8xi1_i8;
7550         CallDest = Func->makeVariable(IceType_i32);
7551         break;
7552       case IceType_i16:
7553         assert(Src0->getType() == IceType_v16i1);
7554         HelperID = RuntimeHelper::H_bitcast_16xi1_i16;
7555         CallDest = Func->makeVariable(IceType_i32);
7556         break;
7557       case IceType_v8i1: {
7558         assert(Src0->getType() == IceType_i8);
7559         HelperID = RuntimeHelper::H_bitcast_i8_8xi1;
7560         Variable *Src0AsI32 = Func->makeVariable(stackSlotType());
7561         // Arguments to functions are required to be at least 32 bits wide.
7562         Context.insert<InstCast>(InstCast::Zext, Src0AsI32, Src0);
7563         Src0 = Src0AsI32;
7564       } break;
7565       case IceType_v16i1: {
7566         assert(Src0->getType() == IceType_i16);
7567         HelperID = RuntimeHelper::H_bitcast_i16_16xi1;
7568         Variable *Src0AsI32 = Func->makeVariable(stackSlotType());
7569         // Arguments to functions are required to be at least 32 bits wide.
7570         Context.insert<InstCast>(InstCast::Zext, Src0AsI32, Src0);
7571         Src0 = Src0AsI32;
7572       } break;
7573       }
7574     } break;
7575     }
7576     constexpr SizeT MaxSrcs = 1;
7577     InstCall *Call = makeHelperCall(HelperID, CallDest, MaxSrcs);
7578     Call->addArg(Src0);
7579     StackArgumentsSize = getCallStackArgumentsSizeBytes(Call);
7580     Context.insert(Call);
7581     // The PNaCl ABI disallows i8/i16 return types, so truncate the helper call
7582     // result to the appropriate type as necessary.
7583     if (CallDest->getType() != Dest->getType())
7584       Context.insert<InstCast>(InstCast::Trunc, Dest, CallDest);
7585     Cast->setDeleted();
7586   } else if (auto *Intrinsic = llvm::dyn_cast<InstIntrinsic>(Instr)) {
7587     CfgVector<Type> ArgTypes;
7588     Type ReturnType = IceType_void;
7589     switch (Intrinsics::IntrinsicID ID = Intrinsic->getIntrinsicID()) {
7590     default:
7591       return;
7592     case Intrinsics::Ctpop: {
7593       Operand *Val = Intrinsic->getArg(0);
7594       Type ValTy = Val->getType();
7595       if (ValTy == IceType_i64)
7596         ArgTypes = {IceType_i64};
7597       else
7598         ArgTypes = {IceType_i32};
7599       ReturnType = IceType_i32;
7600     } break;
7601     case Intrinsics::Longjmp:
7602       ArgTypes = {IceType_i32, IceType_i32};
7603       ReturnType = IceType_void;
7604       break;
7605     case Intrinsics::Memcpy:
7606       ArgTypes = {IceType_i32, IceType_i32, IceType_i32};
7607       ReturnType = IceType_void;
7608       break;
7609     case Intrinsics::Memmove:
7610       ArgTypes = {IceType_i32, IceType_i32, IceType_i32};
7611       ReturnType = IceType_void;
7612       break;
7613     case Intrinsics::Memset:
7614       ArgTypes = {IceType_i32, IceType_i32, IceType_i32};
7615       ReturnType = IceType_void;
7616       break;
7617     case Intrinsics::NaClReadTP:
7618       ReturnType = IceType_i32;
7619       break;
7620     case Intrinsics::Setjmp:
7621       ArgTypes = {IceType_i32};
7622       ReturnType = IceType_i32;
7623       break;
7624     }
7625     StackArgumentsSize = getCallStackArgumentsSizeBytes(ArgTypes, ReturnType);
7626   } else if (auto *Call = llvm::dyn_cast<InstCall>(Instr)) {
7627     StackArgumentsSize = getCallStackArgumentsSizeBytes(Call);
7628   } else if (auto *Ret = llvm::dyn_cast<InstRet>(Instr)) {
7629     if (!Ret->hasRetValue())
7630       return;
7631     Operand *RetValue = Ret->getRetValue();
7632     Type ReturnType = RetValue->getType();
7633     if (!isScalarFloatingType(ReturnType))
7634       return;
7635     StackArgumentsSize = typeWidthInBytes(ReturnType);
7636   } else {
7637     return;
7638   }
7639   StackArgumentsSize = Traits::applyStackAlignment(StackArgumentsSize);
7640   updateMaxOutArgsSizeBytes(StackArgumentsSize);
7641 }
7642 
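/// For example (illustrative, assuming an ABI that passes the first two
/// vector/FP arguments in XMM registers and the first two integer arguments
/// in GPRs): for the argument list (i32, f32, v4i32, i32, f64), the two i32s
/// take the GPRs, f32 and v4i32 take the XMM registers, and the f64 spills to
/// the stack, contributing 8 bytes to the returned size.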
7643 template <typename TraitsType>
7644 uint32_t TargetX86Base<TraitsType>::getCallStackArgumentsSizeBytes(
7645     const CfgVector<Type> &ArgTypes, Type ReturnType) {
7646   uint32_t OutArgumentsSizeBytes = 0;
7647   uint32_t XmmArgCount = 0;
7648   uint32_t GprArgCount = 0;
7649   for (SizeT i = 0, NumArgTypes = ArgTypes.size(); i < NumArgTypes; ++i) {
7650     Type Ty = ArgTypes[i];
7651     // The PNaCl ABI requires the width of arguments to be at least 32 bits.
7652     assert(typeWidthInBytes(Ty) >= 4);
7653     if (isVectorType(Ty) &&
7654         Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, XmmArgCount))
7655             .hasValue()) {
7656       ++XmmArgCount;
7657     } else if (isScalarFloatingType(Ty) && Traits::X86_PASS_SCALAR_FP_IN_XMM &&
7658                Traits::getRegisterForXmmArgNum(
7659                    Traits::getArgIndex(i, XmmArgCount))
7660                    .hasValue()) {
7661       ++XmmArgCount;
7662     } else if (isScalarIntegerType(Ty) &&
7663                Traits::getRegisterForGprArgNum(
7664                    Ty, Traits::getArgIndex(i, GprArgCount))
7665                    .hasValue()) {
7666       // The 64 bit ABI allows some integers to be passed in GPRs.
7667       ++GprArgCount;
7668     } else {
7669       if (isVectorType(Ty)) {
7670         OutArgumentsSizeBytes =
7671             Traits::applyStackAlignment(OutArgumentsSizeBytes);
7672       }
7673       OutArgumentsSizeBytes += typeWidthInBytesOnStack(Ty);
7674     }
7675   }
7676   if (Traits::Is64Bit)
7677     return OutArgumentsSizeBytes;
  // The 32 bit ABI requires floating point values to be returned on the x87 FP
  // stack. Ensure there is enough space for the fstp/mov sequence used to
  // retrieve a floating-point return value.
7680   if (isScalarFloatingType(ReturnType)) {
7681     OutArgumentsSizeBytes =
7682         std::max(OutArgumentsSizeBytes,
7683                  static_cast<uint32_t>(typeWidthInBytesOnStack(ReturnType)));
7684   }
7685   return OutArgumentsSizeBytes;
7686 }
7687 
7688 template <typename TraitsType>
7689 uint32_t TargetX86Base<TraitsType>::getCallStackArgumentsSizeBytes(
7690     const InstCall *Instr) {
7691   // Build a vector of the arguments' types.
7692   const SizeT NumArgs = Instr->getNumArgs();
7693   CfgVector<Type> ArgTypes;
7694   ArgTypes.reserve(NumArgs);
7695   for (SizeT i = 0; i < NumArgs; ++i) {
7696     Operand *Arg = Instr->getArg(i);
7697     ArgTypes.emplace_back(Arg->getType());
7698   }
  // Compute the return type (if any).
7700   Type ReturnType = IceType_void;
7701   Variable *Dest = Instr->getDest();
7702   if (Dest != nullptr)
7703     ReturnType = Dest->getType();
7704   return getShadowStoreSize<Traits>() +
7705          getCallStackArgumentsSizeBytes(ArgTypes, ReturnType);
7706 }
7707 
7708 template <typename TraitsType>
7709 Variable *TargetX86Base<TraitsType>::makeZeroedRegister(Type Ty,
7710                                                         RegNumT RegNum) {
7711   Variable *Reg = makeReg(Ty, RegNum);
7712   switch (Ty) {
7713   case IceType_i1:
7714   case IceType_i8:
7715   case IceType_i16:
7716   case IceType_i32:
7717   case IceType_i64:
7718     // Conservatively do "mov reg, 0" to avoid modifying FLAGS.
7719     _mov(Reg, Ctx->getConstantZero(Ty));
7720     break;
7721   case IceType_f32:
7722   case IceType_f64:
7723     Context.insert<InstFakeDef>(Reg);
7724     _xorps(Reg, Reg);
7725     break;
7726   default:
7727     // All vector types use the same pxor instruction.
7728     assert(isVectorType(Ty));
7729     Context.insert<InstFakeDef>(Reg);
7730     _pxor(Reg, Reg);
7731     break;
7732   }
7733   return Reg;
7734 }
7735 
7736 // There is no support for loading or emitting vector constants, so the vector
7737 // values returned from makeVectorOfZeros, makeVectorOfOnes, etc. are
7738 // initialized with register operations.
7739 //
7740 // TODO(wala): Add limited support for vector constants so that complex
7741 // initialization in registers is unnecessary.
7742 
7743 template <typename TraitsType>
7744 Variable *TargetX86Base<TraitsType>::makeVectorOfZeros(Type Ty,
7745                                                        RegNumT RegNum) {
7746   return makeZeroedRegister(Ty, RegNum);
7747 }
7748 
7749 template <typename TraitsType>
7750 Variable *TargetX86Base<TraitsType>::makeVectorOfMinusOnes(Type Ty,
7751                                                            RegNumT RegNum) {
7752   Variable *MinusOnes = makeReg(Ty, RegNum);
7753   // Insert a FakeDef so the live range of MinusOnes is not overestimated.
7754   Context.insert<InstFakeDef>(MinusOnes);
7755   if (Ty == IceType_f64)
7756     // Making a vector of minus ones of type f64 is currently only used for the
7757     // fabs intrinsic.  To use the f64 type to create this mask with pcmpeqq
7758     // requires SSE 4.1.  Since we're just creating a mask, pcmpeqd does the
7759     // same job and only requires SSE2.
7760     _pcmpeq(MinusOnes, MinusOnes, IceType_f32);
7761   else
7762     _pcmpeq(MinusOnes, MinusOnes);
7763   return MinusOnes;
7764 }
7765 
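/// A sketch of the idiom used below (instruction suffixes illustrative):
/// pcmpeq of a register with itself yields all-ones lanes (-1), so
/// subtracting that from a zeroed register materializes a vector of ones
/// without loading a constant:
///   pxor    reg, reg    ; reg = <0, 0, ...>
///   pcmpeqd tmp, tmp    ; tmp = <-1, -1, ...>
///   psubd   reg, tmp    ; reg = <1, 1, ...>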
7766 template <typename TraitsType>
7767 Variable *TargetX86Base<TraitsType>::makeVectorOfOnes(Type Ty, RegNumT RegNum) {
7768   Variable *Dest = makeVectorOfZeros(Ty, RegNum);
7769   Variable *MinusOne = makeVectorOfMinusOnes(Ty);
7770   _psub(Dest, MinusOne);
7771   return Dest;
7772 }
7773 
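/// Materialize a vector whose lanes contain only the sign bit, e.g.
/// <4 x 0x80000000> for v4i32/v4f32: all-ones lanes shifted left by
/// (element width - 1). v16i8 has no packed 8-bit shift in SSE, so that case
/// instead broadcasts the mask 0x80808080 from a 32-bit immediate (see the
/// else branch below).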
7774 template <typename TraitsType>
7775 Variable *TargetX86Base<TraitsType>::makeVectorOfHighOrderBits(Type Ty,
7776                                                                RegNumT RegNum) {
7777   assert(Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v8i16 ||
7778          Ty == IceType_v16i8);
7779   if (Ty == IceType_v4f32 || Ty == IceType_v4i32 || Ty == IceType_v8i16) {
7780     Variable *Reg = makeVectorOfOnes(Ty, RegNum);
7781     SizeT Shift =
7782         typeWidthInBytes(typeElementType(Ty)) * Traits::X86_CHAR_BIT - 1;
7783     _psll(Reg, Ctx->getConstantInt8(Shift));
7784     return Reg;
7785   } else {
7786     // SSE has no left shift operation for vectors of 8 bit integers.
7787     constexpr uint32_t HIGH_ORDER_BITS_MASK = 0x80808080;
7788     Constant *ConstantMask = Ctx->getConstantInt32(HIGH_ORDER_BITS_MASK);
7789     Variable *Reg = makeReg(Ty, RegNum);
7790     _movd(Reg, legalize(ConstantMask, Legal_Reg | Legal_Mem));
7791     _pshufd(Reg, Reg, Ctx->getConstantZero(IceType_i8));
7792     return Reg;
7793   }
7794 }
7795 
7796 /// Construct a mask in a register that can be and'ed with a floating-point
7797 /// value to mask off its sign bit. The value will be <4 x 0x7fffffff> for f32
7798 /// and v4f32, and <2 x 0x7fffffffffffffff> for f64. Construct it as vector of
7799 /// ones logically right shifted one bit.
// TODO(stichnot): Fix the wala TODO above, to represent vector constants in
// memory.
7802 template <typename TraitsType>
7803 Variable *TargetX86Base<TraitsType>::makeVectorOfFabsMask(Type Ty,
7804                                                           RegNumT RegNum) {
7805   Variable *Reg = makeVectorOfMinusOnes(Ty, RegNum);
7806   _psrl(Reg, Ctx->getConstantInt8(1));
7807   return Reg;
7808 }
7809 
7810 template <typename TraitsType>
7811 typename TargetX86Base<TraitsType>::X86OperandMem *
7812 TargetX86Base<TraitsType>::getMemoryOperandForStackSlot(Type Ty, Variable *Slot,
7813                                                         uint32_t Offset) {
  // Ensure that Slot is a stack slot.
  assert(Slot->mustNotHaveReg());
  assert(Slot->getRegNum().hasNoValue());
  // Compute the location of Slot in memory.
  // TODO(wala,stichnot): lea should not be required. The address of the stack
  // slot is known at compile time (although not until after addProlog()).
7821   const Type PointerType = getPointerType();
7822   Variable *Loc = makeReg(PointerType);
7823   _lea(Loc, Slot);
7824   Constant *ConstantOffset = Ctx->getConstantInt32(Offset);
7825   return X86OperandMem::create(Func, Ty, Loc, ConstantOffset);
7826 }
7827 
7828 /// Lowering helper to copy a scalar integer source operand into some 8-bit GPR.
7829 /// Src is assumed to already be legalized.  If the source operand is known to
7830 /// be a memory or immediate operand, a simple mov will suffice.  But if the
7831 /// source operand can be a physical register, then it must first be copied into
7832 /// a physical register that is truncable to 8-bit, then truncated into a
7833 /// physical register that can receive a truncation, and finally copied into the
7834 /// result 8-bit register (which in general can be any 8-bit register).  For
7835 /// example, moving %ebp into %ah may be accomplished as:
7836 ///   movl %ebp, %edx
7837 ///   mov_trunc %edx, %dl  // this redundant assignment is ultimately elided
7838 ///   movb %dl, %ah
7839 /// On the other hand, moving a memory or immediate operand into ah:
7840 ///   movb 4(%ebp), %ah
7841 ///   movb $my_imm, %ah
7842 ///
7843 /// Note #1.  On a 64-bit target, the "movb 4(%ebp), %ah" is likely not
7844 /// encodable, so RegNum=Reg_ah should NOT be given as an argument.  Instead,
7845 /// use RegNum=RegNumT() and then let the caller do a separate copy into
7846 /// Reg_ah.
7847 ///
7848 /// Note #2.  ConstantRelocatable operands are also put through this process
7849 /// (not truncated directly) because our ELF emitter does R_386_32 relocations
7850 /// but not R_386_8 relocations.
7851 ///
7852 /// Note #3.  If Src is a Variable, the result will be an infinite-weight i8
7853 /// Variable with the RCX86_IsTrunc8Rcvr register class.  As such, this helper
7854 /// is a convenient way to prevent ah/bh/ch/dh from being an (invalid) argument
7855 /// to the pinsrb instruction.
7856 template <typename TraitsType>
7857 Variable *TargetX86Base<TraitsType>::copyToReg8(Operand *Src, RegNumT RegNum) {
7858   Type Ty = Src->getType();
7859   assert(isScalarIntegerType(Ty));
7860   assert(Ty != IceType_i1);
7861   Variable *Reg = makeReg(IceType_i8, RegNum);
7862   Reg->setRegClass(RCX86_IsTrunc8Rcvr);
7863   if (llvm::isa<Variable>(Src) || llvm::isa<ConstantRelocatable>(Src)) {
7864     Variable *SrcTruncable = makeReg(Ty);
7865     switch (Ty) {
7866     case IceType_i64:
7867       SrcTruncable->setRegClass(RCX86_Is64To8);
7868       break;
7869     case IceType_i32:
7870       SrcTruncable->setRegClass(RCX86_Is32To8);
7871       break;
7872     case IceType_i16:
7873       SrcTruncable->setRegClass(RCX86_Is16To8);
7874       break;
7875     default:
7876       // i8 - just use default register class
7877       break;
7878     }
7879     Variable *SrcRcvr = makeReg(IceType_i8);
7880     SrcRcvr->setRegClass(RCX86_IsTrunc8Rcvr);
7881     _mov(SrcTruncable, Src);
7882     _mov(SrcRcvr, SrcTruncable);
7883     Src = SrcRcvr;
7884   }
7885   _mov(Reg, Src);
7886   return Reg;
7887 }
7888 
7889 /// Helper for legalize() to emit the right code to lower an operand to a
7890 /// register of the appropriate type.
7891 template <typename TraitsType>
7892 Variable *TargetX86Base<TraitsType>::copyToReg(Operand *Src, RegNumT RegNum) {
7893   Type Ty = Src->getType();
7894   Variable *Reg = makeReg(Ty, RegNum);
7895   if (isVectorType(Ty)) {
7896     _movp(Reg, Src);
7897   } else {
7898     _mov(Reg, Src);
7899   }
7900   return Reg;
7901 }
7902 
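/// Legalize From into one of the operand kinds permitted by the Allowed mask
/// (register, immediate, memory, rematerializable), optionally pinning the
/// result to a specific physical register via RegNum. For example,
/// legalize(Src, Legal_Reg | Legal_Mem) leaves a memory operand (with
/// legalized base/index) or a register-bound variable in place, and copies
/// anything else into a new infinite-weight register.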
7903 template <typename TraitsType>
7904 Operand *TargetX86Base<TraitsType>::legalize(Operand *From, LegalMask Allowed,
7905                                              RegNumT RegNum) {
7906   const bool UseNonsfi = getFlags().getUseNonsfi();
7907   const Type Ty = From->getType();
7908   // Assert that a physical register is allowed. To date, all calls to
7909   // legalize() allow a physical register. If a physical register needs to be
7910   // explicitly disallowed, then new code will need to be written to force a
7911   // spill.
7912   assert(Allowed & Legal_Reg);
7913   // If we're asking for a specific physical register, make sure we're not
7914   // allowing any other operand kinds. (This could be future work, e.g. allow
7915   // the shl shift amount to be either an immediate or in ecx.)
7916   assert(RegNum.hasNoValue() || Allowed == Legal_Reg);
7917 
7918   // Substitute with an available infinite-weight variable if possible.  Only do
7919   // this when we are not asking for a specific register, and when the
7920   // substitution is not locked to a specific register, and when the types
7921   // match, in order to capture the vast majority of opportunities and avoid
7922   // corner cases in the lowering.
7923   if (RegNum.hasNoValue()) {
7924     if (Variable *Subst = getContext().availabilityGet(From)) {
7925       // At this point we know there is a potential substitution available.
7926       if (Subst->mustHaveReg() && !Subst->hasReg()) {
7927         // At this point we know the substitution will have a register.
7928         if (From->getType() == Subst->getType()) {
7929           // At this point we know the substitution's register is compatible.
7930           return Subst;
7931         }
7932       }
7933     }
7934   }
7935 
7936   if (auto *Mem = llvm::dyn_cast<X86OperandMem>(From)) {
7937     // Before doing anything with a Mem operand, we need to ensure that the
7938     // Base and Index components are in physical registers.
7939     Variable *Base = Mem->getBase();
7940     Variable *Index = Mem->getIndex();
7941     Constant *Offset = Mem->getOffset();
7942     Variable *RegBase = nullptr;
7943     Variable *RegIndex = nullptr;
7944     uint16_t Shift = Mem->getShift();
7945     if (Base) {
7946       RegBase = llvm::cast<Variable>(
7947           legalize(Base, Legal_Reg | Legal_Rematerializable));
7948     }
7949     if (Index) {
7950       // TODO(jpp): perhaps we should only allow Legal_Reg if
7951       // Base->isRematerializable.
7952       RegIndex = llvm::cast<Variable>(
7953           legalize(Index, Legal_Reg | Legal_Rematerializable));
7954     }
7955 
7956     if (Base != RegBase || Index != RegIndex) {
7957       Mem = X86OperandMem::create(Func, Ty, RegBase, Offset, RegIndex, Shift,
7958                                   Mem->getSegmentRegister());
7959     }
7960 
7961     From = Mem;
7962 
7963     if (!(Allowed & Legal_Mem)) {
7964       From = copyToReg(From, RegNum);
7965     }
7966     return From;
7967   }
7968 
7969   if (auto *Const = llvm::dyn_cast<Constant>(From)) {
7970     if (llvm::isa<ConstantUndef>(Const)) {
7971       From = legalizeUndef(Const, RegNum);
7972       if (isVectorType(Ty))
7973         return From;
7974       Const = llvm::cast<Constant>(From);
7975     }
7976     // There should be no constants of vector type (other than undef).
7977     assert(!isVectorType(Ty));
7978 
7979     // If the operand is a 64 bit constant integer we need to legalize it to a
7980     // register in x86-64.
7981     if (Traits::Is64Bit) {
7982       if (auto *C64 = llvm::dyn_cast<ConstantInteger64>(Const)) {
7983         if (!Utils::IsInt(32, C64->getValue())) {
7984           if (RegNum.hasValue()) {
7985             assert(Traits::getGprForType(IceType_i64, RegNum) == RegNum);
7986           }
7987           return copyToReg(Const, RegNum);
7988         }
7989       }
7990     }
7991 
7992     if (auto *CR = llvm::dyn_cast<ConstantRelocatable>(Const)) {
7993       // If the operand is a ConstantRelocatable, and Legal_AddrAbs is not
7994       // specified, and UseNonsfi is indicated, we need to add RebasePtr.
7995       if (UseNonsfi && !(Allowed & Legal_AddrAbs)) {
7996         assert(Ty == IceType_i32);
7997         Variable *NewVar = makeReg(Ty, RegNum);
7998         auto *Mem = Traits::X86OperandMem::create(Func, Ty, nullptr, CR);
7999         // LEAs are not automatically sandboxed, thus we explicitly invoke
8000         // _sandbox_mem_reference.
8001         _lea(NewVar, _sandbox_mem_reference(Mem));
8002         From = NewVar;
8003       }
8004     } else if (isScalarFloatingType(Ty)) {
8005       // Convert a scalar floating point constant into an explicit memory
8006       // operand.
8007       if (auto *ConstFloat = llvm::dyn_cast<ConstantFloat>(Const)) {
8008         if (Utils::isPositiveZero(ConstFloat->getValue()))
8009           return makeZeroedRegister(Ty, RegNum);
8010       } else if (auto *ConstDouble = llvm::dyn_cast<ConstantDouble>(Const)) {
8011         if (Utils::isPositiveZero(ConstDouble->getValue()))
8012           return makeZeroedRegister(Ty, RegNum);
8013       }
8014 
8015       auto *CFrom = llvm::cast<Constant>(From);
8016       assert(CFrom->getShouldBePooled());
8017       Constant *Offset = Ctx->getConstantSym(0, CFrom->getLabelName());
8018       auto *Mem = X86OperandMem::create(Func, Ty, nullptr, Offset);
8019       From = Mem;
8020     }
8021 
8022     bool NeedsReg = false;
8023     if (!(Allowed & Legal_Imm) && !isScalarFloatingType(Ty))
8024       // Immediate specifically not allowed.
8025       NeedsReg = true;
8026     if (!(Allowed & Legal_Mem) && isScalarFloatingType(Ty))
8027       // On x86, FP constants are lowered to mem operands.
8028       NeedsReg = true;
8029     if (NeedsReg) {
8030       From = copyToReg(From, RegNum);
8031     }
8032     return From;
8033   }
8034 
8035   if (auto *Var = llvm::dyn_cast<Variable>(From)) {
8036     // Check if the variable is guaranteed a physical register. This can happen
8037     // either when the variable is pre-colored or when it is assigned infinite
8038     // weight.
8039     bool MustHaveRegister = (Var->hasReg() || Var->mustHaveReg());
8040     bool MustRematerialize =
8041         (Var->isRematerializable() && !(Allowed & Legal_Rematerializable));
8042     // We need a new physical register for the operand if:
8043     // - Mem is not allowed and Var isn't guaranteed a physical register, or
8044     // - RegNum is required and Var->getRegNum() doesn't match, or
8045     // - Var is a rematerializable variable and rematerializable pass-through is
8046     //   not allowed (in which case we need a lea instruction).
8047     if (MustRematerialize) {
8048       Variable *NewVar = makeReg(Ty, RegNum);
8049       // Since Var is rematerializable, the offset will be added when the lea is
8050       // emitted.
8051       constexpr Constant *NoOffset = nullptr;
8052       auto *Mem = X86OperandMem::create(Func, Ty, Var, NoOffset);
8053       _lea(NewVar, Mem);
8054       From = NewVar;
8055     } else if ((!(Allowed & Legal_Mem) && !MustHaveRegister) ||
8056                (RegNum.hasValue() && RegNum != Var->getRegNum())) {
8057       From = copyToReg(From, RegNum);
8058     }
8059     return From;
8060   }
8061 
8062   llvm::report_fatal_error("Unhandled operand kind in legalize()");
8063   return From;
8064 }
8065 
8066 /// Provide a trivial wrapper to legalize() for this common usage.
8067 template <typename TraitsType>
8068 Variable *TargetX86Base<TraitsType>::legalizeToReg(Operand *From,
8069                                                    RegNumT RegNum) {
8070   return llvm::cast<Variable>(legalize(From, Legal_Reg, RegNum));
8071 }
8072 
8073 /// Legalize undef values to concrete values.
8074 template <typename TraitsType>
8075 Operand *TargetX86Base<TraitsType>::legalizeUndef(Operand *From,
8076                                                   RegNumT RegNum) {
8077   Type Ty = From->getType();
8078   if (llvm::isa<ConstantUndef>(From)) {
8079     // Lower undefs to zero.  Another option is to lower undefs to an
8080     // uninitialized register; however, using an uninitialized register results
8081     // in less predictable code.
8082     //
8083     // If in the future the implementation is changed to lower undef values to
8084     // uninitialized registers, a FakeDef will be needed:
8085     //     Context.insert<InstFakeDef>(Reg);
8086     // This is in order to ensure that the live range of Reg is not
8087     // overestimated.  If the constant being lowered is a 64 bit value, then
8088     // the result should be split and the lo and hi components will need to go
8089     // in uninitialized registers.
8090     if (isVectorType(Ty))
8091       return makeVectorOfZeros(Ty, RegNum);
8092     return Ctx->getConstantZero(Ty);
8093   }
8094   return From;
8095 }
8096 
8097 /// For the cmp instruction, if Src1 is an immediate, or known to be a physical
8098 /// register, we can allow Src0 to be a memory operand. Otherwise, Src0 must be
8099 /// copied into a physical register. (Actually, either Src0 or Src1 can be
8100 /// chosen for the physical register, but unfortunately we have to commit to one
8101 /// or the other before register allocation.)
8102 template <typename TraitsType>
8103 Operand *TargetX86Base<TraitsType>::legalizeSrc0ForCmp(Operand *Src0,
8104                                                        Operand *Src1) {
8105   bool IsSrc1ImmOrReg = false;
8106   if (llvm::isa<Constant>(Src1)) {
8107     IsSrc1ImmOrReg = true;
8108   } else if (auto *Var = llvm::dyn_cast<Variable>(Src1)) {
8109     if (Var->hasReg())
8110       IsSrc1ImmOrReg = true;
8111   }
8112   return legalize(Src0, IsSrc1ImmOrReg ? (Legal_Reg | Legal_Mem) : Legal_Reg);
8113 }
8114 
8115 template <typename TraitsType>
8116 typename TargetX86Base<TraitsType>::X86OperandMem *
8117 TargetX86Base<TraitsType>::formMemoryOperand(Operand *Opnd, Type Ty,
8118                                              bool DoLegalize) {
8119   auto *Mem = llvm::dyn_cast<X86OperandMem>(Opnd);
8120   // It may be the case that address mode optimization already creates an
8121   // X86OperandMem, so in that case it wouldn't need another level of
8122   // transformation.
8123   if (!Mem) {
8124     auto *Base = llvm::dyn_cast<Variable>(Opnd);
8125     auto *Offset = llvm::dyn_cast<Constant>(Opnd);
8126     assert(Base || Offset);
8127     if (Offset) {
8128       if (!llvm::isa<ConstantRelocatable>(Offset)) {
8129         if (llvm::isa<ConstantInteger64>(Offset)) {
8130           // Memory operands cannot have 64-bit immediates, so they must be
8131           // legalized into a register only.
8132           Base = llvm::cast<Variable>(legalize(Offset, Legal_Reg));
8133           Offset = nullptr;
8134         } else {
8135           Offset = llvm::cast<Constant>(legalize(Offset));
8136 
8137           assert(llvm::isa<ConstantInteger32>(Offset) ||
8138                  llvm::isa<ConstantRelocatable>(Offset));
8139         }
8140       }
8141     }
8142     Mem = X86OperandMem::create(Func, Ty, Base, Offset);
8143   }
8144   return llvm::cast<X86OperandMem>(DoLegalize ? legalize(Mem) : Mem);
8145 }
8146 
8147 template <typename TraitsType>
8148 Variable *TargetX86Base<TraitsType>::makeReg(Type Type, RegNumT RegNum) {
8149   // There aren't any 64-bit integer registers for x86-32.
8150   assert(Traits::Is64Bit || Type != IceType_i64);
8151   Variable *Reg = Func->makeVariable(Type);
8152   if (RegNum.hasValue())
8153     Reg->setRegNum(RegNum);
8154   else
8155     Reg->setMustHaveReg();
8156   return Reg;
8157 }
8158 
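// TypeForSize maps a power-of-two byte size (by log2 index) to a copyable
// type of that size: 1 -> i8, 2 -> i16, 4 -> i32, 8 -> f64, 16 -> v16i8.
// For example (illustrative): largestTypeInSize(6) yields IceType_i32 (the
// largest type not exceeding 6 bytes), while firstTypeThatFitsSize(6) yields
// IceType_f64 (the smallest type that covers 6 bytes).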
8159 const Type TypeForSize[] = {IceType_i8, IceType_i16, IceType_i32, IceType_f64,
8160                             IceType_v16i8};
8161 
8162 template <typename TraitsType>
8163 Type TargetX86Base<TraitsType>::largestTypeInSize(uint32_t Size,
8164                                                   uint32_t MaxSize) {
8165   assert(Size != 0);
8166   uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
8167   uint32_t MaxIndex = MaxSize == NoSizeLimit
8168                           ? llvm::array_lengthof(TypeForSize) - 1
8169                           : llvm::findLastSet(MaxSize, llvm::ZB_Undefined);
8170   return TypeForSize[std::min(TyIndex, MaxIndex)];
8171 }
8172 
8173 template <typename TraitsType>
8174 Type TargetX86Base<TraitsType>::firstTypeThatFitsSize(uint32_t Size,
8175                                                       uint32_t MaxSize) {
8176   assert(Size != 0);
8177   uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
8178   if (!llvm::isPowerOf2_32(Size))
8179     ++TyIndex;
8180   uint32_t MaxIndex = MaxSize == NoSizeLimit
8181                           ? llvm::array_lengthof(TypeForSize) - 1
8182                           : llvm::findLastSet(MaxSize, llvm::ZB_Undefined);
8183   return TypeForSize[std::min(TyIndex, MaxIndex)];
8184 }
8185 
8186 template <typename TraitsType> void TargetX86Base<TraitsType>::postLower() {
8187   if (Func->getOptLevel() == Opt_m1)
8188     return;
8189   markRedefinitions();
8190   Context.availabilityUpdate();
8191 }
8192 
8193 template <typename TraitsType>
8194 void TargetX86Base<TraitsType>::emit(const ConstantInteger32 *C) const {
8195   if (!BuildDefs::dump())
8196     return;
8197   Ostream &Str = Ctx->getStrEmit();
8198   Str << "$" << C->getValue();
8199 }
8200 
8201 template <typename TraitsType>
8202 void TargetX86Base<TraitsType>::emit(const ConstantInteger64 *C) const {
8203   if (!Traits::Is64Bit) {
8204     llvm::report_fatal_error("Not expecting to emit 64-bit integers");
8205   } else {
8206     if (!BuildDefs::dump())
8207       return;
8208     Ostream &Str = Ctx->getStrEmit();
8209     Str << "$" << C->getValue();
8210   }
8211 }
8212 
8213 template <typename TraitsType>
8214 void TargetX86Base<TraitsType>::emit(const ConstantFloat *C) const {
8215   if (!BuildDefs::dump())
8216     return;
8217   Ostream &Str = Ctx->getStrEmit();
8218   Str << C->getLabelName();
8219 }
8220 
8221 template <typename TraitsType>
8222 void TargetX86Base<TraitsType>::emit(const ConstantDouble *C) const {
8223   if (!BuildDefs::dump())
8224     return;
8225   Ostream &Str = Ctx->getStrEmit();
8226   Str << C->getLabelName();
8227 }
8228 
8229 template <typename TraitsType>
8230 void TargetX86Base<TraitsType>::emit(const ConstantUndef *) const {
8231   llvm::report_fatal_error("undef value encountered by emitter.");
8232 }
8233 
template <typename TraitsType>
void TargetX86Base<TraitsType>::emit(const ConstantRelocatable *C) const {
8236   if (!BuildDefs::dump())
8237     return;
8238   assert(!getFlags().getUseNonsfi() ||
8239          C->getName().toString() == GlobalOffsetTable);
8240   Ostream &Str = Ctx->getStrEmit();
8241   Str << "$";
8242   emitWithoutPrefix(C);
8243 }
8244 
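/// The emitted table is plain assembly along these lines (section and label
/// names illustrative):
///   .section .rodata.<fn>$jumptable,"a",@progbits
///   .align 4
///   <table label>:
///   .long <target block 0>
///   .long <target block 1>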
8245 template <typename TraitsType>
8246 void TargetX86Base<TraitsType>::emitJumpTable(
8247     const Cfg *, const InstJumpTable *JumpTable) const {
8248   if (!BuildDefs::dump())
8249     return;
8250   Ostream &Str = Ctx->getStrEmit();
8251   const bool UseNonsfi = getFlags().getUseNonsfi();
8252   const char *Prefix = UseNonsfi ? ".data.rel.ro." : ".rodata.";
8253   Str << "\t.section\t" << Prefix << JumpTable->getSectionName()
8254       << ",\"a\",@progbits\n"
8255          "\t.align\t"
8256       << typeWidthInBytes(getPointerType()) << "\n"
8257       << JumpTable->getName() << ":";
8258 
8259   // On X86 ILP32 pointers are 32-bit hence the use of .long
8260   for (SizeT I = 0; I < JumpTable->getNumTargets(); ++I)
8261     Str << "\n\t.long\t" << JumpTable->getTarget(I)->getAsmName();
8262   Str << "\n";
8263 }
8264 
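/// Emit one pooled-constant section for the converter type T. A sketch of the
/// output for a pooled f32 constant (label and value illustrative):
///   .section .rodata.cst4,"aM",@progbits,4
///   .align 4
///   <label>:
///     .long 0x3f800000 /* f32 1.000000e+00 */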
8265 template <typename TraitsType>
8266 template <typename T>
8267 void TargetDataX86<TraitsType>::emitConstantPool(GlobalContext *Ctx) {
8268   if (!BuildDefs::dump())
8269     return;
8270   Ostream &Str = Ctx->getStrEmit();
8271   Type Ty = T::Ty;
8272   SizeT Align = typeAlignInBytes(Ty);
8273   ConstantList Pool = Ctx->getConstantPool(Ty);
8274 
8275   Str << "\t.section\t.rodata.cst" << Align << ",\"aM\",@progbits," << Align
8276       << "\n";
8277   Str << "\t.align\t" << Align << "\n";
8278 
8279   for (Constant *C : Pool) {
8280     if (!C->getShouldBePooled())
8281       continue;
8282     auto *Const = llvm::cast<typename T::IceType>(C);
8283     typename T::IceType::PrimType Value = Const->getValue();
8284     // Use memcpy() to copy bits from Value into RawValue in a way that avoids
8285     // breaking strict-aliasing rules.
8286     typename T::PrimitiveIntType RawValue;
8287     memcpy(&RawValue, &Value, sizeof(Value));
    char buf[30];
    int CharsPrinted =
        snprintf(buf, llvm::array_lengthof(buf), T::PrintfString, RawValue);
    assert(CharsPrinted >= 0);
    assert((size_t)CharsPrinted < llvm::array_lengthof(buf));
    (void)CharsPrinted; // avoid warnings if asserts are disabled
    Str << Const->getLabelName();
    Str << ":\n\t" << T::AsmTag << "\t" << buf << "\t/* " << T::TypeName << " "
        << Value << " */\n";
  }
}

template <typename TraitsType>
void TargetDataX86<TraitsType>::lowerConstants() {
  if (getFlags().getDisableTranslation())
    return;
  switch (getFlags().getOutFileType()) {
  case FT_Elf: {
    ELFObjectWriter *Writer = Ctx->getObjectWriter();
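    // Integer pools first (i8/i16/i32), then the floating-point pools; the
    // order mirrors the textual FT_Asm/FT_Iasm emission below.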

    Writer->writeConstantPool<ConstantInteger32>(IceType_i8);
    Writer->writeConstantPool<ConstantInteger32>(IceType_i16);
    Writer->writeConstantPool<ConstantInteger32>(IceType_i32);

    Writer->writeConstantPool<ConstantFloat>(IceType_f32);
    Writer->writeConstantPool<ConstantDouble>(IceType_f64);
  } break;
  case FT_Asm:
  case FT_Iasm: {
    OstreamLocker L(Ctx);

    emitConstantPool<PoolTypeConverter<uint8_t>>(Ctx);
    emitConstantPool<PoolTypeConverter<uint16_t>>(Ctx);
    emitConstantPool<PoolTypeConverter<uint32_t>>(Ctx);

    emitConstantPool<PoolTypeConverter<float>>(Ctx);
    emitConstantPool<PoolTypeConverter<double>>(Ctx);
  } break;
  }
}

template <typename TraitsType>
void TargetDataX86<TraitsType>::lowerJumpTables() {
  const bool IsPIC = getFlags().getUseNonsfi();
  switch (getFlags().getOutFileType()) {
  case FT_Elf: {
    ELFObjectWriter *Writer = Ctx->getObjectWriter();
    constexpr FixupKind FK_Abs64 = llvm::ELF::R_X86_64_64;
    const FixupKind RelocationKind =
        (getPointerType() == IceType_i32) ? Traits::FK_Abs : FK_Abs64;
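    // Each table entry is pointer-sized: 32-bit pointers use the target's
    // 32-bit absolute fixup, while 64-bit pointers need R_X86_64_64 so the
    // linker can patch a full 8-byte address.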
    for (const JumpTableData &JT : Ctx->getJumpTables())
      Writer->writeJumpTable(JT, RelocationKind, IsPIC);
  } break;
  case FT_Asm:
    // Already emitted from Cfg
    break;
  case FT_Iasm: {
    if (!BuildDefs::dump())
      return;
    Ostream &Str = Ctx->getStrEmit();
    const char *Prefix = IsPIC ? ".data.rel.ro." : ".rodata.";
    for (const JumpTableData &JT : Ctx->getJumpTables()) {
      Str << "\t.section\t" << Prefix << JT.getSectionName()
          << ",\"a\",@progbits\n"
             "\t.align\t"
          << typeWidthInBytes(getPointerType()) << "\n"
          << JT.getName().toString() << ":";

      // On X8664, ILP32 pointers are 32-bit, hence the use of .long.
      for (intptr_t TargetOffset : JT.getTargetOffsets())
        Str << "\n\t.long\t" << JT.getFunctionName() << "+" << TargetOffset;
      Str << "\n";
    }
  } break;
  }
}

template <typename TraitsType>
void TargetDataX86<TraitsType>::lowerGlobals(
    const VariableDeclarationList &Vars, const std::string &SectionSuffix) {
  const bool IsPIC = getFlags().getUseNonsfi();
  switch (getFlags().getOutFileType()) {
  case FT_Elf: {
    ELFObjectWriter *Writer = Ctx->getObjectWriter();
    Writer->writeDataSection(Vars, Traits::FK_Abs, SectionSuffix, IsPIC);
  } break;
  case FT_Asm:
  case FT_Iasm: {
    OstreamLocker L(Ctx);
    for (const VariableDeclaration *Var : Vars) {
      if (getFlags().matchTranslateOnly(Var->getName(), 0)) {
        emitGlobal(*Var, SectionSuffix);
      }
    }
  } break;
  }
}
} // end of namespace X86NAMESPACE
} // end of namespace Ice

#endif // SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H