//===- subzero/src/IceTargetLoweringX86BaseImpl.h - x86 lowering -*- C++ -*-==//
//
// The Subzero Code Generator
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// \file
/// \brief Implements the TargetLoweringX86Base class, which consists almost
/// entirely of the lowering sequence for each high-level instruction.
///
//===----------------------------------------------------------------------===//

#ifndef SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H
#define SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H

#include "IceCfg.h"
#include "IceCfgNode.h"
#include "IceClFlags.h"
#include "IceDefs.h"
#include "IceELFObjectWriter.h"
#include "IceGlobalInits.h"
#include "IceInstVarIter.h"
#include "IceInstX86Base.h"
#include "IceLiveness.h"
#include "IceOperand.h"
#include "IcePhiLoweringImpl.h"
#include "IceUtils.h"
#include "IceVariableSplitting.h"

#include "llvm/Support/MathExtras.h"

#include <stack>

namespace Ice {
namespace X86 {
template <typename T> struct PoolTypeConverter {};

template <> struct PoolTypeConverter<float> {
  using PrimitiveIntType = uint32_t;
  using IceType = ConstantFloat;
  static const Type Ty = IceType_f32;
  static const char *TypeName;
  static const char *AsmTag;
  static const char *PrintfString;
};

template <> struct PoolTypeConverter<double> {
  using PrimitiveIntType = uint64_t;
  using IceType = ConstantDouble;
  static const Type Ty = IceType_f64;
  static const char *TypeName;
  static const char *AsmTag;
  static const char *PrintfString;
};

// Add converter for int type constant pooling
template <> struct PoolTypeConverter<uint32_t> {
  using PrimitiveIntType = uint32_t;
  using IceType = ConstantInteger32;
  static const Type Ty = IceType_i32;
  static const char *TypeName;
  static const char *AsmTag;
  static const char *PrintfString;
};

// Add converter for int type constant pooling
template <> struct PoolTypeConverter<uint16_t> {
  using PrimitiveIntType = uint32_t;
  using IceType = ConstantInteger32;
  static const Type Ty = IceType_i16;
  static const char *TypeName;
  static const char *AsmTag;
  static const char *PrintfString;
};

// Add converter for int type constant pooling
template <> struct PoolTypeConverter<uint8_t> {
  using PrimitiveIntType = uint32_t;
  using IceType = ConstantInteger32;
  static const Type Ty = IceType_i8;
  static const char *TypeName;
  static const char *AsmTag;
  static const char *PrintfString;
};
} // end of namespace X86

namespace X86NAMESPACE {

// The Microsoft x64 ABI requires the caller to allocate a minimum 32 byte
// "shadow store" (aka "home space") so that the callee may copy the 4
// register args to it.
template <typename Traits> SizeT getShadowStoreSize() {
#if defined(SUBZERO_USE_MICROSOFT_ABI)
  static const SizeT ShadowStoreSize =
      Traits::Is64Bit ? 4 * typeWidthInBytes(Traits::WordType) : 0;
  return ShadowStoreSize;
#else
  return 0;
#endif
}
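
// Illustrative call-site shape under the Microsoft x64 ABI (hypothetical
// assembly, not emitted by this helper): the caller reserves 4 * 8 = 32 bytes
// of shadow store, which the callee sees just above its return address, e.g.
//   subq  $40, %rsp    ; 32-byte shadow store + 8 bytes to keep alignment
//   callq callee       ; callee may spill %rcx/%rdx/%r8/%r9 there
//   addq  $40, %rsp
// The extra 8 bytes and the absence of stack-passed arguments are assumptions
// of this sketch.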

using Utils::BoolFlagSaver;

template <typename Traits> class BoolFoldingEntry {
  BoolFoldingEntry(const BoolFoldingEntry &) = delete;

public:
  BoolFoldingEntry() = default;
  explicit BoolFoldingEntry(Inst *I);
  BoolFoldingEntry &operator=(const BoolFoldingEntry &) = default;
  /// Instr is the instruction producing the i1-type variable of interest.
  Inst *Instr = nullptr;
  /// IsComplex is the cached result of BoolFolding::hasComplexLowering(Instr).
  bool IsComplex = false;
  /// IsLiveOut is initialized conservatively to true, and is set to false when
  /// we encounter an instruction that ends Var's live range. We disable the
  /// folding optimization when Var is live beyond this basic block. Note that
  /// if liveness analysis is not performed (e.g. in Om1 mode), IsLiveOut will
  /// always be true and the folding optimization will never be performed.
  bool IsLiveOut = true;
  // NumUses counts the number of times Var is used as a source operand in the
  // basic block. If IsComplex is true and there is more than one use of Var,
  // then the folding optimization is disabled for Var.
  uint32_t NumUses = 0;
};
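
// A sketch of the pattern this bookkeeping enables (illustrative ICE-style
// pseudo code, not tied to exact Subzero syntax): a bool producer such as
//   %cond = icmp lt i32 %a, %b
// that is consumed only by
//   br %cond, label %t, label %f
// can be lowered as a single cmp followed by a conditional jump, instead of
// materializing %cond into a register and testing it again at the branch.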

template <typename Traits> class BoolFolding {
public:
  enum BoolFoldingProducerKind {
    PK_None,
    // TODO(jpp): PK_Icmp32 is no longer meaningful. Rename to PK_IcmpNative.
    PK_Icmp32,
    PK_Icmp64,
    PK_Fcmp,
    PK_Trunc,
    PK_Arith // A flag-setting arithmetic instruction.
  };

  /// Currently the actual enum values are not used (other than CK_None), but
  /// we go ahead and produce them anyway for symmetry with the
  /// BoolFoldingProducerKind.
  enum BoolFoldingConsumerKind { CK_None, CK_Br, CK_Select, CK_Sext, CK_Zext };

private:
  BoolFolding(const BoolFolding &) = delete;
  BoolFolding &operator=(const BoolFolding &) = delete;

public:
  BoolFolding() = default;
  static BoolFoldingProducerKind getProducerKind(const Inst *Instr);
  static BoolFoldingConsumerKind getConsumerKind(const Inst *Instr);
  static bool hasComplexLowering(const Inst *Instr);
  static bool isValidFolding(BoolFoldingProducerKind ProducerKind,
                             BoolFoldingConsumerKind ConsumerKind);
  void init(CfgNode *Node);
  const Inst *getProducerFor(const Operand *Opnd) const;
  void dump(const Cfg *Func) const;

private:
  /// Returns true if Producers contains a valid entry for the given VarNum.
  bool containsValid(SizeT VarNum) const {
    auto Element = Producers.find(VarNum);
    return Element != Producers.end() && Element->second.Instr != nullptr;
  }
  void setInvalid(SizeT VarNum) { Producers[VarNum].Instr = nullptr; }
  void invalidateProducersOnStore(const Inst *Instr);
  /// Producers maps Variable::Number to a BoolFoldingEntry.
  CfgUnorderedMap<SizeT, BoolFoldingEntry<Traits>> Producers;
};

template <typename Traits>
BoolFoldingEntry<Traits>::BoolFoldingEntry(Inst *I)
    : Instr(I), IsComplex(BoolFolding<Traits>::hasComplexLowering(I)) {}

template <typename Traits>
typename BoolFolding<Traits>::BoolFoldingProducerKind
BoolFolding<Traits>::getProducerKind(const Inst *Instr) {
  if (llvm::isa<InstIcmp>(Instr)) {
    if (Traits::Is64Bit || Instr->getSrc(0)->getType() != IceType_i64)
      return PK_Icmp32;
    return PK_Icmp64;
  }
  if (llvm::isa<InstFcmp>(Instr))
    return PK_Fcmp;
  if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Instr)) {
    if (Traits::Is64Bit || Arith->getSrc(0)->getType() != IceType_i64) {
      switch (Arith->getOp()) {
      default:
        return PK_None;
      case InstArithmetic::And:
      case InstArithmetic::Or:
        return PK_Arith;
      }
    }
  }
  return PK_None; // TODO(stichnot): remove this

  if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
    switch (Cast->getCastKind()) {
    default:
      return PK_None;
    case InstCast::Trunc:
      return PK_Trunc;
    }
  }
  return PK_None;
}

template <typename Traits>
typename BoolFolding<Traits>::BoolFoldingConsumerKind
BoolFolding<Traits>::getConsumerKind(const Inst *Instr) {
  if (llvm::isa<InstBr>(Instr))
    return CK_Br;
  if (llvm::isa<InstSelect>(Instr))
    return CK_Select;
  return CK_None; // TODO(stichnot): remove this

  if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
    switch (Cast->getCastKind()) {
    default:
      return CK_None;
    case InstCast::Sext:
      return CK_Sext;
    case InstCast::Zext:
      return CK_Zext;
    }
  }
  return CK_None;
}

/// Returns true if the producing instruction has a "complex" lowering sequence.
/// This generally means that its lowering sequence requires more than one
/// conditional branch, namely 64-bit integer compares and some floating-point
/// compares. When this is true, and there is more than one consumer, we prefer
/// to disable the folding optimization because it minimizes branches.
template <typename Traits>
bool BoolFolding<Traits>::hasComplexLowering(const Inst *Instr) {
  switch (getProducerKind(Instr)) {
  default:
    return false;
  case PK_Icmp64:
    return !Traits::Is64Bit;
  case PK_Fcmp:
    return Traits::TableFcmp[llvm::cast<InstFcmp>(Instr)->getCondition()].C2 !=
           Traits::Cond::Br_None;
  }
}

template <typename Traits>
bool BoolFolding<Traits>::isValidFolding(
    typename BoolFolding<Traits>::BoolFoldingProducerKind ProducerKind,
    typename BoolFolding<Traits>::BoolFoldingConsumerKind ConsumerKind) {
  switch (ProducerKind) {
  default:
    return false;
  case PK_Icmp32:
  case PK_Icmp64:
  case PK_Fcmp:
    return (ConsumerKind == CK_Br) || (ConsumerKind == CK_Select);
  case PK_Arith:
    return ConsumerKind == CK_Br;
  }
}
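
// Rough illustration of what "complex" means in hasComplexLowering() above
// (hypothetical lowering shape, for exposition only): on a 32-bit target,
// branching on
//   %c = icmp ult i64 %x, %y
// typically needs separate compares of the high and low 32-bit halves and
// more than one conditional jump, so duplicating such a producer at several
// consumers would multiply that branchy sequence.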

template <typename Traits> void BoolFolding<Traits>::init(CfgNode *Node) {
  Producers.clear();
  for (Inst &Instr : Node->getInsts()) {
    if (Instr.isDeleted())
      continue;
    invalidateProducersOnStore(&Instr);
    // Check whether Instr is a valid producer.
    Variable *Var = Instr.getDest();
    if (Var) { // only consider instructions with an actual dest var
      if (isBooleanType(Var->getType())) {        // only bool-type dest vars
        if (getProducerKind(&Instr) != PK_None) { // white-listed instructions
          Producers[Var->getIndex()] = BoolFoldingEntry<Traits>(&Instr);
        }
      }
    }
    // Check each src variable against the map.
    FOREACH_VAR_IN_INST(Var, Instr) {
      SizeT VarNum = Var->getIndex();
      if (!containsValid(VarNum))
        continue;
      // All valid consumers use Var as the first source operand.
      if (IndexOfVarOperandInInst(Var) != 0) {
        setInvalid(VarNum);
        continue;
      }
      // Consumer instructions must be white-listed.
      typename BoolFolding<Traits>::BoolFoldingConsumerKind ConsumerKind =
          getConsumerKind(&Instr);
      if (ConsumerKind == CK_None) {
        setInvalid(VarNum);
        continue;
      }
      typename BoolFolding<Traits>::BoolFoldingProducerKind ProducerKind =
          getProducerKind(Producers[VarNum].Instr);
      if (!isValidFolding(ProducerKind, ConsumerKind)) {
        setInvalid(VarNum);
        continue;
      }
      // Avoid creating multiple copies of complex producer instructions.
      if (Producers[VarNum].IsComplex && Producers[VarNum].NumUses > 0) {
        setInvalid(VarNum);
        continue;
      }
      ++Producers[VarNum].NumUses;
      if (Instr.isLastUse(Var)) {
        Producers[VarNum].IsLiveOut = false;
      }
    }
  }
  for (auto &I : Producers) {
    // Ignore entries previously marked invalid.
    if (I.second.Instr == nullptr)
      continue;
    // Disable the producer if its dest may be live beyond this block.
    if (I.second.IsLiveOut) {
      setInvalid(I.first);
      continue;
    }
    // Mark as "dead" rather than outright deleting. This is so that other
    // peephole style optimizations during or before lowering have access to
    // this instruction in undeleted form. See for example
    // tryOptimizedCmpxchgCmpBr().
    I.second.Instr->setDead();
  }
}

template <typename Traits>
const Inst *BoolFolding<Traits>::getProducerFor(const Operand *Opnd) const {
  auto *Var = llvm::dyn_cast<const Variable>(Opnd);
  if (Var == nullptr)
    return nullptr;
  SizeT VarNum = Var->getIndex();
  auto Element = Producers.find(VarNum);
  if (Element == Producers.end())
    return nullptr;
  return Element->second.Instr;
}

template <typename Traits>
void BoolFolding<Traits>::dump(const Cfg *Func) const {
  if (!BuildDefs::dump() || !Func->isVerbose(IceV_Folding))
    return;
  OstreamLocker L(Func->getContext());
  Ostream &Str = Func->getContext()->getStrDump();
  for (auto &I : Producers) {
    if (I.second.Instr == nullptr)
      continue;
    Str << "Found foldable producer:\n ";
    I.second.Instr->dump(Func);
    Str << "\n";
  }
}

/// If the given instruction has potential memory side effects (e.g. store, rmw,
/// or a call instruction with potential memory side effects), then we must not
/// allow a pre-store Producer instruction with memory operands to be folded
/// into a post-store Consumer instruction. If this is detected, the Producer
/// is invalidated.
///
/// We use the Producer's IsLiveOut field to determine whether any potential
/// Consumers come after this store instruction. The IsLiveOut field is
/// initialized to true, and BoolFolding::init() sets IsLiveOut to false when it
/// sees the variable's definitive last use (indicating the variable is not in
/// the node's live-out set). Thus if we see here that IsLiveOut is false, we
/// know that there can be no consumers after the store, and therefore we know
/// the folding is safe despite the store instruction.
template <typename Traits>
void BoolFolding<Traits>::invalidateProducersOnStore(const Inst *Instr) {
  if (!Instr->isMemoryWrite())
    return;
  for (auto &ProducerPair : Producers) {
    if (!ProducerPair.second.IsLiveOut)
      continue;
    Inst *PInst = ProducerPair.second.Instr;
    if (PInst == nullptr)
      continue;
    bool HasMemOperand = false;
    const SizeT SrcSize = PInst->getSrcSize();
    for (SizeT I = 0; I < SrcSize; ++I) {
      if (llvm::isa<typename Traits::X86OperandMem>(PInst->getSrc(I))) {
        HasMemOperand = true;
        break;
      }
    }
    if (!HasMemOperand)
      continue;
    setInvalid(ProducerPair.first);
  }
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::initNodeForLowering(CfgNode *Node) {
  FoldingInfo.init(Node);
  FoldingInfo.dump(Func);
}

template <typename TraitsType>
TargetX86Base<TraitsType>::TargetX86Base(Cfg *Func)
    : TargetLowering(Func), NeedSandboxing(SandboxingType == ST_NaCl) {
  static_assert(
      (Traits::InstructionSet::End - Traits::InstructionSet::Begin) ==
          (TargetInstructionSet::X86InstructionSet_End -
           TargetInstructionSet::X86InstructionSet_Begin),
      "Traits::InstructionSet range different from TargetInstructionSet");
  if (getFlags().getTargetInstructionSet() !=
      TargetInstructionSet::BaseInstructionSet) {
    InstructionSet = static_cast<InstructionSetEnum>(
        (getFlags().getTargetInstructionSet() -
         TargetInstructionSet::X86InstructionSet_Begin) +
        Traits::InstructionSet::Begin);
  }
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::staticInit(GlobalContext *Ctx) {
  RegNumT::setLimit(Traits::RegisterSet::Reg_NUM);
  Traits::initRegisterSet(getFlags(), &TypeToRegisterSet, &RegisterAliases);
  for (size_t i = 0; i < TypeToRegisterSet.size(); ++i)
    TypeToRegisterSetUnfiltered[i] = TypeToRegisterSet[i];
  filterTypeToRegisterSet(Ctx, Traits::RegisterSet::Reg_NUM,
                          TypeToRegisterSet.data(), TypeToRegisterSet.size(),
                          Traits::getRegName, getRegClassName);
  PcRelFixup = Traits::FK_PcRel;
  AbsFixup = getFlags().getUseNonsfi() ? Traits::FK_Gotoff : Traits::FK_Abs;
}

template <typename TraitsType>
bool TargetX86Base<TraitsType>::shouldBePooled(const Constant *C) {
  if (auto *ConstFloat = llvm::dyn_cast<ConstantFloat>(C)) {
    return !Utils::isPositiveZero(ConstFloat->getValue());
  }
  if (auto *ConstDouble = llvm::dyn_cast<ConstantDouble>(C)) {
    return !Utils::isPositiveZero(ConstDouble->getValue());
  }
  return false;
}

template <typename TraitsType>
::Ice::Type TargetX86Base<TraitsType>::getPointerType() {
  if (!Traits::Is64Bit ||
      ::Ice::getFlags().getApplicationBinaryInterface() == ::Ice::ABI_PNaCl) {
    return ::Ice::IceType_i32;
  }
  return ::Ice::IceType_i64;
}
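
// Illustrative consequence of shouldBePooled() above (the rationale is an
// assumption, not stated by the function itself): a constant like 1.5f ends
// up in a read-only constant pool and is loaded from memory, while +0.0f is
// left unpooled because it can be materialized cheaply, e.g. by xor'ing an
// xmm register with itself.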

template <typename TraitsType> void TargetX86Base<TraitsType>::translateO2() {
  TimerMarker T(TimerStack::TT_O2, Func);

  if (SandboxingType != ST_None) {
    initRebasePtr();
  }

  genTargetHelperCalls();
  Func->dump("After target helper call insertion");

  // Merge Alloca instructions, and lay out the stack.
  static constexpr bool SortAndCombineAllocas = true;
  Func->processAllocas(SortAndCombineAllocas);
  Func->dump("After Alloca processing");

  // Run this early so it can be used to focus optimizations on potentially hot
  // code.
  // TODO(stichnot,ascull): currently only used for regalloc, not the expensive
  // high-level optimizations which could be focused on potentially hot code.
  Func->generateLoopInfo();
  Func->dump("After loop analysis");
  if (getFlags().getLoopInvariantCodeMotion()) {
    Func->loopInvariantCodeMotion();
    Func->dump("After LICM");
  }

  if (getFlags().getLocalCSE() != Ice::LCSE_Disabled) {
    Func->localCSE(getFlags().getLocalCSE() == Ice::LCSE_EnabledSSA);
    Func->dump("After Local CSE");
    Func->floatConstantCSE();
  }
  if (getFlags().getEnableShortCircuit()) {
    Func->shortCircuitJumps();
    Func->dump("After Short Circuiting");
  }

  if (!getFlags().getEnablePhiEdgeSplit()) {
    // Lower Phi instructions.
    Func->placePhiLoads();
    if (Func->hasError())
      return;
    Func->placePhiStores();
    if (Func->hasError())
      return;
    Func->deletePhis();
    if (Func->hasError())
      return;
    Func->dump("After Phi lowering");
  }

  // Address mode optimization.
  Func->getVMetadata()->init(VMK_SingleDefs);
  Func->doAddressOpt();
  Func->materializeVectorShuffles();

  // Find read-modify-write opportunities. Do this after address mode
  // optimization so that doAddressOpt() doesn't need to be applied to RMW
  // instructions as well.
  findRMW();
  Func->dump("After RMW transform");

  // Argument lowering
  Func->doArgLowering();

  // Target lowering. This requires liveness analysis for some parts of the
  // lowering decisions, such as compare/branch fusing. If non-lightweight
  // liveness analysis is used, the instructions need to be renumbered first.
  // TODO: This renumbering should only be necessary if we're actually
  // calculating live intervals, which we only do for register allocation.
  Func->renumberInstructions();
  if (Func->hasError())
    return;

  // TODO: It should be sufficient to use the fastest liveness calculation,
  // i.e. livenessLightweight(). However, for some reason that slows down the
  // rest of the translation. Investigate.
  Func->liveness(Liveness_Basic);
  if (Func->hasError())
    return;
  Func->dump("After x86 address mode opt");

  doLoadOpt();

  Func->genCode();
  if (Func->hasError())
    return;
  if (SandboxingType != ST_None) {
    initSandbox();
  }
  Func->dump("After x86 codegen");
  splitBlockLocalVariables(Func);

  // Register allocation. This requires instruction renumbering and full
  // liveness analysis. Loops must be identified before liveness so variable
  // use weights are correct.
  Func->renumberInstructions();
  if (Func->hasError())
    return;
  Func->liveness(Liveness_Intervals);
  if (Func->hasError())
    return;
  // The post-codegen dump is done here, after liveness analysis and associated
  // cleanup, to make the dump cleaner and more useful.
  Func->dump("After initial x86 codegen");
  // Validate the live range computations. The expensive validation call is
  // deliberately only made when assertions are enabled.
  assert(Func->validateLiveness());
  Func->getVMetadata()->init(VMK_All);
  regAlloc(RAK_Global);
  if (Func->hasError())
    return;
  Func->dump("After linear scan regalloc");

  if (getFlags().getEnablePhiEdgeSplit()) {
    Func->advancedPhiLowering();
    Func->dump("After advanced Phi lowering");
  }

  // Stack frame mapping.
  Func->genFrame();
  if (Func->hasError())
    return;
  Func->dump("After stack frame mapping");

  Func->contractEmptyNodes();
  Func->reorderNodes();

  // Branch optimization. This needs to be done just before code emission. In
  // particular, no transformations that insert or reorder CfgNodes should be
  // done after branch optimization. We go ahead and do it before nop insertion
  // to reduce the amount of work needed for searching for opportunities.
  Func->doBranchOpt();
  Func->dump("After branch optimization");

  // Mark nodes that require sandbox alignment.
  if (NeedSandboxing) {
    Func->markNodesForSandboxing();
  }
}

template <typename TraitsType> void TargetX86Base<TraitsType>::translateOm1() {
  TimerMarker T(TimerStack::TT_Om1, Func);

  if (SandboxingType != ST_None) {
    initRebasePtr();
  }

  genTargetHelperCalls();

  // Do not merge Alloca instructions, and lay out the stack.
  // static constexpr bool SortAndCombineAllocas = false;
  static constexpr bool SortAndCombineAllocas =
      true; // TODO(b/171222930): Fix Win32 bug when this is false
  Func->processAllocas(SortAndCombineAllocas);
  Func->dump("After Alloca processing");

  Func->placePhiLoads();
  if (Func->hasError())
    return;
  Func->placePhiStores();
  if (Func->hasError())
    return;
  Func->deletePhis();
  if (Func->hasError())
    return;
  Func->dump("After Phi lowering");

  Func->doArgLowering();
  Func->genCode();
  if (Func->hasError())
    return;
  if (SandboxingType != ST_None) {
    initSandbox();
  }
  Func->dump("After initial x86 codegen");

  regAlloc(RAK_InfOnly);
  if (Func->hasError())
    return;
  Func->dump("After regalloc of infinite-weight variables");

  Func->genFrame();
  if (Func->hasError())
    return;
  Func->dump("After stack frame mapping");

  // Mark nodes that require sandbox alignment.
  if (NeedSandboxing)
    Func->markNodesForSandboxing();
}

inline bool canRMW(const InstArithmetic *Arith) {
  Type Ty = Arith->getDest()->getType();
  // X86 vector instructions write to a register and have no RMW option.
  if (isVectorType(Ty))
    return false;
  bool isI64 = Ty == IceType_i64;

  switch (Arith->getOp()) {
  // Not handled for lack of simple lowering:
  //   shift on i64
  //   mul, udiv, urem, sdiv, srem, frem
  // Not handled for lack of RMW instructions:
  //   fadd, fsub, fmul, fdiv (also vector types)
  default:
    return false;
  case InstArithmetic::Add:
  case InstArithmetic::Sub:
  case InstArithmetic::And:
  case InstArithmetic::Or:
  case InstArithmetic::Xor:
    return true;
  case InstArithmetic::Shl:
  case InstArithmetic::Lshr:
  case InstArithmetic::Ashr:
    return false; // TODO(stichnot): implement
    return !isI64;
  }
}

template <typename TraitsType>
bool isSameMemAddressOperand(const Operand *A, const Operand *B) {
  if (A == B)
    return true;
  if (auto *MemA =
          llvm::dyn_cast<typename TargetX86Base<TraitsType>::X86OperandMem>(
              A)) {
    if (auto *MemB =
            llvm::dyn_cast<typename TargetX86Base<TraitsType>::X86OperandMem>(
                B)) {
      return MemA->getBase() == MemB->getBase() &&
             MemA->getOffset() == MemB->getOffset() &&
             MemA->getIndex() == MemB->getIndex() &&
             MemA->getShift() == MemB->getShift() &&
             MemA->getSegmentRegister() == MemB->getSegmentRegister();
    }
  }
  return false;
}
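
// For intuition, the assembly-level payoff of the RMW rewrite performed by
// findRMW() below (an illustrative sequence; the actual emission depends on
// later lowering and the addressing mode):
//   movl addr, %eax      ; a = load addr
//   addl %ecx, %eax      ; b = add a, other
//   movl %eax, addr      ; store b, addr
// can be folded into a single read-modify-write instruction:
//   addl %ecx, addr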

template <typename TraitsType> void TargetX86Base<TraitsType>::findRMW() {
  TimerMarker _(TimerStack::TT_findRMW, Func);
  Func->dump("Before RMW");
  if (Func->isVerbose(IceV_RMW))
    Func->getContext()->lockStr();
  for (CfgNode *Node : Func->getNodes()) {
    // Walk through the instructions, considering each sequence of 3
    // instructions, and look for the particular RMW pattern. Note that this
    // search can be "broken" (false negatives) if there are intervening
    // deleted instructions, or intervening instructions that could be safely
    // moved out of the way to reveal an RMW pattern.
    auto E = Node->getInsts().end();
    auto I1 = E, I2 = E, I3 = Node->getInsts().begin();
    for (; I3 != E; I1 = I2, I2 = I3, ++I3) {
      // Make I3 skip over deleted instructions.
      while (I3 != E && I3->isDeleted())
        ++I3;
      if (I1 == E || I2 == E || I3 == E)
        continue;
      assert(!I1->isDeleted());
      assert(!I2->isDeleted());
      assert(!I3->isDeleted());
      auto *Load = llvm::dyn_cast<InstLoad>(I1);
      auto *Arith = llvm::dyn_cast<InstArithmetic>(I2);
      auto *Store = llvm::dyn_cast<InstStore>(I3);
      if (!Load || !Arith || !Store)
        continue;
      // Look for:
      //   a = Load addr
      //   b = <op> a, other
      //   Store b, addr
      // Change to:
      //   a = Load addr
      //   b = <op> a, other
      //   x = FakeDef
      //   RMW <op>, addr, other, x
      //   b = Store b, addr, x
      // Note that inferTwoAddress() makes sure setDestRedefined() gets called
      // on the updated Store instruction, to avoid liveness problems later.
      //
      // With this transformation, the Store instruction acquires a Dest
      // variable and is now subject to dead code elimination if there are no
      // more uses of "b". Variable "x" is a beacon for determining whether the
      // Store instruction gets dead-code eliminated. If the Store instruction
      // is eliminated, then it must be the case that the RMW instruction ends
      // x's live range, and therefore the RMW instruction will be retained and
      // later lowered. On the other hand, if the RMW instruction does not end
      // x's live range, then the Store instruction must still be present, and
      // therefore the RMW instruction is ignored during lowering because it is
      // redundant with the Store instruction.
      //
      // Note that if "a" has further uses, the RMW transformation may still
      // trigger, resulting in two loads and one store, which is worse than the
      // original one load and one store. However, this is probably rare, and
      // caching probably keeps it just as fast.
      if (!isSameMemAddressOperand<TraitsType>(Load->getLoadAddress(),
                                               Store->getStoreAddress()))
        continue;
      Operand *ArithSrcFromLoad = Arith->getSrc(0);
      Operand *ArithSrcOther = Arith->getSrc(1);
      if (ArithSrcFromLoad != Load->getDest()) {
        if (!Arith->isCommutative() || ArithSrcOther != Load->getDest())
          continue;
        std::swap(ArithSrcFromLoad, ArithSrcOther);
      }
      if (Arith->getDest() != Store->getData())
        continue;
      if (!canRMW(Arith))
        continue;
      if (Func->isVerbose(IceV_RMW)) {
        Ostream &Str = Func->getContext()->getStrDump();
        Str << "Found RMW in " << Func->getFunctionName() << ":\n ";
        Load->dump(Func);
        Str << "\n ";
        Arith->dump(Func);
        Str << "\n ";
        Store->dump(Func);
        Str << "\n";
      }
      Variable *Beacon = Func->makeVariable(IceType_i32);
      Beacon->setMustNotHaveReg();
      Store->setRmwBeacon(Beacon);
      auto *BeaconDef = InstFakeDef::create(Func, Beacon);
      Node->getInsts().insert(I3, BeaconDef);
      auto *RMW =
          InstX86FakeRMW::create(Func, ArithSrcOther, Store->getStoreAddress(),
                                 Beacon, Arith->getOp());
      Node->getInsts().insert(I3, RMW);
    }
  }
  if (Func->isVerbose(IceV_RMW))
    Func->getContext()->unlockStr();
}

// Converts a ConstantInteger32 operand into its constant value, or
// MemoryOrderInvalid if the operand is not a ConstantInteger32.
inline uint64_t getConstantMemoryOrder(Operand *Opnd) {
  if (auto *Integer = llvm::dyn_cast<ConstantInteger32>(Opnd))
    return Integer->getValue();
  return Intrinsics::MemoryOrderInvalid;
}

/// Determines whether the dest of a Load instruction can be folded into one of
/// the src operands of a 2-operand instruction. This is true as long as the
/// load dest matches exactly one of the binary instruction's src operands.
/// Replaces Src0 or Src1 with LoadSrc if the answer is true.
inline bool canFoldLoadIntoBinaryInst(Operand *LoadSrc, Variable *LoadDest,
                                      Operand *&Src0, Operand *&Src1) {
  if (Src0 == LoadDest && Src1 != LoadDest) {
    Src0 = LoadSrc;
    return true;
  }
  if (Src0 != LoadDest && Src1 == LoadDest) {
    Src1 = LoadSrc;
    return true;
  }
  return false;
}
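
// A small sketch of the folding performed by doLoadOpt() below (illustrative
// ICE-style pseudo code; the names are made up):
//   a = load addr
//   c = add b, a        ; this is the last use of "a"
// becomes, because the load's dest dies at the add,
//   c = add b, [addr]
// saving a register and an instruction.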

template <typename TraitsType> void TargetX86Base<TraitsType>::doLoadOpt() {
  TimerMarker _(TimerStack::TT_loadOpt, Func);
  for (CfgNode *Node : Func->getNodes()) {
    Context.init(Node);
    while (!Context.atEnd()) {
      Variable *LoadDest = nullptr;
      Operand *LoadSrc = nullptr;
      Inst *CurInst = iteratorToInst(Context.getCur());
      Inst *Next = Context.getNextInst();
      // Determine whether the current instruction is a Load instruction or
      // equivalent.
      if (auto *Load = llvm::dyn_cast<InstLoad>(CurInst)) {
        // An InstLoad qualifies unless it uses a 64-bit absolute address,
        // which requires legalization to insert a copy to register.
        // TODO(b/148272103): Fold these after legalization.
        if (!Traits::Is64Bit || !llvm::isa<Constant>(Load->getLoadAddress())) {
          LoadDest = Load->getDest();
          constexpr bool DoLegalize = false;
          LoadSrc = formMemoryOperand(Load->getLoadAddress(),
                                      LoadDest->getType(), DoLegalize);
        }
      } else if (auto *Intrin = llvm::dyn_cast<InstIntrinsic>(CurInst)) {
        // An AtomicLoad intrinsic qualifies as long as it has a valid memory
        // ordering, and can be implemented in a single instruction (i.e., not
        // i64 on x86-32).
        Intrinsics::IntrinsicID ID = Intrin->getIntrinsicID();
        if (ID == Intrinsics::AtomicLoad &&
            (Traits::Is64Bit || Intrin->getDest()->getType() != IceType_i64) &&
            Intrinsics::isMemoryOrderValid(
                ID, getConstantMemoryOrder(Intrin->getArg(1)))) {
          LoadDest = Intrin->getDest();
          constexpr bool DoLegalize = false;
          LoadSrc = formMemoryOperand(Intrin->getArg(0), LoadDest->getType(),
                                      DoLegalize);
        }
      }
      // A Load instruction can be folded into the following instruction only
      // if the following instruction ends the Load's Dest variable's live
      // range.
      if (LoadDest && Next && Next->isLastUse(LoadDest)) {
        assert(LoadSrc);
        Inst *NewInst = nullptr;
        if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Next)) {
          Operand *Src0 = Arith->getSrc(0);
          Operand *Src1 = Arith->getSrc(1);
          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
            NewInst = InstArithmetic::create(Func, Arith->getOp(),
                                             Arith->getDest(), Src0, Src1);
          }
        } else if (auto *Icmp = llvm::dyn_cast<InstIcmp>(Next)) {
          Operand *Src0 = Icmp->getSrc(0);
          Operand *Src1 = Icmp->getSrc(1);
          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
            NewInst = InstIcmp::create(Func, Icmp->getCondition(),
                                       Icmp->getDest(), Src0, Src1);
          }
        } else if (auto *Fcmp = llvm::dyn_cast<InstFcmp>(Next)) {
          Operand *Src0 = Fcmp->getSrc(0);
          Operand *Src1 = Fcmp->getSrc(1);
          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
            NewInst = InstFcmp::create(Func, Fcmp->getCondition(),
                                       Fcmp->getDest(), Src0, Src1);
          }
        } else if (auto *Select = llvm::dyn_cast<InstSelect>(Next)) {
          Operand *Src0 = Select->getTrueOperand();
          Operand *Src1 = Select->getFalseOperand();
          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
            NewInst = InstSelect::create(Func, Select->getDest(),
                                         Select->getCondition(), Src0, Src1);
          }
        } else if (auto *Cast = llvm::dyn_cast<InstCast>(Next)) {
          // The load dest can always be folded into a Cast instruction.
          auto *Src0 = llvm::dyn_cast<Variable>(Cast->getSrc(0));
          if (Src0 == LoadDest) {
            NewInst = InstCast::create(Func, Cast->getCastKind(),
                                       Cast->getDest(), LoadSrc);
          }
        }
        if (NewInst) {
          CurInst->setDeleted();
          Next->setDeleted();
          Context.insert(NewInst);
          // Update NewInst->LiveRangesEnded so that target lowering may
          // benefit. Also update NewInst->HasSideEffects.
          NewInst->spliceLivenessInfo(Next, CurInst);
        }
      }
      Context.advanceCur();
      Context.advanceNext();
    }
  }
  Func->dump("After load optimization");
}

template <typename TraitsType>
bool TargetX86Base<TraitsType>::doBranchOpt(Inst *I, const CfgNode *NextNode) {
  if (auto *Br = llvm::dyn_cast<InstX86Br>(I)) {
    return Br->optimizeBranch(NextNode);
  }
  return false;
}

template <typename TraitsType>
Variable *TargetX86Base<TraitsType>::getPhysicalRegister(RegNumT RegNum,
                                                         Type Ty) {
  if (Ty == IceType_void)
    Ty = IceType_i32;
  if (PhysicalRegisters[Ty].empty())
    PhysicalRegisters[Ty].resize(Traits::RegisterSet::Reg_NUM);
  assert(unsigned(RegNum) < PhysicalRegisters[Ty].size());
  Variable *Reg = PhysicalRegisters[Ty][RegNum];
  if (Reg == nullptr) {
    Reg = Func->makeVariable(Ty);
    Reg->setRegNum(RegNum);
    PhysicalRegisters[Ty][RegNum] = Reg;
    // Specially mark a named physical register as an "argument" so that it is
    // considered live upon function entry. Otherwise it's possible to get
    // liveness validation errors for saving callee-save registers.
    Func->addImplicitArg(Reg);
    // Don't bother tracking the live range of a named physical register.
    Reg->setIgnoreLiveness();
  }
  assert(Traits::getGprForType(Ty, RegNum) == RegNum);
  return Reg;
}

template <typename TraitsType>
const char *TargetX86Base<TraitsType>::getRegName(RegNumT RegNum,
                                                  Type Ty) const {
  return Traits::getRegName(Traits::getGprForType(Ty, RegNum));
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::emitVariable(const Variable *Var) const {
  if (!BuildDefs::dump())
    return;
  Ostream &Str = Ctx->getStrEmit();
  if (Var->hasReg()) {
    const bool Is64BitSandboxing = Traits::Is64Bit && NeedSandboxing;
    const Type VarType = (Var->isRematerializable() && Is64BitSandboxing)
                             ? IceType_i64
                             : Var->getType();
    Str << "%" << getRegName(Var->getRegNum(), VarType);
    return;
  }
  if (Var->mustHaveReg()) {
    llvm::report_fatal_error("Infinite-weight Variable (" + Var->getName() +
                             ") has no register assigned - function " +
                             Func->getFunctionName());
  }
  const int32_t Offset = Var->getStackOffset();
  auto BaseRegNum = Var->getBaseRegNum();
  if (BaseRegNum.hasNoValue())
    BaseRegNum = getFrameOrStackReg();

  // Print in the form "Offset(%reg)", omitting Offset when it is 0.
  if (getFlags().getDecorateAsm()) {
    Str << Var->getSymbolicStackOffset();
  } else if (Offset != 0) {
    Str << Offset;
  }
  const Type FrameSPTy = Traits::WordType;
  Str << "(%" << getRegName(BaseRegNum, FrameSPTy) << ")";
}
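
// Example output of emitVariable() in AT&T syntax (the specific registers and
// offsets are hypothetical): a register-allocated i32 variable prints as
// "%eax", a stack variable at offset -12 from the frame pointer prints as
// "-12(%ebp)", and a stack variable at offset 0 prints as "(%esp)".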

template <typename TraitsType>
typename TargetX86Base<TraitsType>::X86Address
TargetX86Base<TraitsType>::stackVarToAsmOperand(const Variable *Var) const {
  if (Var->hasReg())
    llvm::report_fatal_error("Stack Variable has a register assigned");
  if (Var->mustHaveReg()) {
    llvm::report_fatal_error("Infinite-weight Variable (" + Var->getName() +
                             ") has no register assigned - function " +
                             Func->getFunctionName());
  }
  int32_t Offset = Var->getStackOffset();
  auto BaseRegNum = Var->getBaseRegNum();
  if (Var->getBaseRegNum().hasNoValue()) {
    // If the stack pointer needs alignment, we must use the frame pointer for
    // arguments. For locals, getFrameOrStackReg will return the stack pointer
    // in this case.
    if (needsStackPointerAlignment() && Var->getIsArg()) {
      assert(hasFramePointer());
      BaseRegNum = getFrameReg();
    } else {
      BaseRegNum = getFrameOrStackReg();
    }
  }
  return X86Address(Traits::getEncodedGPR(BaseRegNum), Offset,
                    AssemblerFixup::NoFixup);
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::addProlog(CfgNode *Node) {
  // Stack frame layout:
  //
  // +------------------------+ ^ +
  // | 1. return address      | |
  // +------------------------+ v -
  // | 2. preserved registers |
  // +------------------------+ <--- BasePointer (if used)
  // | 3. padding             |
  // +------------------------+
  // | 4. global spill area   |
  // +------------------------+
  // | 5. padding             |
  // +------------------------+
  // | 6. local spill area    |
  // +------------------------+
  // | 7. padding             |
  // +------------------------+
  // | 7.5 shadow (WinX64)    |
  // +------------------------+
  // | 8. allocas             |
  // +------------------------+
  // | 9. padding             |
  // +------------------------+
  // | 10. out args           |
  // +------------------------+ <--- StackPointer
  //
  // The following variables record the size in bytes of the given areas:
  // * X86_RET_IP_SIZE_BYTES: area 1
  // * PreservedRegsSizeBytes: area 2
  // * SpillAreaPaddingBytes: area 3
  // * GlobalsSize: area 4
  // * LocalsSlotsPaddingBytes: area 5
  // * GlobalsAndSubsequentPaddingSize: areas 4 - 5
  // * LocalsSpillAreaSize: area 6
  // * FixedAllocaSizeBytes: areas 7 - 8
  // * SpillAreaSizeBytes: areas 3 - 10
  // * maxOutArgsSizeBytes(): areas 9 - 10

  // Determine stack frame offsets for each Variable without a register
  // assignment. This can be done as one variable per stack slot. Or, do
  // coalescing by running the register allocator again with an infinite set of
  // registers (as a side effect, this gives variables a second chance at
  // physical register assignment).
  //
  // A middle ground approach is to leverage sparsity and allocate one block of
  // space on the frame for globals (variables with multi-block lifetime), and
  // one block to share for locals (single-block lifetime).

  const SizeT ShadowStoreSize = getShadowStoreSize<Traits>();

  // StackPointer: points just past return address of calling function

  Context.init(Node);
  Context.setInsertPoint(Context.getCur());

  SmallBitVector CalleeSaves = getRegisterSet(RegSet_CalleeSave, RegSet_None);
  RegsUsed = SmallBitVector(CalleeSaves.size());
  VarList SortedSpilledVariables, VariablesLinkedToSpillSlots;
  size_t GlobalsSize = 0;
  // If there is a separate locals area, this represents that area. Otherwise
  // it counts any variable not counted by GlobalsSize.
  SpillAreaSizeBytes = 0;
  // If there is a separate locals area, this specifies the alignment for it.
  uint32_t LocalsSlotsAlignmentBytes = 0;
  // The entire spill locations area gets aligned to the largest natural
  // alignment of the variables that have a spill slot.
  uint32_t SpillAreaAlignmentBytes = 0;
  // A spill slot linked to a variable with a stack slot should reuse that
  // stack slot.
  std::function<bool(Variable *)> TargetVarHook =
      [&VariablesLinkedToSpillSlots](Variable *Var) {
        // TODO(stichnot): Refactor this into the base class.
        Variable *Root = Var->getLinkedToStackRoot();
        if (Root != nullptr) {
          assert(!Root->hasReg());
          if (!Root->hasReg()) {
            VariablesLinkedToSpillSlots.push_back(Var);
            return true;
          }
        }
        return false;
      };

  // Compute the list of spilled variables and bounds for GlobalsSize, etc.
  getVarStackSlotParams(SortedSpilledVariables, RegsUsed, &GlobalsSize,
                        &SpillAreaSizeBytes, &SpillAreaAlignmentBytes,
                        &LocalsSlotsAlignmentBytes, TargetVarHook);
  uint32_t LocalsSpillAreaSize = SpillAreaSizeBytes;
  SpillAreaSizeBytes += GlobalsSize;

  // Add push instructions for preserved registers.
  uint32_t NumCallee = 0;
  size_t PreservedRegsSizeBytes = 0;
  SmallBitVector Pushed(CalleeSaves.size());
  for (RegNumT i : RegNumBVIter(CalleeSaves)) {
    const auto Canonical = Traits::getBaseReg(i);
    assert(Canonical == Traits::getBaseReg(Canonical));
    if (RegsUsed[i]) {
      Pushed[Canonical] = true;
    }
  }
  for (RegNumT RegNum : RegNumBVIter(Pushed)) {
    assert(RegNum == Traits::getBaseReg(RegNum));
    ++NumCallee;
    if (Traits::isXmm(RegNum)) {
      PreservedRegsSizeBytes += 16;
    } else {
      PreservedRegsSizeBytes += typeWidthInBytes(Traits::WordType);
    }
    _push_reg(RegNum);
  }
  Ctx->statsUpdateRegistersSaved(NumCallee);

  // StackPointer: points past preserved registers at start of spill area

  // Generate "push frameptr; mov frameptr, stackptr"
  if (IsEbpBasedFrame) {
    assert(
        (RegsUsed & getRegisterSet(RegSet_FramePointer, RegSet_None)).count() ==
        0);
    PreservedRegsSizeBytes += typeWidthInBytes(Traits::WordType);
    _link_bp();
  }

  // Align the variables area. SpillAreaPaddingBytes is the size of the region
  // after the preserved registers and before the spill areas.
  // LocalsSlotsPaddingBytes is the amount of padding between the globals and
  // locals area if they are separate.
  assert(LocalsSlotsAlignmentBytes <= SpillAreaAlignmentBytes);
  uint32_t SpillAreaPaddingBytes = 0;
  uint32_t LocalsSlotsPaddingBytes = 0;
  alignStackSpillAreas(Traits::X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes,
                       SpillAreaAlignmentBytes, GlobalsSize,
                       LocalsSlotsAlignmentBytes, &SpillAreaPaddingBytes,
                       &LocalsSlotsPaddingBytes);
  SpillAreaSizeBytes += SpillAreaPaddingBytes + LocalsSlotsPaddingBytes;
  uint32_t GlobalsAndSubsequentPaddingSize =
      GlobalsSize + LocalsSlotsPaddingBytes;

  // Functions returning scalar floating point types may need to convert values
  // from an in-register xmm value to the top of the x87 floating point stack.
  // This is done by a movp[sd] and an fld[sd]. Ensure there is enough scratch
  // space on the stack for this.
  const Type ReturnType = Func->getReturnType();
  if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) {
    if (isScalarFloatingType(ReturnType)) {
      // Avoid misaligned double-precision load/store.
      RequiredStackAlignment = std::max<size_t>(
          RequiredStackAlignment, Traits::X86_STACK_ALIGNMENT_BYTES);
      SpillAreaSizeBytes =
          std::max(typeWidthInBytesOnStack(ReturnType), SpillAreaSizeBytes);
    }
  }

  RequiredStackAlignment =
      std::max<size_t>(RequiredStackAlignment, SpillAreaAlignmentBytes);

  if (PrologEmitsFixedAllocas) {
    RequiredStackAlignment =
        std::max(RequiredStackAlignment, FixedAllocaAlignBytes);
  }

  // Combine fixed allocations into SpillAreaSizeBytes if we are emitting the
  // fixed allocations in the prolog.
  if (PrologEmitsFixedAllocas)
    SpillAreaSizeBytes += FixedAllocaSizeBytes;

  // Win64 ABI: add space for shadow store (aka home space)
  SpillAreaSizeBytes += ShadowStoreSize;

  // Entering the function has made the stack pointer unaligned. Re-align it by
  // adjusting the stack size.
  // Note that StackOffset does not include the spill area. It's the offset
  // from the base stack pointer (ebp), whether we set it or not, to the first
  // stack arg (if any). StackSize, on the other hand, does include the spill
  // area.
  const uint32_t StackOffset =
      ShadowStoreSize + Traits::X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes;
  uint32_t StackSize = Utils::applyAlignment(StackOffset + SpillAreaSizeBytes,
                                             RequiredStackAlignment);
  StackSize = Utils::applyAlignment(StackSize + maxOutArgsSizeBytes(),
                                    RequiredStackAlignment);
  SpillAreaSizeBytes = StackSize - StackOffset; // Adjust for alignment, if any

  if (SpillAreaSizeBytes) {
    auto *Func = Node->getCfg();
    if (SpillAreaSizeBytes > Func->getStackSizeLimit()) {
      Func->setError("Stack size limit exceeded");
    }

    emitStackProbe(SpillAreaSizeBytes);

    // Generate "sub stackptr, SpillAreaSizeBytes"
    _sub_sp(Ctx->getConstantInt32(SpillAreaSizeBytes));
  }

  // StackPointer: points just past the spill area (end of stack frame)

  // If the required alignment is greater than the stack pointer's guaranteed
  // alignment, align the stack pointer accordingly.
  if (RequiredStackAlignment > Traits::X86_STACK_ALIGNMENT_BYTES) {
    assert(IsEbpBasedFrame);
    _and(getPhysicalRegister(getStackReg(), Traits::WordType),
         Ctx->getConstantInt32(-RequiredStackAlignment));
  }

  // StackPointer: may have just been offset for alignment

  // Account for known-frame-offset alloca instructions that were not already
  // combined into the prolog.
  if (!PrologEmitsFixedAllocas)
    SpillAreaSizeBytes += FixedAllocaSizeBytes;

  Ctx->statsUpdateFrameBytes(SpillAreaSizeBytes);

  // Fill in stack offsets for stack args, and copy args into registers for
  // those that were register-allocated. Args are pushed right to left, so
  // Arg[0] is closest to the stack/frame pointer.
  RegNumT FrameOrStackReg = IsEbpBasedFrame ? getFrameReg() : getStackReg();
  Variable *FramePtr = getPhysicalRegister(FrameOrStackReg, Traits::WordType);
  size_t BasicFrameOffset = StackOffset;
  if (!IsEbpBasedFrame)
    BasicFrameOffset += SpillAreaSizeBytes;

  emitGetIP(Node);

  const VarList &Args = Func->getArgs();
  size_t InArgsSizeBytes = 0;
  unsigned NumXmmArgs = 0;
  unsigned NumGPRArgs = 0;
  for (SizeT i = 0, NumArgs = Args.size(); i < NumArgs; ++i) {
    Variable *Arg = Args[i];
    // Skip arguments passed in registers.
    if (isVectorType(Arg->getType())) {
      if (Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs))
              .hasValue()) {
        ++NumXmmArgs;
        continue;
      }
    } else if (isScalarFloatingType(Arg->getType())) {
      if (Traits::X86_PASS_SCALAR_FP_IN_XMM &&
          Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs))
              .hasValue()) {
        ++NumXmmArgs;
        continue;
      }
    } else {
      assert(isScalarIntegerType(Arg->getType()));
      if (Traits::getRegisterForGprArgNum(Traits::WordType,
                                          Traits::getArgIndex(i, NumGPRArgs))
              .hasValue()) {
        ++NumGPRArgs;
        continue;
      }
    }
    // For esp-based frames where the allocas are done outside the prolog, the
    // esp value may not stabilize to its home value until after all the
    // fixed-size alloca instructions have executed. In this case, a stack
    // adjustment is needed when accessing in-args in order to copy them into
    // registers.
    size_t StackAdjBytes = 0;
    if (!IsEbpBasedFrame && !PrologEmitsFixedAllocas)
      StackAdjBytes -= FixedAllocaSizeBytes;
    finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, StackAdjBytes,
                           InArgsSizeBytes);
  }

  // Fill in stack offsets for locals.
  assignVarStackSlots(SortedSpilledVariables, SpillAreaPaddingBytes,
                      SpillAreaSizeBytes, GlobalsAndSubsequentPaddingSize,
                      IsEbpBasedFrame && !needsStackPointerAlignment());
  // Assign stack offsets to variables that have been linked to spilled
  // variables.
  for (Variable *Var : VariablesLinkedToSpillSlots) {
    const Variable *Root = Var->getLinkedToStackRoot();
    assert(Root != nullptr);
    Var->setStackOffset(Root->getStackOffset());

    // If the stack root variable is an arg, make this variable an arg too so
    // that stackVarToAsmOperand uses the correct base pointer (e.g. ebp on
    // x86).
    Var->setIsArg(Root->getIsArg());
  }
  this->HasComputedFrame = true;

  if (BuildDefs::dump() && Func->isVerbose(IceV_Frame)) {
    OstreamLocker L(Func->getContext());
    Ostream &Str = Func->getContext()->getStrDump();

    Str << "Stack layout:\n";
    uint32_t EspAdjustmentPaddingSize =
        SpillAreaSizeBytes - LocalsSpillAreaSize -
        GlobalsAndSubsequentPaddingSize - SpillAreaPaddingBytes -
        maxOutArgsSizeBytes();
    Str << " in-args = " << InArgsSizeBytes << " bytes\n"
        << " return address = " << Traits::X86_RET_IP_SIZE_BYTES << " bytes\n"
        << " preserved registers = " << PreservedRegsSizeBytes << " bytes\n"
        << " spill area padding = " << SpillAreaPaddingBytes << " bytes\n"
        << " globals spill area = " << GlobalsSize << " bytes\n"
        << " globals-locals spill areas intermediate padding = "
        << GlobalsAndSubsequentPaddingSize - GlobalsSize << " bytes\n"
        << " locals spill area = " << LocalsSpillAreaSize << " bytes\n"
        << " esp alignment padding = " << EspAdjustmentPaddingSize
        << " bytes\n";

    Str << "Stack details:\n"
        << " esp adjustment = " << SpillAreaSizeBytes << " bytes\n"
        << " spill area alignment = " << SpillAreaAlignmentBytes << " bytes\n"
        << " outgoing args size = " << maxOutArgsSizeBytes() << " bytes\n"
        << " locals spill area alignment = " << LocalsSlotsAlignmentBytes
        << " bytes\n"
        << " is ebp based = " << IsEbpBasedFrame << "\n";
  }
}
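
// For orientation, a typical ebp-based prolog produced by the steps above
// might look roughly like this on x86-32 (the register choices and the
// 24-byte adjustment are hypothetical):
//   pushl %ebx            ; preserved register
//   pushl %ebp            ; _link_bp()
//   movl  %esp, %ebp
//   subl  $24, %esp       ; SpillAreaSizeBytes, including the out-args area
// The exact sequence depends on the callee-saved registers in use, the
// required alignment, and whether a frame pointer is needed at all.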

/// Helper function for addProlog().
///
/// This assumes Arg is an argument passed on the stack. This sets the frame
/// offset for Arg and updates InArgsSizeBytes according to Arg's width. For an
/// I64 arg that has been split into Lo and Hi components, it calls itself
/// recursively on the components, taking care to handle Lo first because of the
/// little-endian architecture. Lastly, this function generates an instruction
/// to copy Arg into its assigned register if applicable.
template <typename TraitsType>
void TargetX86Base<TraitsType>::finishArgumentLowering(
    Variable *Arg, Variable *FramePtr, size_t BasicFrameOffset,
    size_t StackAdjBytes, size_t &InArgsSizeBytes) {
  if (!Traits::Is64Bit) {
    if (auto *Arg64On32 = llvm::dyn_cast<Variable64On32>(Arg)) {
      Variable *Lo = Arg64On32->getLo();
      Variable *Hi = Arg64On32->getHi();
      finishArgumentLowering(Lo, FramePtr, BasicFrameOffset, StackAdjBytes,
                             InArgsSizeBytes);
      finishArgumentLowering(Hi, FramePtr, BasicFrameOffset, StackAdjBytes,
                             InArgsSizeBytes);
      return;
    }
  }
  Type Ty = Arg->getType();
  if (isVectorType(Ty)) {
    InArgsSizeBytes = Traits::applyStackAlignment(InArgsSizeBytes);
  }
  Arg->setStackOffset(BasicFrameOffset + InArgsSizeBytes);
  InArgsSizeBytes += typeWidthInBytesOnStack(Ty);
  if (Arg->hasReg()) {
    assert(Ty != IceType_i64 || Traits::Is64Bit);
    auto *Mem = X86OperandMem::create(
        Func, Ty, FramePtr,
        Ctx->getConstantInt32(Arg->getStackOffset() + StackAdjBytes));
    if (isVectorType(Arg->getType())) {
      _movp(Arg, Mem);
    } else {
      _mov(Arg, Mem);
    }
    // This argument-copying instruction uses an explicit X86OperandMem
    // operand instead of a Variable, so its fill-from-stack operation has to
    // be tracked separately for statistics.
    Ctx->statsUpdateFills();
  }
}
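
// Worked example of the recursive i64 case above, assuming x86-32 and a
// BasicFrameOffset of 8 (both numbers are hypothetical): the i64 stack arg is
// split into Lo and Hi i32 halves, Lo is assigned offset 8 and Hi offset 12,
// and InArgsSizeBytes advances by 8 in total, matching the little-endian
// layout the caller used when storing the 64-bit value.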

template <typename TraitsType>
void TargetX86Base<TraitsType>::addEpilog(CfgNode *Node) {
  InstList &Insts = Node->getInsts();
  InstList::reverse_iterator RI, E;
  for (RI = Insts.rbegin(), E = Insts.rend(); RI != E; ++RI) {
    if (llvm::isa<typename Traits::Insts::Ret>(*RI))
      break;
  }
  if (RI == E)
    return;

  // Convert the reverse_iterator position into its corresponding (forward)
  // iterator position.
  InstList::iterator InsertPoint = reverseToForwardIterator(RI);
  --InsertPoint;
  Context.init(Node);
  Context.setInsertPoint(InsertPoint);

  if (IsEbpBasedFrame) {
    _unlink_bp();
  } else {
    // add stackptr, SpillAreaSizeBytes
    if (SpillAreaSizeBytes != 0) {
      _add_sp(Ctx->getConstantInt32(SpillAreaSizeBytes));
    }
  }

  // Add pop instructions for preserved registers.
  SmallBitVector CalleeSaves = getRegisterSet(RegSet_CalleeSave, RegSet_None);
  SmallBitVector Popped(CalleeSaves.size());
  for (int32_t i = CalleeSaves.size() - 1; i >= 0; --i) {
    const auto RegNum = RegNumT::fromInt(i);
    if (RegNum == getFrameReg() && IsEbpBasedFrame)
      continue;
    const RegNumT Canonical = Traits::getBaseReg(RegNum);
    if (CalleeSaves[i] && RegsUsed[i]) {
      Popped[Canonical] = true;
    }
  }
  for (int32_t i = Popped.size() - 1; i >= 0; --i) {
    if (!Popped[i])
      continue;
    const auto RegNum = RegNumT::fromInt(i);
    assert(RegNum == Traits::getBaseReg(RegNum));
    _pop_reg(RegNum);
  }

  if (!NeedSandboxing) {
    return;
  }
  emitSandboxedReturn();
  if (RI->getSrcSize()) {
    auto *RetValue = llvm::cast<Variable>(RI->getSrc(0));
    Context.insert<InstFakeUse>(RetValue);
  }
  RI->setDeleted();
}
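
// A matching epilog sketch for the ebp-based case (illustrative only; the ret
// itself already exists in the instruction stream, and sandboxing may replace
// it):
//   movl %ebp, %esp       ; _unlink_bp()
//   popl %ebp
//   popl %ebx             ; restore preserved registers in reverse order
//   ret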

template <typename TraitsType> Type TargetX86Base<TraitsType>::stackSlotType() {
  return Traits::WordType;
}

template <typename TraitsType>
template <typename T>
typename std::enable_if<!T::Is64Bit, Operand>::type *
TargetX86Base<TraitsType>::loOperand(Operand *Operand) {
  assert(Operand->getType() == IceType_i64 ||
         Operand->getType() == IceType_f64);
  if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
    return Operand;
  if (auto *Var64On32 = llvm::dyn_cast<Variable64On32>(Operand))
    return Var64On32->getLo();
  if (auto *Const = llvm::dyn_cast<ConstantInteger64>(Operand)) {
    auto *ConstInt = llvm::dyn_cast<ConstantInteger32>(
        Ctx->getConstantInt32(static_cast<int32_t>(Const->getValue())));
    // Check if we need to blind/pool the constant.
    return legalize(ConstInt);
  }
  if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Operand)) {
    auto *MemOperand = X86OperandMem::create(
        Func, IceType_i32, Mem->getBase(), Mem->getOffset(), Mem->getIndex(),
        Mem->getShift(), Mem->getSegmentRegister(), Mem->getIsRebased());
    // Test whether we should randomize or pool the offset; if so, blind/pool
    // it and create a mem operand with the blinded/pooled constant. Otherwise,
    // return the mem operand as an ordinary mem operand.
    return legalize(MemOperand);
  }
  llvm_unreachable("Unsupported operand type");
  return nullptr;
}

template <typename TraitsType>
template <typename T>
typename std::enable_if<!T::Is64Bit, Operand>::type *
TargetX86Base<TraitsType>::hiOperand(Operand *Operand) {
  assert(Operand->getType() == IceType_i64 ||
         Operand->getType() == IceType_f64);
  if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
    return Operand;
  if (auto *Var64On32 = llvm::dyn_cast<Variable64On32>(Operand))
    return Var64On32->getHi();
  if (auto *Const = llvm::dyn_cast<ConstantInteger64>(Operand)) {
    auto *ConstInt = llvm::dyn_cast<ConstantInteger32>(
        Ctx->getConstantInt32(static_cast<int32_t>(Const->getValue() >> 32)));
    // Check if we need to blind/pool the constant.
    return legalize(ConstInt);
  }
  if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Operand)) {
    Constant *Offset = Mem->getOffset();
    if (Offset == nullptr) {
      Offset = Ctx->getConstantInt32(4);
    } else if (auto *IntOffset = llvm::dyn_cast<ConstantInteger32>(Offset)) {
      Offset = Ctx->getConstantInt32(4 + IntOffset->getValue());
    } else if (auto *SymOffset = llvm::dyn_cast<ConstantRelocatable>(Offset)) {
      assert(!Utils::WouldOverflowAdd(SymOffset->getOffset(), 4));
      Offset =
          Ctx->getConstantSym(4 + SymOffset->getOffset(), SymOffset->getName());
    }
    auto *MemOperand = X86OperandMem::create(
        Func, IceType_i32, Mem->getBase(), Offset, Mem->getIndex(),
        Mem->getShift(), Mem->getSegmentRegister(), Mem->getIsRebased());
    // Test whether the Offset is an eligible i32 constant for randomization and
    // pooling. Blind/pool it if it is. Otherwise, return it as an ordinary mem
    // operand.
    return legalize(MemOperand);
  }
  llvm_unreachable("Unsupported operand type");
  return nullptr;
}
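
// Worked example for the splitting above (values chosen for illustration):
// the i64 constant 0x1122334455667788 splits into lo = 0x55667788 and
// hi = 0x11223344, and a memory operand such as 8(%ebp) splits into a lo word
// at 8(%ebp) and a hi word at 12(%ebp), i.e. the original offset plus 4.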
1552 assert(Dest->isRematerializable()); 1553 FixedAllocaSizeBytes += Value; 1554 Context.insert<InstFakeDef>(Dest); 1555 } 1556 } else { 1557 // Non-constant sizes need to be adjusted to the next highest multiple of 1558 // the required alignment at runtime. 1559 Variable *T = nullptr; 1560 if (Traits::Is64Bit && TotalSize->getType() != IceType_i64 && 1561 !NeedSandboxing) { 1562 T = makeReg(IceType_i64); 1563 _movzx(T, TotalSize); 1564 } else { 1565 T = makeReg(IceType_i32); 1566 _mov(T, TotalSize); 1567 } 1568 _add(T, Ctx->getConstantInt32(Alignment - 1)); 1569 _and(T, Ctx->getConstantInt32(-Alignment)); 1570 _sub_sp(T); 1571 } 1572 // Add enough to the returned address to account for the out args area. 1573 uint32_t OutArgsSize = maxOutArgsSizeBytes(); 1574 if (OutArgsSize > 0) { 1575 Variable *T = makeReg(Dest->getType()); 1576 auto *CalculateOperand = X86OperandMem::create( 1577 Func, IceType_void, esp, Ctx->getConstantInt(IceType_i32, OutArgsSize)); 1578 _lea(T, CalculateOperand); 1579 _mov(Dest, T); 1580 } else { 1581 _mov(Dest, esp); 1582 } 1583 } 1584 1585 template <typename TraitsType> 1586 void TargetX86Base<TraitsType>::lowerArguments() { 1587 const bool OptM1 = Func->getOptLevel() == Opt_m1; 1588 VarList &Args = Func->getArgs(); 1589 unsigned NumXmmArgs = 0; 1590 bool XmmSlotsRemain = true; 1591 unsigned NumGprArgs = 0; 1592 bool GprSlotsRemain = true; 1593 1594 Context.init(Func->getEntryNode()); 1595 Context.setInsertPoint(Context.getCur()); 1596 1597 for (SizeT i = 0, End = Args.size(); 1598 i < End && (XmmSlotsRemain || GprSlotsRemain); ++i) { 1599 Variable *Arg = Args[i]; 1600 Type Ty = Arg->getType(); 1601 Variable *RegisterArg = nullptr; 1602 RegNumT RegNum; 1603 if (isVectorType(Ty)) { 1604 RegNum = 1605 Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs)); 1606 if (RegNum.hasNoValue()) { 1607 XmmSlotsRemain = false; 1608 continue; 1609 } 1610 ++NumXmmArgs; 1611 RegisterArg = Func->makeVariable(Ty); 1612 } else if (isScalarFloatingType(Ty)) { 1613 if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) { 1614 continue; 1615 } 1616 RegNum = 1617 Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs)); 1618 if (RegNum.hasNoValue()) { 1619 XmmSlotsRemain = false; 1620 continue; 1621 } 1622 ++NumXmmArgs; 1623 RegisterArg = Func->makeVariable(Ty); 1624 } else if (isScalarIntegerType(Ty)) { 1625 RegNum = Traits::getRegisterForGprArgNum( 1626 Ty, Traits::getArgIndex(i, NumGprArgs)); 1627 if (RegNum.hasNoValue()) { 1628 GprSlotsRemain = false; 1629 continue; 1630 } 1631 ++NumGprArgs; 1632 RegisterArg = Func->makeVariable(Ty); 1633 } 1634 assert(RegNum.hasValue()); 1635 assert(RegisterArg != nullptr); 1636 // Replace Arg in the argument list with the home register. Then generate 1637 // an instruction in the prolog to copy the home register to the assigned 1638 // location of Arg. 1639 if (BuildDefs::dump()) 1640 RegisterArg->setName(Func, "home_reg:" + Arg->getName()); 1641 RegisterArg->setRegNum(RegNum); 1642 RegisterArg->setIsArg(); 1643 Arg->setIsArg(false); 1644 1645 Args[i] = RegisterArg; 1646 // When not Om1, do the assignment through a temporary, instead of directly 1647 // from the pre-colored variable, so that a subsequent availabilityGet() 1648 // call has a chance to work. (In Om1, don't bother creating extra 1649 // instructions with extra variables to register-allocate.) 
1650 if (OptM1) {
1651 Context.insert<InstAssign>(Arg, RegisterArg);
1652 } else {
1653 Variable *Tmp = makeReg(RegisterArg->getType());
1654 Context.insert<InstAssign>(Tmp, RegisterArg);
1655 Context.insert<InstAssign>(Arg, Tmp);
1656 }
1657 }
1658 if (!OptM1)
1659 Context.availabilityUpdate();
1660 }
1661
1662 /// Strength-reduce scalar integer multiplication by a constant (for i32 or
1663 /// narrower) for certain constants. The lea instruction can be used to multiply
1664 /// by 3, 5, or 9, and the shl instruction can be used to multiply by powers of
1665 /// 2. These can be combined such that e.g. multiplying by 100 can be done as 2
1666 /// lea-based multiplies by 5, combined with left-shifting by 2.
1667 template <typename TraitsType>
1668 bool TargetX86Base<TraitsType>::optimizeScalarMul(Variable *Dest, Operand *Src0,
1669 int32_t Src1) {
1670 // Disable this optimization for Om1 and O0, just to keep things simple
1671 // there.
1672 if (Func->getOptLevel() < Opt_1)
1673 return false;
1674 Type Ty = Dest->getType();
1675 if (Src1 == -1) {
1676 Variable *T = nullptr;
1677 _mov(T, Src0);
1678 _neg(T);
1679 _mov(Dest, T);
1680 return true;
1681 }
1682 if (Src1 == 0) {
1683 _mov(Dest, Ctx->getConstantZero(Ty));
1684 return true;
1685 }
1686 if (Src1 == 1) {
1687 Variable *T = nullptr;
1688 _mov(T, Src0);
1689 _mov(Dest, T);
1690 return true;
1691 }
1692 // Don't bother with the edge case where Src1 == MININT.
1693 if (Src1 == -Src1)
1694 return false;
1695 const bool Src1IsNegative = Src1 < 0;
1696 if (Src1IsNegative)
1697 Src1 = -Src1;
1698 uint32_t Count9 = 0;
1699 uint32_t Count5 = 0;
1700 uint32_t Count3 = 0;
1701 uint32_t Count2 = 0;
1702 uint32_t CountOps = 0;
1703 while (Src1 > 1) {
1704 if (Src1 % 9 == 0) {
1705 ++CountOps;
1706 ++Count9;
1707 Src1 /= 9;
1708 } else if (Src1 % 5 == 0) {
1709 ++CountOps;
1710 ++Count5;
1711 Src1 /= 5;
1712 } else if (Src1 % 3 == 0) {
1713 ++CountOps;
1714 ++Count3;
1715 Src1 /= 3;
1716 } else if (Src1 % 2 == 0) {
1717 if (Count2 == 0)
1718 ++CountOps;
1719 ++Count2;
1720 Src1 /= 2;
1721 } else {
1722 return false;
1723 }
1724 }
1725 // The lea-based decomposition only works for i32 (and i64 on x86-64) types.
1726 if (Ty != IceType_i32 && !(Traits::Is64Bit && Ty == IceType_i64) &&
1727 (Count3 || Count5 || Count9))
1728 return false;
1729 // Limit the number of lea/shl operations for a single multiply, to a
1730 // somewhat arbitrary choice of 3.
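// Illustrative decompositions (not exhaustive): 45 = 9*5 needs two lea steps
// (CountOps == 2); 100 = 5*5*4 needs two lea steps plus one shl (CountOps ==
// 3, still allowed); 1000 = 5*5*5*8 would need four steps and is rejected by
// the limit below, falling back to an ordinary multiply.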
1731 constexpr uint32_t MaxOpsForOptimizedMul = 3; 1732 if (CountOps > MaxOpsForOptimizedMul) 1733 return false; 1734 Variable *T = makeReg(Traits::WordType); 1735 if (typeWidthInBytes(Src0->getType()) < typeWidthInBytes(T->getType())) { 1736 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 1737 _movzx(T, Src0RM); 1738 } else { 1739 _mov(T, Src0); 1740 } 1741 Constant *Zero = Ctx->getConstantZero(IceType_i32); 1742 for (uint32_t i = 0; i < Count9; ++i) { 1743 constexpr uint16_t Shift = 3; // log2(9-1) 1744 _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift)); 1745 } 1746 for (uint32_t i = 0; i < Count5; ++i) { 1747 constexpr uint16_t Shift = 2; // log2(5-1) 1748 _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift)); 1749 } 1750 for (uint32_t i = 0; i < Count3; ++i) { 1751 constexpr uint16_t Shift = 1; // log2(3-1) 1752 _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift)); 1753 } 1754 if (Count2) { 1755 _shl(T, Ctx->getConstantInt(Ty, Count2)); 1756 } 1757 if (Src1IsNegative) 1758 _neg(T); 1759 _mov(Dest, T); 1760 return true; 1761 } 1762 1763 template <typename TraitsType> 1764 void TargetX86Base<TraitsType>::lowerShift64(InstArithmetic::OpKind Op, 1765 Operand *Src0Lo, Operand *Src0Hi, 1766 Operand *Src1Lo, Variable *DestLo, 1767 Variable *DestHi) { 1768 // TODO: Refactor the similarities between Shl, Lshr, and Ashr. 1769 Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr; 1770 Constant *Zero = Ctx->getConstantZero(IceType_i32); 1771 Constant *SignExtend = Ctx->getConstantInt32(0x1f); 1772 if (auto *ConstantShiftAmount = llvm::dyn_cast<ConstantInteger32>(Src1Lo)) { 1773 uint32_t ShiftAmount = ConstantShiftAmount->getValue(); 1774 if (ShiftAmount > 32) { 1775 Constant *ReducedShift = Ctx->getConstantInt32(ShiftAmount - 32); 1776 switch (Op) { 1777 default: 1778 assert(0 && "non-shift op"); 1779 break; 1780 case InstArithmetic::Shl: { 1781 // a=b<<c ==> 1782 // t2 = b.lo 1783 // t2 = shl t2, ShiftAmount-32 1784 // t3 = t2 1785 // t2 = 0 1786 _mov(T_2, Src0Lo); 1787 _shl(T_2, ReducedShift); 1788 _mov(DestHi, T_2); 1789 _mov(DestLo, Zero); 1790 } break; 1791 case InstArithmetic::Lshr: { 1792 // a=b>>c (unsigned) ==> 1793 // t2 = b.hi 1794 // t2 = shr t2, ShiftAmount-32 1795 // a.lo = t2 1796 // a.hi = 0 1797 _mov(T_2, Src0Hi); 1798 _shr(T_2, ReducedShift); 1799 _mov(DestLo, T_2); 1800 _mov(DestHi, Zero); 1801 } break; 1802 case InstArithmetic::Ashr: { 1803 // a=b>>c (signed) ==> 1804 // t3 = b.hi 1805 // t3 = sar t3, 0x1f 1806 // t2 = b.hi 1807 // t2 = shrd t2, t3, ShiftAmount-32 1808 // a.lo = t2 1809 // a.hi = t3 1810 _mov(T_3, Src0Hi); 1811 _sar(T_3, SignExtend); 1812 _mov(T_2, Src0Hi); 1813 _shrd(T_2, T_3, ReducedShift); 1814 _mov(DestLo, T_2); 1815 _mov(DestHi, T_3); 1816 } break; 1817 } 1818 } else if (ShiftAmount == 32) { 1819 switch (Op) { 1820 default: 1821 assert(0 && "non-shift op"); 1822 break; 1823 case InstArithmetic::Shl: { 1824 // a=b<<c ==> 1825 // t2 = b.lo 1826 // a.hi = t2 1827 // a.lo = 0 1828 _mov(T_2, Src0Lo); 1829 _mov(DestHi, T_2); 1830 _mov(DestLo, Zero); 1831 } break; 1832 case InstArithmetic::Lshr: { 1833 // a=b>>c (unsigned) ==> 1834 // t2 = b.hi 1835 // a.lo = t2 1836 // a.hi = 0 1837 _mov(T_2, Src0Hi); 1838 _mov(DestLo, T_2); 1839 _mov(DestHi, Zero); 1840 } break; 1841 case InstArithmetic::Ashr: { 1842 // a=b>>c (signed) ==> 1843 // t2 = b.hi 1844 // a.lo = t2 1845 // t3 = b.hi 1846 // t3 = sar t3, 0x1f 1847 // a.hi = t3 1848 _mov(T_2, Src0Hi); 1849 _mov(DestLo, T_2); 1850 _mov(T_3, Src0Hi); 1851 
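// The sar by 0x1f below replicates the sign bit of b.hi across t3; e.g.
// (illustration) a b.hi of 0x80000001 becomes 0xffffffff while 0x7fffffff
// becomes 0x00000000, which is exactly the high word of an arithmetic shift
// right by 32.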
_sar(T_3, SignExtend); 1852 _mov(DestHi, T_3); 1853 } break; 1854 } 1855 } else { 1856 // COMMON PREFIX OF: a=b SHIFT_OP c ==> 1857 // t2 = b.lo 1858 // t3 = b.hi 1859 _mov(T_2, Src0Lo); 1860 _mov(T_3, Src0Hi); 1861 switch (Op) { 1862 default: 1863 assert(0 && "non-shift op"); 1864 break; 1865 case InstArithmetic::Shl: { 1866 // a=b<<c ==> 1867 // t3 = shld t3, t2, ShiftAmount 1868 // t2 = shl t2, ShiftAmount 1869 _shld(T_3, T_2, ConstantShiftAmount); 1870 _shl(T_2, ConstantShiftAmount); 1871 } break; 1872 case InstArithmetic::Lshr: { 1873 // a=b>>c (unsigned) ==> 1874 // t2 = shrd t2, t3, ShiftAmount 1875 // t3 = shr t3, ShiftAmount 1876 _shrd(T_2, T_3, ConstantShiftAmount); 1877 _shr(T_3, ConstantShiftAmount); 1878 } break; 1879 case InstArithmetic::Ashr: { 1880 // a=b>>c (signed) ==> 1881 // t2 = shrd t2, t3, ShiftAmount 1882 // t3 = sar t3, ShiftAmount 1883 _shrd(T_2, T_3, ConstantShiftAmount); 1884 _sar(T_3, ConstantShiftAmount); 1885 } break; 1886 } 1887 // COMMON SUFFIX OF: a=b SHIFT_OP c ==> 1888 // a.lo = t2 1889 // a.hi = t3 1890 _mov(DestLo, T_2); 1891 _mov(DestHi, T_3); 1892 } 1893 } else { 1894 // NON-CONSTANT CASES. 1895 Constant *BitTest = Ctx->getConstantInt32(0x20); 1896 InstX86Label *Label = InstX86Label::create(Func, this); 1897 // COMMON PREFIX OF: a=b SHIFT_OP c ==> 1898 // t1:ecx = c.lo & 0xff 1899 // t2 = b.lo 1900 // t3 = b.hi 1901 T_1 = copyToReg8(Src1Lo, Traits::RegisterSet::Reg_cl); 1902 _mov(T_2, Src0Lo); 1903 _mov(T_3, Src0Hi); 1904 switch (Op) { 1905 default: 1906 assert(0 && "non-shift op"); 1907 break; 1908 case InstArithmetic::Shl: { 1909 // a=b<<c ==> 1910 // t3 = shld t3, t2, t1 1911 // t2 = shl t2, t1 1912 // test t1, 0x20 1913 // je L1 1914 // use(t3) 1915 // t3 = t2 1916 // t2 = 0 1917 _shld(T_3, T_2, T_1); 1918 _shl(T_2, T_1); 1919 _test(T_1, BitTest); 1920 _br(Traits::Cond::Br_e, Label); 1921 // T_2 and T_3 are being assigned again because of the intra-block control 1922 // flow, so we need to use _redefined to avoid liveness problems. 1923 _redefined(_mov(T_3, T_2)); 1924 _redefined(_mov(T_2, Zero)); 1925 } break; 1926 case InstArithmetic::Lshr: { 1927 // a=b>>c (unsigned) ==> 1928 // t2 = shrd t2, t3, t1 1929 // t3 = shr t3, t1 1930 // test t1, 0x20 1931 // je L1 1932 // use(t2) 1933 // t2 = t3 1934 // t3 = 0 1935 _shrd(T_2, T_3, T_1); 1936 _shr(T_3, T_1); 1937 _test(T_1, BitTest); 1938 _br(Traits::Cond::Br_e, Label); 1939 // T_2 and T_3 are being assigned again because of the intra-block control 1940 // flow, so we need to use _redefined to avoid liveness problems. 1941 _redefined(_mov(T_2, T_3)); 1942 _redefined(_mov(T_3, Zero)); 1943 } break; 1944 case InstArithmetic::Ashr: { 1945 // a=b>>c (signed) ==> 1946 // t2 = shrd t2, t3, t1 1947 // t3 = sar t3, t1 1948 // test t1, 0x20 1949 // je L1 1950 // use(t2) 1951 // t2 = t3 1952 // t3 = sar t3, 0x1f 1953 Constant *SignExtend = Ctx->getConstantInt32(0x1f); 1954 _shrd(T_2, T_3, T_1); 1955 _sar(T_3, T_1); 1956 _test(T_1, BitTest); 1957 _br(Traits::Cond::Br_e, Label); 1958 // T_2 and T_3 are being assigned again because of the intra-block control 1959 // flow, so T_2 needs to use _redefined to avoid liveness problems. T_3 1960 // doesn't need special treatment because it is reassigned via _sar 1961 // instead of _mov. 
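// Illustration of the fix-up path: for a runtime count of 40 the shrd/sar
// above only shift by 40 & 31 == 8 (x86 masks 32-bit shift counts to 5 bits),
// but "test t1, 0x20" sees bit 5 set, so the je is not taken and we fall
// through here to move t3 into t2 and sign-fill t3 before rejoining at the
// label.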
1962 _redefined(_mov(T_2, T_3)); 1963 _sar(T_3, SignExtend); 1964 } break; 1965 } 1966 // COMMON SUFFIX OF: a=b SHIFT_OP c ==> 1967 // L1: 1968 // a.lo = t2 1969 // a.hi = t3 1970 Context.insert(Label); 1971 _mov(DestLo, T_2); 1972 _mov(DestHi, T_3); 1973 } 1974 } 1975 1976 template <typename TraitsType> 1977 void TargetX86Base<TraitsType>::lowerArithmetic(const InstArithmetic *Instr) { 1978 Variable *Dest = Instr->getDest(); 1979 if (Dest->isRematerializable()) { 1980 Context.insert<InstFakeDef>(Dest); 1981 return; 1982 } 1983 Type Ty = Dest->getType(); 1984 Operand *Src0 = legalize(Instr->getSrc(0)); 1985 Operand *Src1 = legalize(Instr->getSrc(1)); 1986 if (Instr->isCommutative()) { 1987 uint32_t SwapCount = 0; 1988 if (!llvm::isa<Variable>(Src0) && llvm::isa<Variable>(Src1)) { 1989 std::swap(Src0, Src1); 1990 ++SwapCount; 1991 } 1992 if (llvm::isa<Constant>(Src0) && !llvm::isa<Constant>(Src1)) { 1993 std::swap(Src0, Src1); 1994 ++SwapCount; 1995 } 1996 // Improve two-address code patterns by avoiding a copy to the dest 1997 // register when one of the source operands ends its lifetime here. 1998 if (!Instr->isLastUse(Src0) && Instr->isLastUse(Src1)) { 1999 std::swap(Src0, Src1); 2000 ++SwapCount; 2001 } 2002 assert(SwapCount <= 1); 2003 (void)SwapCount; 2004 } 2005 if (!Traits::Is64Bit && Ty == IceType_i64) { 2006 // These x86-32 helper-call-involved instructions are lowered in this 2007 // separate switch. This is because loOperand() and hiOperand() may insert 2008 // redundant instructions for constant blinding and pooling. Such redundant 2009 // instructions will fail liveness analysis under -Om1 setting. And, 2010 // actually these arguments do not need to be processed with loOperand() 2011 // and hiOperand() to be used. 2012 switch (Instr->getOp()) { 2013 case InstArithmetic::Udiv: 2014 case InstArithmetic::Sdiv: 2015 case InstArithmetic::Urem: 2016 case InstArithmetic::Srem: 2017 llvm::report_fatal_error("Helper call was expected"); 2018 return; 2019 default: 2020 break; 2021 } 2022 2023 auto *DestLo = llvm::cast<Variable>(loOperand(Dest)); 2024 auto *DestHi = llvm::cast<Variable>(hiOperand(Dest)); 2025 Operand *Src0Lo = loOperand(Src0); 2026 Operand *Src0Hi = hiOperand(Src0); 2027 Operand *Src1Lo = loOperand(Src1); 2028 Operand *Src1Hi = hiOperand(Src1); 2029 Variable *T_Lo = nullptr, *T_Hi = nullptr; 2030 switch (Instr->getOp()) { 2031 case InstArithmetic::_num: 2032 llvm_unreachable("Unknown arithmetic operator"); 2033 break; 2034 case InstArithmetic::Add: 2035 _mov(T_Lo, Src0Lo); 2036 _add(T_Lo, Src1Lo); 2037 _mov(DestLo, T_Lo); 2038 _mov(T_Hi, Src0Hi); 2039 _adc(T_Hi, Src1Hi); 2040 _mov(DestHi, T_Hi); 2041 break; 2042 case InstArithmetic::And: 2043 _mov(T_Lo, Src0Lo); 2044 _and(T_Lo, Src1Lo); 2045 _mov(DestLo, T_Lo); 2046 _mov(T_Hi, Src0Hi); 2047 _and(T_Hi, Src1Hi); 2048 _mov(DestHi, T_Hi); 2049 break; 2050 case InstArithmetic::Or: 2051 _mov(T_Lo, Src0Lo); 2052 _or(T_Lo, Src1Lo); 2053 _mov(DestLo, T_Lo); 2054 _mov(T_Hi, Src0Hi); 2055 _or(T_Hi, Src1Hi); 2056 _mov(DestHi, T_Hi); 2057 break; 2058 case InstArithmetic::Xor: 2059 _mov(T_Lo, Src0Lo); 2060 _xor(T_Lo, Src1Lo); 2061 _mov(DestLo, T_Lo); 2062 _mov(T_Hi, Src0Hi); 2063 _xor(T_Hi, Src1Hi); 2064 _mov(DestHi, T_Hi); 2065 break; 2066 case InstArithmetic::Sub: 2067 _mov(T_Lo, Src0Lo); 2068 _sub(T_Lo, Src1Lo); 2069 _mov(DestLo, T_Lo); 2070 _mov(T_Hi, Src0Hi); 2071 _sbb(T_Hi, Src1Hi); 2072 _mov(DestHi, T_Hi); 2073 break; 2074 case InstArithmetic::Mul: { 2075 Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr; 2076 Variable 
*T_4Lo = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax); 2077 Variable *T_4Hi = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx); 2078 // gcc does the following: 2079 // a=b*c ==> 2080 // t1 = b.hi; t1 *=(imul) c.lo 2081 // t2 = c.hi; t2 *=(imul) b.lo 2082 // t3:eax = b.lo 2083 // t4.hi:edx,t4.lo:eax = t3:eax *(mul) c.lo 2084 // a.lo = t4.lo 2085 // t4.hi += t1 2086 // t4.hi += t2 2087 // a.hi = t4.hi 2088 // The mul instruction cannot take an immediate operand. 2089 Src1Lo = legalize(Src1Lo, Legal_Reg | Legal_Mem); 2090 _mov(T_1, Src0Hi); 2091 _imul(T_1, Src1Lo); 2092 _mov(T_3, Src0Lo, Traits::RegisterSet::Reg_eax); 2093 _mul(T_4Lo, T_3, Src1Lo); 2094 // The mul instruction produces two dest variables, edx:eax. We create a 2095 // fake definition of edx to account for this. 2096 Context.insert<InstFakeDef>(T_4Hi, T_4Lo); 2097 Context.insert<InstFakeUse>(T_4Hi); 2098 _mov(DestLo, T_4Lo); 2099 _add(T_4Hi, T_1); 2100 _mov(T_2, Src1Hi); 2101 Src0Lo = legalize(Src0Lo, Legal_Reg | Legal_Mem); 2102 _imul(T_2, Src0Lo); 2103 _add(T_4Hi, T_2); 2104 _mov(DestHi, T_4Hi); 2105 } break; 2106 case InstArithmetic::Shl: 2107 case InstArithmetic::Lshr: 2108 case InstArithmetic::Ashr: 2109 lowerShift64(Instr->getOp(), Src0Lo, Src0Hi, Src1Lo, DestLo, DestHi); 2110 break; 2111 case InstArithmetic::Fadd: 2112 case InstArithmetic::Fsub: 2113 case InstArithmetic::Fmul: 2114 case InstArithmetic::Fdiv: 2115 case InstArithmetic::Frem: 2116 llvm_unreachable("FP instruction with i64 type"); 2117 break; 2118 case InstArithmetic::Udiv: 2119 case InstArithmetic::Sdiv: 2120 case InstArithmetic::Urem: 2121 case InstArithmetic::Srem: 2122 llvm_unreachable("Call-helper-involved instruction for i64 type \ 2123 should have already been handled before"); 2124 break; 2125 } 2126 return; 2127 } 2128 if (isVectorType(Ty)) { 2129 // TODO: Trap on integer divide and integer modulo by zero. See: 2130 // https://code.google.com/p/nativeclient/issues/detail?id=3899 2131 if (llvm::isa<X86OperandMem>(Src1)) 2132 Src1 = legalizeToReg(Src1); 2133 switch (Instr->getOp()) { 2134 case InstArithmetic::_num: 2135 llvm_unreachable("Unknown arithmetic operator"); 2136 break; 2137 case InstArithmetic::Add: { 2138 Variable *T = makeReg(Ty); 2139 _movp(T, Src0); 2140 _padd(T, Src1); 2141 _movp(Dest, T); 2142 } break; 2143 case InstArithmetic::And: { 2144 Variable *T = makeReg(Ty); 2145 _movp(T, Src0); 2146 _pand(T, Src1); 2147 _movp(Dest, T); 2148 } break; 2149 case InstArithmetic::Or: { 2150 Variable *T = makeReg(Ty); 2151 _movp(T, Src0); 2152 _por(T, Src1); 2153 _movp(Dest, T); 2154 } break; 2155 case InstArithmetic::Xor: { 2156 Variable *T = makeReg(Ty); 2157 _movp(T, Src0); 2158 _pxor(T, Src1); 2159 _movp(Dest, T); 2160 } break; 2161 case InstArithmetic::Sub: { 2162 Variable *T = makeReg(Ty); 2163 _movp(T, Src0); 2164 _psub(T, Src1); 2165 _movp(Dest, T); 2166 } break; 2167 case InstArithmetic::Mul: { 2168 bool TypesAreValidForPmull = Ty == IceType_v4i32 || Ty == IceType_v8i16; 2169 bool InstructionSetIsValidForPmull = 2170 Ty == IceType_v8i16 || InstructionSet >= Traits::SSE4_1; 2171 if (TypesAreValidForPmull && InstructionSetIsValidForPmull) { 2172 Variable *T = makeReg(Ty); 2173 _movp(T, Src0); 2174 _pmull(T, Src0 == Src1 ? T : Src1); 2175 _movp(Dest, T); 2176 } else if (Ty == IceType_v4i32) { 2177 // Lowering sequence: 2178 // Note: The mask arguments have index 0 on the left. 
2179 // 2180 // movups T1, Src0 2181 // pshufd T2, Src0, {1,0,3,0} 2182 // pshufd T3, Src1, {1,0,3,0} 2183 // # T1 = {Src0[0] * Src1[0], Src0[2] * Src1[2]} 2184 // pmuludq T1, Src1 2185 // # T2 = {Src0[1] * Src1[1], Src0[3] * Src1[3]} 2186 // pmuludq T2, T3 2187 // # T1 = {lo(T1[0]), lo(T1[2]), lo(T2[0]), lo(T2[2])} 2188 // shufps T1, T2, {0,2,0,2} 2189 // pshufd T4, T1, {0,2,1,3} 2190 // movups Dest, T4 2191 2192 // Mask that directs pshufd to create a vector with entries 2193 // Src[1, 0, 3, 0] 2194 constexpr unsigned Constant1030 = 0x31; 2195 Constant *Mask1030 = Ctx->getConstantInt32(Constant1030); 2196 // Mask that directs shufps to create a vector with entries 2197 // Dest[0, 2], Src[0, 2] 2198 constexpr unsigned Mask0202 = 0x88; 2199 // Mask that directs pshufd to create a vector with entries 2200 // Src[0, 2, 1, 3] 2201 constexpr unsigned Mask0213 = 0xd8; 2202 Variable *T1 = makeReg(IceType_v4i32); 2203 Variable *T2 = makeReg(IceType_v4i32); 2204 Variable *T3 = makeReg(IceType_v4i32); 2205 Variable *T4 = makeReg(IceType_v4i32); 2206 _movp(T1, Src0); 2207 _pshufd(T2, Src0, Mask1030); 2208 _pshufd(T3, Src1, Mask1030); 2209 _pmuludq(T1, Src1); 2210 _pmuludq(T2, T3); 2211 _shufps(T1, T2, Ctx->getConstantInt32(Mask0202)); 2212 _pshufd(T4, T1, Ctx->getConstantInt32(Mask0213)); 2213 _movp(Dest, T4); 2214 } else if (Ty == IceType_v16i8) { 2215 llvm::report_fatal_error("Scalarized operation was expected"); 2216 } else { 2217 llvm::report_fatal_error("Invalid vector multiply type"); 2218 } 2219 } break; 2220 case InstArithmetic::Shl: { 2221 assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized"); 2222 Variable *T = makeReg(Ty); 2223 _movp(T, Src0); 2224 _psll(T, Src1); 2225 _movp(Dest, T); 2226 } break; 2227 case InstArithmetic::Lshr: { 2228 assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized"); 2229 Variable *T = makeReg(Ty); 2230 _movp(T, Src0); 2231 _psrl(T, Src1); 2232 _movp(Dest, T); 2233 } break; 2234 case InstArithmetic::Ashr: { 2235 assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized"); 2236 Variable *T = makeReg(Ty); 2237 _movp(T, Src0); 2238 _psra(T, Src1); 2239 _movp(Dest, T); 2240 } break; 2241 case InstArithmetic::Udiv: 2242 case InstArithmetic::Urem: 2243 case InstArithmetic::Sdiv: 2244 case InstArithmetic::Srem: 2245 llvm::report_fatal_error("Scalarized operation was expected"); 2246 break; 2247 case InstArithmetic::Fadd: { 2248 Variable *T = makeReg(Ty); 2249 _movp(T, Src0); 2250 _addps(T, Src1); 2251 _movp(Dest, T); 2252 } break; 2253 case InstArithmetic::Fsub: { 2254 Variable *T = makeReg(Ty); 2255 _movp(T, Src0); 2256 _subps(T, Src1); 2257 _movp(Dest, T); 2258 } break; 2259 case InstArithmetic::Fmul: { 2260 Variable *T = makeReg(Ty); 2261 _movp(T, Src0); 2262 _mulps(T, Src0 == Src1 ? 
T : Src1); 2263 _movp(Dest, T); 2264 } break; 2265 case InstArithmetic::Fdiv: { 2266 Variable *T = makeReg(Ty); 2267 _movp(T, Src0); 2268 _divps(T, Src1); 2269 _movp(Dest, T); 2270 } break; 2271 case InstArithmetic::Frem: 2272 llvm::report_fatal_error("Scalarized operation was expected"); 2273 break; 2274 } 2275 return; 2276 } 2277 Variable *T_edx = nullptr; 2278 Variable *T = nullptr; 2279 switch (Instr->getOp()) { 2280 case InstArithmetic::_num: 2281 llvm_unreachable("Unknown arithmetic operator"); 2282 break; 2283 case InstArithmetic::Add: { 2284 const bool ValidType = 2285 Ty == IceType_i32 || (Ty == IceType_i64 && Traits::Is64Bit); 2286 auto *Const = llvm::dyn_cast<Constant>(Instr->getSrc(1)); 2287 const bool ValidKind = 2288 Const != nullptr && (llvm::isa<ConstantInteger32>(Const) || 2289 llvm::isa<ConstantRelocatable>(Const)); 2290 if (getFlags().getAggressiveLea() && ValidType && ValidKind) { 2291 auto *Var = legalizeToReg(Src0); 2292 auto *Mem = Traits::X86OperandMem::create(Func, IceType_void, Var, Const); 2293 T = makeReg(Ty); 2294 _lea(T, _sandbox_mem_reference(Mem)); 2295 _mov(Dest, T); 2296 break; 2297 } 2298 _mov(T, Src0); 2299 _add(T, Src1); 2300 _mov(Dest, T); 2301 } break; 2302 case InstArithmetic::And: 2303 _mov(T, Src0); 2304 _and(T, Src1); 2305 _mov(Dest, T); 2306 break; 2307 case InstArithmetic::Or: 2308 _mov(T, Src0); 2309 _or(T, Src1); 2310 _mov(Dest, T); 2311 break; 2312 case InstArithmetic::Xor: 2313 _mov(T, Src0); 2314 _xor(T, Src1); 2315 _mov(Dest, T); 2316 break; 2317 case InstArithmetic::Sub: 2318 _mov(T, Src0); 2319 _sub(T, Src1); 2320 _mov(Dest, T); 2321 break; 2322 case InstArithmetic::Mul: 2323 if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) { 2324 if (optimizeScalarMul(Dest, Src0, C->getValue())) 2325 return; 2326 } 2327 // The 8-bit version of imul only allows the form "imul r/m8" where T must 2328 // be in al. 2329 if (isByteSizedArithType(Ty)) { 2330 _mov(T, Src0, Traits::RegisterSet::Reg_al); 2331 Src1 = legalize(Src1, Legal_Reg | Legal_Mem); 2332 _imul(T, Src0 == Src1 ? T : Src1); 2333 _mov(Dest, T); 2334 } else if (auto *ImmConst = llvm::dyn_cast<ConstantInteger32>(Src1)) { 2335 T = makeReg(Ty); 2336 Src0 = legalize(Src0, Legal_Reg | Legal_Mem); 2337 _imul_imm(T, Src0, ImmConst); 2338 _mov(Dest, T); 2339 } else { 2340 _mov(T, Src0); 2341 // No need to legalize Src1 to Reg | Mem because the Imm case is handled 2342 // already by the ConstantInteger32 case above. 2343 _imul(T, Src0 == Src1 ? T : Src1); 2344 _mov(Dest, T); 2345 } 2346 break; 2347 case InstArithmetic::Shl: 2348 _mov(T, Src0); 2349 if (!llvm::isa<ConstantInteger32>(Src1) && 2350 !llvm::isa<ConstantInteger64>(Src1)) 2351 Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl); 2352 _shl(T, Src1); 2353 _mov(Dest, T); 2354 break; 2355 case InstArithmetic::Lshr: 2356 _mov(T, Src0); 2357 if (!llvm::isa<ConstantInteger32>(Src1) && 2358 !llvm::isa<ConstantInteger64>(Src1)) 2359 Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl); 2360 _shr(T, Src1); 2361 _mov(Dest, T); 2362 break; 2363 case InstArithmetic::Ashr: 2364 _mov(T, Src0); 2365 if (!llvm::isa<ConstantInteger32>(Src1) && 2366 !llvm::isa<ConstantInteger64>(Src1)) 2367 Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl); 2368 _sar(T, Src1); 2369 _mov(Dest, T); 2370 break; 2371 case InstArithmetic::Udiv: { 2372 // div and idiv are the few arithmetic operators that do not allow 2373 // immediates as the operand. 
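// For reference, the 32-bit case below amounts to (sketch only):
//   mov eax, src0
//   mov edx, 0        ; zero-extend the dividend into edx:eax
//   div src1          ; quotient in eax, remainder in edx
//   mov dest, eax
// The i8 case uses al/ah instead, and the x86-64 i64 case uses rax/rdx.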
2374 Src1 = legalize(Src1, Legal_Reg | Legal_Mem); 2375 RegNumT Eax; 2376 RegNumT Edx; 2377 switch (Ty) { 2378 default: 2379 llvm::report_fatal_error("Bad type for udiv"); 2380 case IceType_i64: 2381 Eax = Traits::getRaxOrDie(); 2382 Edx = Traits::getRdxOrDie(); 2383 break; 2384 case IceType_i32: 2385 Eax = Traits::RegisterSet::Reg_eax; 2386 Edx = Traits::RegisterSet::Reg_edx; 2387 break; 2388 case IceType_i16: 2389 Eax = Traits::RegisterSet::Reg_ax; 2390 Edx = Traits::RegisterSet::Reg_dx; 2391 break; 2392 case IceType_i8: 2393 Eax = Traits::RegisterSet::Reg_al; 2394 Edx = Traits::RegisterSet::Reg_ah; 2395 break; 2396 } 2397 T_edx = makeReg(Ty, Edx); 2398 _mov(T, Src0, Eax); 2399 _mov(T_edx, Ctx->getConstantZero(Ty)); 2400 _div(T_edx, Src1, T); 2401 _redefined(Context.insert<InstFakeDef>(T, T_edx)); 2402 _mov(Dest, T); 2403 } break; 2404 case InstArithmetic::Sdiv: 2405 // TODO(stichnot): Enable this after doing better performance and cross 2406 // testing. 2407 if (false && Func->getOptLevel() >= Opt_1) { 2408 // Optimize division by constant power of 2, but not for Om1 or O0, just 2409 // to keep things simple there. 2410 if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) { 2411 const int32_t Divisor = C->getValue(); 2412 const uint32_t UDivisor = Divisor; 2413 if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) { 2414 uint32_t LogDiv = llvm::Log2_32(UDivisor); 2415 // LLVM does the following for dest=src/(1<<log): 2416 // t=src 2417 // sar t,typewidth-1 // -1 if src is negative, 0 if not 2418 // shr t,typewidth-log 2419 // add t,src 2420 // sar t,log 2421 // dest=t 2422 uint32_t TypeWidth = Traits::X86_CHAR_BIT * typeWidthInBytes(Ty); 2423 _mov(T, Src0); 2424 // If for some reason we are dividing by 1, just treat it like an 2425 // assignment. 2426 if (LogDiv > 0) { 2427 // The initial sar is unnecessary when dividing by 2. 
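// Worked example (illustrative, LogDiv == 2, i.e. dividing by 4): for
// src == -7 the sequence computes t = -1 (sar), then 3 (shr by 30), then
// -4 (add src), then -1 (sar by 2), matching C's truncating -7 / 4 == -1.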
2428 if (LogDiv > 1) 2429 _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1)); 2430 _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv)); 2431 _add(T, Src0); 2432 _sar(T, Ctx->getConstantInt(Ty, LogDiv)); 2433 } 2434 _mov(Dest, T); 2435 return; 2436 } 2437 } 2438 } 2439 Src1 = legalize(Src1, Legal_Reg | Legal_Mem); 2440 switch (Ty) { 2441 default: 2442 llvm::report_fatal_error("Bad type for sdiv"); 2443 case IceType_i64: 2444 T_edx = makeReg(Ty, Traits::getRdxOrDie()); 2445 _mov(T, Src0, Traits::getRaxOrDie()); 2446 break; 2447 case IceType_i32: 2448 T_edx = makeReg(Ty, Traits::RegisterSet::Reg_edx); 2449 _mov(T, Src0, Traits::RegisterSet::Reg_eax); 2450 break; 2451 case IceType_i16: 2452 T_edx = makeReg(Ty, Traits::RegisterSet::Reg_dx); 2453 _mov(T, Src0, Traits::RegisterSet::Reg_ax); 2454 break; 2455 case IceType_i8: 2456 T_edx = makeReg(IceType_i16, Traits::RegisterSet::Reg_ax); 2457 _mov(T, Src0, Traits::RegisterSet::Reg_al); 2458 break; 2459 } 2460 _cbwdq(T_edx, T); 2461 _idiv(T_edx, Src1, T); 2462 _redefined(Context.insert<InstFakeDef>(T, T_edx)); 2463 _mov(Dest, T); 2464 break; 2465 case InstArithmetic::Urem: { 2466 Src1 = legalize(Src1, Legal_Reg | Legal_Mem); 2467 RegNumT Eax; 2468 RegNumT Edx; 2469 switch (Ty) { 2470 default: 2471 llvm::report_fatal_error("Bad type for urem"); 2472 case IceType_i64: 2473 Eax = Traits::getRaxOrDie(); 2474 Edx = Traits::getRdxOrDie(); 2475 break; 2476 case IceType_i32: 2477 Eax = Traits::RegisterSet::Reg_eax; 2478 Edx = Traits::RegisterSet::Reg_edx; 2479 break; 2480 case IceType_i16: 2481 Eax = Traits::RegisterSet::Reg_ax; 2482 Edx = Traits::RegisterSet::Reg_dx; 2483 break; 2484 case IceType_i8: 2485 Eax = Traits::RegisterSet::Reg_al; 2486 Edx = Traits::RegisterSet::Reg_ah; 2487 break; 2488 } 2489 T_edx = makeReg(Ty, Edx); 2490 _mov(T_edx, Ctx->getConstantZero(Ty)); 2491 _mov(T, Src0, Eax); 2492 _div(T, Src1, T_edx); 2493 _redefined(Context.insert<InstFakeDef>(T_edx, T)); 2494 if (Ty == IceType_i8) { 2495 // Register ah must be moved into one of {al,bl,cl,dl} before it can be 2496 // moved into a general 8-bit register. 2497 auto *T_AhRcvr = makeReg(Ty); 2498 T_AhRcvr->setRegClass(RCX86_IsAhRcvr); 2499 _mov(T_AhRcvr, T_edx); 2500 T_edx = T_AhRcvr; 2501 } 2502 _mov(Dest, T_edx); 2503 } break; 2504 case InstArithmetic::Srem: { 2505 // TODO(stichnot): Enable this after doing better performance and cross 2506 // testing. 2507 if (false && Func->getOptLevel() >= Opt_1) { 2508 // Optimize mod by constant power of 2, but not for Om1 or O0, just to 2509 // keep things simple there. 2510 if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) { 2511 const int32_t Divisor = C->getValue(); 2512 const uint32_t UDivisor = Divisor; 2513 if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) { 2514 uint32_t LogDiv = llvm::Log2_32(UDivisor); 2515 // LLVM does the following for dest=src%(1<<log): 2516 // t=src 2517 // sar t,typewidth-1 // -1 if src is negative, 0 if not 2518 // shr t,typewidth-log 2519 // add t,src 2520 // and t, -(1<<log) 2521 // sub t,src 2522 // neg t 2523 // dest=t 2524 uint32_t TypeWidth = Traits::X86_CHAR_BIT * typeWidthInBytes(Ty); 2525 // If for some reason we are dividing by 1, just assign 0. 2526 if (LogDiv == 0) { 2527 _mov(Dest, Ctx->getConstantZero(Ty)); 2528 return; 2529 } 2530 _mov(T, Src0); 2531 // The initial sar is unnecessary when dividing by 2. 
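// Worked example (illustrative, LogDiv == 2): for src == -7 the sequence
// yields t = -1 (sar), 3 (shr), -4 (add src), -4 (and -4), 3 (sub src), and
// finally -3 (neg), matching C's truncating -7 % 4 == -3.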
2532 if (LogDiv > 1) 2533 _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1)); 2534 _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv)); 2535 _add(T, Src0); 2536 _and(T, Ctx->getConstantInt(Ty, -(1 << LogDiv))); 2537 _sub(T, Src0); 2538 _neg(T); 2539 _mov(Dest, T); 2540 return; 2541 } 2542 } 2543 } 2544 Src1 = legalize(Src1, Legal_Reg | Legal_Mem); 2545 RegNumT Eax; 2546 RegNumT Edx; 2547 switch (Ty) { 2548 default: 2549 llvm::report_fatal_error("Bad type for srem"); 2550 case IceType_i64: 2551 Eax = Traits::getRaxOrDie(); 2552 Edx = Traits::getRdxOrDie(); 2553 break; 2554 case IceType_i32: 2555 Eax = Traits::RegisterSet::Reg_eax; 2556 Edx = Traits::RegisterSet::Reg_edx; 2557 break; 2558 case IceType_i16: 2559 Eax = Traits::RegisterSet::Reg_ax; 2560 Edx = Traits::RegisterSet::Reg_dx; 2561 break; 2562 case IceType_i8: 2563 Eax = Traits::RegisterSet::Reg_al; 2564 Edx = Traits::RegisterSet::Reg_ah; 2565 break; 2566 } 2567 T_edx = makeReg(Ty, Edx); 2568 _mov(T, Src0, Eax); 2569 _cbwdq(T_edx, T); 2570 _idiv(T, Src1, T_edx); 2571 _redefined(Context.insert<InstFakeDef>(T_edx, T)); 2572 if (Ty == IceType_i8) { 2573 // Register ah must be moved into one of {al,bl,cl,dl} before it can be 2574 // moved into a general 8-bit register. 2575 auto *T_AhRcvr = makeReg(Ty); 2576 T_AhRcvr->setRegClass(RCX86_IsAhRcvr); 2577 _mov(T_AhRcvr, T_edx); 2578 T_edx = T_AhRcvr; 2579 } 2580 _mov(Dest, T_edx); 2581 } break; 2582 case InstArithmetic::Fadd: 2583 _mov(T, Src0); 2584 _addss(T, Src1); 2585 _mov(Dest, T); 2586 break; 2587 case InstArithmetic::Fsub: 2588 _mov(T, Src0); 2589 _subss(T, Src1); 2590 _mov(Dest, T); 2591 break; 2592 case InstArithmetic::Fmul: 2593 _mov(T, Src0); 2594 _mulss(T, Src0 == Src1 ? T : Src1); 2595 _mov(Dest, T); 2596 break; 2597 case InstArithmetic::Fdiv: 2598 _mov(T, Src0); 2599 _divss(T, Src1); 2600 _mov(Dest, T); 2601 break; 2602 case InstArithmetic::Frem: 2603 llvm::report_fatal_error("Helper call was expected"); 2604 break; 2605 } 2606 } 2607 2608 template <typename TraitsType> 2609 void TargetX86Base<TraitsType>::lowerAssign(const InstAssign *Instr) { 2610 Variable *Dest = Instr->getDest(); 2611 if (Dest->isRematerializable()) { 2612 Context.insert<InstFakeDef>(Dest); 2613 return; 2614 } 2615 Operand *Src = Instr->getSrc(0); 2616 assert(Dest->getType() == Src->getType()); 2617 lowerMove(Dest, Src, false); 2618 } 2619 2620 template <typename TraitsType> 2621 void TargetX86Base<TraitsType>::lowerBr(const InstBr *Br) { 2622 if (Br->isUnconditional()) { 2623 _br(Br->getTargetUnconditional()); 2624 return; 2625 } 2626 Operand *Cond = Br->getCondition(); 2627 2628 // Handle folding opportunities. 
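// For example (illustrative), given "y = icmp slt a, b" immediately consumed
// by "br y, L1, L2" with no other uses of y, the producer lookup below lets
// us emit a cmp/jl pair directly instead of materializing y as a 0/1 value
// and re-testing it against zero.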
2629 if (const Inst *Producer = FoldingInfo.getProducerFor(Cond)) { 2630 assert(Producer->isDeleted()); 2631 switch (BoolFolding<Traits>::getProducerKind(Producer)) { 2632 default: 2633 break; 2634 case BoolFolding<Traits>::PK_Icmp32: 2635 case BoolFolding<Traits>::PK_Icmp64: { 2636 lowerIcmpAndConsumer(llvm::cast<InstIcmp>(Producer), Br); 2637 return; 2638 } 2639 case BoolFolding<Traits>::PK_Fcmp: { 2640 lowerFcmpAndConsumer(llvm::cast<InstFcmp>(Producer), Br); 2641 return; 2642 } 2643 case BoolFolding<Traits>::PK_Arith: { 2644 lowerArithAndConsumer(llvm::cast<InstArithmetic>(Producer), Br); 2645 return; 2646 } 2647 } 2648 } 2649 Operand *Src0 = legalize(Cond, Legal_Reg | Legal_Mem); 2650 Constant *Zero = Ctx->getConstantZero(IceType_i32); 2651 _cmp(Src0, Zero); 2652 _br(Traits::Cond::Br_ne, Br->getTargetTrue(), Br->getTargetFalse()); 2653 } 2654 2655 // constexprMax returns a (constexpr) max(S0, S1), and it is used for defining 2656 // OperandList in lowerCall. std::max() is supposed to work, but it doesn't. 2657 inline constexpr SizeT constexprMax(SizeT S0, SizeT S1) { 2658 return S0 < S1 ? S1 : S0; 2659 } 2660 2661 template <typename TraitsType> 2662 void TargetX86Base<TraitsType>::lowerCall(const InstCall *Instr) { 2663 // Common x86 calling convention lowering: 2664 // 2665 // * At the point before the call, the stack must be aligned to 16 bytes. 2666 // 2667 // * Non-register arguments are pushed onto the stack in right-to-left order, 2668 // such that the left-most argument ends up on the top of the stack at the 2669 // lowest memory address. 2670 // 2671 // * Stack arguments of vector type are aligned to start at the next highest 2672 // multiple of 16 bytes. Other stack arguments are aligned to the next word 2673 // size boundary (4 or 8 bytes, respectively). 2674 RequiredStackAlignment = std::max<size_t>(RequiredStackAlignment, 2675 Traits::X86_STACK_ALIGNMENT_BYTES); 2676 2677 constexpr SizeT MaxOperands = 2678 constexprMax(Traits::X86_MAX_XMM_ARGS, Traits::X86_MAX_GPR_ARGS); 2679 using OperandList = llvm::SmallVector<Operand *, MaxOperands>; 2680 2681 OperandList XmmArgs; 2682 llvm::SmallVector<SizeT, MaxOperands> XmmArgIndices; 2683 CfgVector<std::pair<const Type, Operand *>> GprArgs; 2684 CfgVector<SizeT> GprArgIndices; 2685 OperandList StackArgs, StackArgLocations; 2686 uint32_t ParameterAreaSizeBytes = 0; 2687 2688 ParameterAreaSizeBytes += getShadowStoreSize<Traits>(); 2689 2690 // Classify each argument operand according to the location where the argument 2691 // is passed. 2692 for (SizeT i = 0, NumArgs = Instr->getNumArgs(); i < NumArgs; ++i) { 2693 Operand *Arg = Instr->getArg(i); 2694 const Type Ty = Arg->getType(); 2695 // The PNaCl ABI requires the width of arguments to be at least 32 bits. 2696 assert(typeWidthInBytes(Ty) >= 4); 2697 if (isVectorType(Ty) && 2698 Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, XmmArgs.size())) 2699 .hasValue()) { 2700 XmmArgs.push_back(Arg); 2701 XmmArgIndices.push_back(i); 2702 } else if (isScalarFloatingType(Ty) && Traits::X86_PASS_SCALAR_FP_IN_XMM && 2703 Traits::getRegisterForXmmArgNum( 2704 Traits::getArgIndex(i, XmmArgs.size())) 2705 .hasValue()) { 2706 XmmArgs.push_back(Arg); 2707 XmmArgIndices.push_back(i); 2708 } else if (isScalarIntegerType(Ty) && 2709 Traits::getRegisterForGprArgNum( 2710 Ty, Traits::getArgIndex(i, GprArgs.size())) 2711 .hasValue()) { 2712 GprArgs.emplace_back(Ty, Arg); 2713 GprArgIndices.push_back(i); 2714 } else { 2715 // Place on stack. 
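// Illustrative bookkeeping (assumed x86-32, both arguments forced onto the
// stack, no shadow store): an i32 placed at offset 0 advances
// ParameterAreaSizeBytes to 4, and a following 16-byte vector is first
// aligned up to offset 16 and advances it to 32.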
2716 StackArgs.push_back(Arg); 2717 if (isVectorType(Arg->getType())) { 2718 ParameterAreaSizeBytes = 2719 Traits::applyStackAlignment(ParameterAreaSizeBytes); 2720 } 2721 Variable *esp = getPhysicalRegister(getStackReg(), Traits::WordType); 2722 Constant *Loc = Ctx->getConstantInt32(ParameterAreaSizeBytes); 2723 StackArgLocations.push_back( 2724 Traits::X86OperandMem::create(Func, Ty, esp, Loc)); 2725 ParameterAreaSizeBytes += typeWidthInBytesOnStack(Arg->getType()); 2726 } 2727 } 2728 // Ensure there is enough space for the fstp/movs for floating returns. 2729 Variable *Dest = Instr->getDest(); 2730 const Type DestTy = Dest ? Dest->getType() : IceType_void; 2731 if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) { 2732 if (isScalarFloatingType(DestTy)) { 2733 ParameterAreaSizeBytes = 2734 std::max(static_cast<size_t>(ParameterAreaSizeBytes), 2735 typeWidthInBytesOnStack(DestTy)); 2736 } 2737 } 2738 // Adjust the parameter area so that the stack is aligned. It is assumed that 2739 // the stack is already aligned at the start of the calling sequence. 2740 ParameterAreaSizeBytes = Traits::applyStackAlignment(ParameterAreaSizeBytes); 2741 assert(ParameterAreaSizeBytes <= maxOutArgsSizeBytes()); 2742 // Copy arguments that are passed on the stack to the appropriate stack 2743 // locations. We make sure legalize() is called on each argument at this 2744 // point, to allow availabilityGet() to work. 2745 for (SizeT i = 0, NumStackArgs = StackArgs.size(); i < NumStackArgs; ++i) { 2746 lowerStore( 2747 InstStore::create(Func, legalize(StackArgs[i]), StackArgLocations[i])); 2748 } 2749 // Copy arguments to be passed in registers to the appropriate registers. 2750 for (SizeT i = 0, NumXmmArgs = XmmArgs.size(); i < NumXmmArgs; ++i) { 2751 XmmArgs[i] = legalizeToReg(legalize(XmmArgs[i]), 2752 Traits::getRegisterForXmmArgNum( 2753 Traits::getArgIndex(XmmArgIndices[i], i))); 2754 } 2755 // Materialize moves for arguments passed in GPRs. 2756 for (SizeT i = 0, NumGprArgs = GprArgs.size(); i < NumGprArgs; ++i) { 2757 const Type SignatureTy = GprArgs[i].first; 2758 Operand *Arg = 2759 legalize(GprArgs[i].second, Legal_Default | Legal_Rematerializable); 2760 GprArgs[i].second = legalizeToReg( 2761 Arg, Traits::getRegisterForGprArgNum( 2762 Arg->getType(), Traits::getArgIndex(GprArgIndices[i], i))); 2763 assert(SignatureTy == IceType_i64 || SignatureTy == IceType_i32); 2764 assert(SignatureTy == Arg->getType()); 2765 (void)SignatureTy; 2766 } 2767 // Generate a FakeUse of register arguments so that they do not get dead code 2768 // eliminated as a result of the FakeKill of scratch registers after the call. 2769 // These need to be right before the call instruction. 2770 for (auto *Arg : XmmArgs) { 2771 Context.insert<InstFakeUse>(llvm::cast<Variable>(Arg)); 2772 } 2773 for (auto &ArgPair : GprArgs) { 2774 Context.insert<InstFakeUse>(llvm::cast<Variable>(ArgPair.second)); 2775 } 2776 // Generate the call instruction. Assign its result to a temporary with high 2777 // register allocation weight. 2778 // ReturnReg doubles as ReturnRegLo as necessary. 
2779 Variable *ReturnReg = nullptr; 2780 Variable *ReturnRegHi = nullptr; 2781 if (Dest) { 2782 switch (DestTy) { 2783 case IceType_NUM: 2784 case IceType_void: 2785 case IceType_i1: 2786 case IceType_i8: 2787 case IceType_i16: 2788 llvm::report_fatal_error("Invalid Call dest type"); 2789 break; 2790 case IceType_i32: 2791 ReturnReg = makeReg(DestTy, Traits::RegisterSet::Reg_eax); 2792 break; 2793 case IceType_i64: 2794 if (Traits::Is64Bit) { 2795 ReturnReg = makeReg(IceType_i64, Traits::getRaxOrDie()); 2796 } else { 2797 ReturnReg = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax); 2798 ReturnRegHi = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx); 2799 } 2800 break; 2801 case IceType_f32: 2802 case IceType_f64: 2803 if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) { 2804 // Leave ReturnReg==ReturnRegHi==nullptr, and capture the result with 2805 // the fstp instruction. 2806 break; 2807 } 2808 // Fallthrough intended. 2809 case IceType_v4i1: 2810 case IceType_v8i1: 2811 case IceType_v16i1: 2812 case IceType_v16i8: 2813 case IceType_v8i16: 2814 case IceType_v4i32: 2815 case IceType_v4f32: 2816 ReturnReg = makeReg(DestTy, Traits::RegisterSet::Reg_xmm0); 2817 break; 2818 } 2819 } 2820 // Emit the call to the function. 2821 Operand *CallTarget = 2822 legalize(Instr->getCallTarget(), Legal_Reg | Legal_Imm | Legal_AddrAbs); 2823 size_t NumVariadicFpArgs = Instr->isVariadic() ? XmmArgs.size() : 0; 2824 Inst *NewCall = emitCallToTarget(CallTarget, ReturnReg, NumVariadicFpArgs); 2825 // Keep the upper return register live on 32-bit platform. 2826 if (ReturnRegHi) 2827 Context.insert<InstFakeDef>(ReturnRegHi); 2828 // Mark the call as killing all the caller-save registers. 2829 Context.insert<InstFakeKill>(NewCall); 2830 // Handle x86-32 floating point returns. 2831 if (Dest != nullptr && isScalarFloatingType(DestTy) && 2832 !Traits::X86_PASS_SCALAR_FP_IN_XMM) { 2833 // Special treatment for an FP function which returns its result in st(0). 2834 // If Dest ends up being a physical xmm register, the fstp emit code will 2835 // route st(0) through the space reserved in the function argument area 2836 // we allocated. 2837 _fstp(Dest); 2838 // Create a fake use of Dest in case it actually isn't used, because st(0) 2839 // still needs to be popped. 2840 Context.insert<InstFakeUse>(Dest); 2841 } 2842 // Generate a FakeUse to keep the call live if necessary. 2843 if (Instr->hasSideEffects() && ReturnReg) { 2844 Context.insert<InstFakeUse>(ReturnReg); 2845 } 2846 // Process the return value, if any. 2847 if (Dest == nullptr) 2848 return; 2849 // Assign the result of the call to Dest. Route it through a temporary so 2850 // that the local register availability peephole can be subsequently used. 
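// Sketch of the resulting pattern for the scalar integer case (illustrative
// register name):
//   tmp = eax    ; tmp is pre-colored to the return register
//   Dest = tmp   ; a later use of Dest can then be satisfied from the
//                ; register by the availability peephole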
2851 Variable *Tmp = nullptr; 2852 if (isVectorType(DestTy)) { 2853 assert(ReturnReg && "Vector type requires a return register"); 2854 Tmp = makeReg(DestTy); 2855 _movp(Tmp, ReturnReg); 2856 _movp(Dest, Tmp); 2857 } else if (isScalarFloatingType(DestTy)) { 2858 if (Traits::X86_PASS_SCALAR_FP_IN_XMM) { 2859 assert(ReturnReg && "FP type requires a return register"); 2860 _mov(Tmp, ReturnReg); 2861 _mov(Dest, Tmp); 2862 } 2863 } else { 2864 assert(isScalarIntegerType(DestTy)); 2865 assert(ReturnReg && "Integer type requires a return register"); 2866 if (DestTy == IceType_i64 && !Traits::Is64Bit) { 2867 assert(ReturnRegHi && "64-bit type requires two return registers"); 2868 auto *Dest64On32 = llvm::cast<Variable64On32>(Dest); 2869 Variable *DestLo = Dest64On32->getLo(); 2870 Variable *DestHi = Dest64On32->getHi(); 2871 _mov(Tmp, ReturnReg); 2872 _mov(DestLo, Tmp); 2873 Variable *TmpHi = nullptr; 2874 _mov(TmpHi, ReturnRegHi); 2875 _mov(DestHi, TmpHi); 2876 } else { 2877 _mov(Tmp, ReturnReg); 2878 _mov(Dest, Tmp); 2879 } 2880 } 2881 } 2882 2883 template <typename TraitsType> 2884 void TargetX86Base<TraitsType>::lowerCast(const InstCast *Instr) { 2885 // a = cast(b) ==> t=cast(b); a=t; (link t->b, link a->t, no overlap) 2886 InstCast::OpKind CastKind = Instr->getCastKind(); 2887 Variable *Dest = Instr->getDest(); 2888 Type DestTy = Dest->getType(); 2889 switch (CastKind) { 2890 default: 2891 Func->setError("Cast type not supported"); 2892 return; 2893 case InstCast::Sext: { 2894 // Src0RM is the source operand legalized to physical register or memory, 2895 // but not immediate, since the relevant x86 native instructions don't 2896 // allow an immediate operand. If the operand is an immediate, we could 2897 // consider computing the strength-reduced result at translation time, but 2898 // we're unlikely to see something like that in the bitcode that the 2899 // optimizer wouldn't have already taken care of. 2900 Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem); 2901 if (isVectorType(DestTy)) { 2902 if (DestTy == IceType_v16i8) { 2903 // onemask = materialize(1,1,...); dst = (src & onemask) > 0 2904 Variable *OneMask = makeVectorOfOnes(DestTy); 2905 Variable *T = makeReg(DestTy); 2906 _movp(T, Src0RM); 2907 _pand(T, OneMask); 2908 Variable *Zeros = makeVectorOfZeros(DestTy); 2909 _pcmpgt(T, Zeros); 2910 _movp(Dest, T); 2911 } else { 2912 /// width = width(elty) - 1; dest = (src << width) >> width 2913 SizeT ShiftAmount = 2914 Traits::X86_CHAR_BIT * typeWidthInBytes(typeElementType(DestTy)) - 2915 1; 2916 Constant *ShiftConstant = Ctx->getConstantInt8(ShiftAmount); 2917 Variable *T = makeReg(DestTy); 2918 _movp(T, Src0RM); 2919 _psll(T, ShiftConstant); 2920 _psra(T, ShiftConstant); 2921 _movp(Dest, T); 2922 } 2923 } else if (!Traits::Is64Bit && DestTy == IceType_i64) { 2924 // t1=movsx src; t2=t1; t2=sar t2, 31; dst.lo=t1; dst.hi=t2 2925 Constant *Shift = Ctx->getConstantInt32(31); 2926 auto *DestLo = llvm::cast<Variable>(loOperand(Dest)); 2927 auto *DestHi = llvm::cast<Variable>(hiOperand(Dest)); 2928 Variable *T_Lo = makeReg(DestLo->getType()); 2929 if (Src0RM->getType() == IceType_i32) { 2930 _mov(T_Lo, Src0RM); 2931 } else if (Src0RM->getType() == IceType_i1) { 2932 _movzx(T_Lo, Src0RM); 2933 _shl(T_Lo, Shift); 2934 _sar(T_Lo, Shift); 2935 } else { 2936 _movsx(T_Lo, Src0RM); 2937 } 2938 _mov(DestLo, T_Lo); 2939 Variable *T_Hi = nullptr; 2940 _mov(T_Hi, T_Lo); 2941 if (Src0RM->getType() != IceType_i1) 2942 // For i1, the sar instruction is already done above. 
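// Illustration of the split sign-extension: the low word is copied as-is and
// the sar by 31 below produces the high word, e.g. 0x80000000 yields a high
// word of 0xffffffff while 0x7fffffff yields 0x00000000.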
2943 _sar(T_Hi, Shift); 2944 _mov(DestHi, T_Hi); 2945 } else if (Src0RM->getType() == IceType_i1) { 2946 // t1 = src 2947 // shl t1, dst_bitwidth - 1 2948 // sar t1, dst_bitwidth - 1 2949 // dst = t1 2950 size_t DestBits = Traits::X86_CHAR_BIT * typeWidthInBytes(DestTy); 2951 Constant *ShiftAmount = Ctx->getConstantInt32(DestBits - 1); 2952 Variable *T = makeReg(DestTy); 2953 if (typeWidthInBytes(DestTy) <= typeWidthInBytes(Src0RM->getType())) { 2954 _mov(T, Src0RM); 2955 } else { 2956 // Widen the source using movsx or movzx. (It doesn't matter which one, 2957 // since the following shl/sar overwrite the bits.) 2958 _movzx(T, Src0RM); 2959 } 2960 _shl(T, ShiftAmount); 2961 _sar(T, ShiftAmount); 2962 _mov(Dest, T); 2963 } else { 2964 // t1 = movsx src; dst = t1 2965 Variable *T = makeReg(DestTy); 2966 _movsx(T, Src0RM); 2967 _mov(Dest, T); 2968 } 2969 break; 2970 } 2971 case InstCast::Zext: { 2972 Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem); 2973 if (isVectorType(DestTy)) { 2974 // onemask = materialize(1,1,...); dest = onemask & src 2975 Variable *OneMask = makeVectorOfOnes(DestTy); 2976 Variable *T = makeReg(DestTy); 2977 _movp(T, Src0RM); 2978 _pand(T, OneMask); 2979 _movp(Dest, T); 2980 } else if (!Traits::Is64Bit && DestTy == IceType_i64) { 2981 // t1=movzx src; dst.lo=t1; dst.hi=0 2982 Constant *Zero = Ctx->getConstantZero(IceType_i32); 2983 auto *DestLo = llvm::cast<Variable>(loOperand(Dest)); 2984 auto *DestHi = llvm::cast<Variable>(hiOperand(Dest)); 2985 Variable *Tmp = makeReg(DestLo->getType()); 2986 if (Src0RM->getType() == IceType_i32) { 2987 _mov(Tmp, Src0RM); 2988 } else { 2989 _movzx(Tmp, Src0RM); 2990 } 2991 _mov(DestLo, Tmp); 2992 _mov(DestHi, Zero); 2993 } else if (Src0RM->getType() == IceType_i1) { 2994 // t = Src0RM; Dest = t 2995 Variable *T = nullptr; 2996 if (DestTy == IceType_i8) { 2997 _mov(T, Src0RM); 2998 } else { 2999 assert(DestTy != IceType_i1); 3000 assert(Traits::Is64Bit || DestTy != IceType_i64); 3001 // Use 32-bit for both 16-bit and 32-bit, since 32-bit ops are shorter. 3002 // In x86-64 we need to widen T to 64-bits to ensure that T -- if 3003 // written to the stack (i.e., in -Om1) will be fully zero-extended. 3004 T = makeReg(DestTy == IceType_i64 ? IceType_i64 : IceType_i32); 3005 _movzx(T, Src0RM); 3006 } 3007 _mov(Dest, T); 3008 } else { 3009 // t1 = movzx src; dst = t1 3010 Variable *T = makeReg(DestTy); 3011 _movzx(T, Src0RM); 3012 _mov(Dest, T); 3013 } 3014 break; 3015 } 3016 case InstCast::Trunc: { 3017 if (isVectorType(DestTy)) { 3018 // onemask = materialize(1,1,...); dst = src & onemask 3019 Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem); 3020 Type Src0Ty = Src0RM->getType(); 3021 Variable *OneMask = makeVectorOfOnes(Src0Ty); 3022 Variable *T = makeReg(DestTy); 3023 _movp(T, Src0RM); 3024 _pand(T, OneMask); 3025 _movp(Dest, T); 3026 } else if (DestTy == IceType_i1 || DestTy == IceType_i8) { 3027 // Make sure we truncate from and into valid registers. 
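// On x86-32 only al/cl/dl/bl are encodable as 8-bit register operands, so
// copyToReg8() below forces the (possibly 32-bit) source into one of those
// registers first; e.g. a value living in esi must be copied before its low
// byte can be moved into Dest.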
3028 Operand *Src0 = legalizeUndef(Instr->getSrc(0)); 3029 if (!Traits::Is64Bit && Src0->getType() == IceType_i64) 3030 Src0 = loOperand(Src0); 3031 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 3032 Variable *T = copyToReg8(Src0RM); 3033 if (DestTy == IceType_i1) 3034 _and(T, Ctx->getConstantInt1(1)); 3035 _mov(Dest, T); 3036 } else { 3037 Operand *Src0 = legalizeUndef(Instr->getSrc(0)); 3038 if (!Traits::Is64Bit && Src0->getType() == IceType_i64) 3039 Src0 = loOperand(Src0); 3040 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 3041 // t1 = trunc Src0RM; Dest = t1 3042 Variable *T = makeReg(DestTy); 3043 _mov(T, Src0RM); 3044 _mov(Dest, T); 3045 } 3046 break; 3047 } 3048 case InstCast::Fptrunc: 3049 case InstCast::Fpext: { 3050 Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem); 3051 // t1 = cvt Src0RM; Dest = t1 3052 Variable *T = makeReg(DestTy); 3053 _cvt(T, Src0RM, Traits::Insts::Cvt::Float2float); 3054 _mov(Dest, T); 3055 break; 3056 } 3057 case InstCast::Fptosi: 3058 if (isVectorType(DestTy)) { 3059 assert(DestTy == IceType_v4i32); 3060 assert(Instr->getSrc(0)->getType() == IceType_v4f32); 3061 Operand *Src0R = legalizeToReg(Instr->getSrc(0)); 3062 Variable *T = makeReg(DestTy); 3063 _cvt(T, Src0R, Traits::Insts::Cvt::Tps2dq); 3064 _movp(Dest, T); 3065 } else if (!Traits::Is64Bit && DestTy == IceType_i64) { 3066 llvm::report_fatal_error("Helper call was expected"); 3067 } else { 3068 Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem); 3069 // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type 3070 Variable *T_1 = nullptr; 3071 if (Traits::Is64Bit && DestTy == IceType_i64) { 3072 T_1 = makeReg(IceType_i64); 3073 } else { 3074 assert(DestTy != IceType_i64); 3075 T_1 = makeReg(IceType_i32); 3076 } 3077 // cvt() requires its integer argument to be a GPR. 
3078 Variable *T_2 = makeReg(DestTy); 3079 if (isByteSizedType(DestTy)) { 3080 assert(T_1->getType() == IceType_i32); 3081 T_1->setRegClass(RCX86_Is32To8); 3082 T_2->setRegClass(RCX86_IsTrunc8Rcvr); 3083 } 3084 _cvt(T_1, Src0RM, Traits::Insts::Cvt::Tss2si); 3085 _mov(T_2, T_1); // T_1 and T_2 may have different integer types 3086 if (DestTy == IceType_i1) 3087 _and(T_2, Ctx->getConstantInt1(1)); 3088 _mov(Dest, T_2); 3089 } 3090 break; 3091 case InstCast::Fptoui: 3092 if (isVectorType(DestTy)) { 3093 llvm::report_fatal_error("Helper call was expected"); 3094 } else if (DestTy == IceType_i64 || 3095 (!Traits::Is64Bit && DestTy == IceType_i32)) { 3096 llvm::report_fatal_error("Helper call was expected"); 3097 } else { 3098 Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem); 3099 // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type 3100 assert(DestTy != IceType_i64); 3101 Variable *T_1 = nullptr; 3102 if (Traits::Is64Bit && DestTy == IceType_i32) { 3103 T_1 = makeReg(IceType_i64); 3104 } else { 3105 assert(DestTy != IceType_i32); 3106 T_1 = makeReg(IceType_i32); 3107 } 3108 Variable *T_2 = makeReg(DestTy); 3109 if (isByteSizedType(DestTy)) { 3110 assert(T_1->getType() == IceType_i32); 3111 T_1->setRegClass(RCX86_Is32To8); 3112 T_2->setRegClass(RCX86_IsTrunc8Rcvr); 3113 } 3114 _cvt(T_1, Src0RM, Traits::Insts::Cvt::Tss2si); 3115 _mov(T_2, T_1); // T_1 and T_2 may have different integer types 3116 if (DestTy == IceType_i1) 3117 _and(T_2, Ctx->getConstantInt1(1)); 3118 _mov(Dest, T_2); 3119 } 3120 break; 3121 case InstCast::Sitofp: 3122 if (isVectorType(DestTy)) { 3123 assert(DestTy == IceType_v4f32); 3124 assert(Instr->getSrc(0)->getType() == IceType_v4i32); 3125 Operand *Src0R = legalizeToReg(Instr->getSrc(0)); 3126 Variable *T = makeReg(DestTy); 3127 _cvt(T, Src0R, Traits::Insts::Cvt::Dq2ps); 3128 _movp(Dest, T); 3129 } else if (!Traits::Is64Bit && Instr->getSrc(0)->getType() == IceType_i64) { 3130 llvm::report_fatal_error("Helper call was expected"); 3131 } else { 3132 Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem); 3133 // Sign-extend the operand. 3134 // t1.i32 = movsx Src0RM; t2 = Cvt t1.i32; Dest = t2 3135 Variable *T_1 = nullptr; 3136 if (Traits::Is64Bit && Src0RM->getType() == IceType_i64) { 3137 T_1 = makeReg(IceType_i64); 3138 } else { 3139 assert(Src0RM->getType() != IceType_i64); 3140 T_1 = makeReg(IceType_i32); 3141 } 3142 Variable *T_2 = makeReg(DestTy); 3143 if (Src0RM->getType() == T_1->getType()) 3144 _mov(T_1, Src0RM); 3145 else 3146 _movsx(T_1, Src0RM); 3147 _cvt(T_2, T_1, Traits::Insts::Cvt::Si2ss); 3148 _mov(Dest, T_2); 3149 } 3150 break; 3151 case InstCast::Uitofp: { 3152 Operand *Src0 = Instr->getSrc(0); 3153 if (isVectorType(Src0->getType())) { 3154 llvm::report_fatal_error("Helper call was expected"); 3155 } else if (Src0->getType() == IceType_i64 || 3156 (!Traits::Is64Bit && Src0->getType() == IceType_i32)) { 3157 llvm::report_fatal_error("Helper call was expected"); 3158 } else { 3159 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 3160 // Zero-extend the operand. 
3161 // t1.i32 = movzx Src0RM; t2 = Cvt t1.i32; Dest = t2 3162 Variable *T_1 = nullptr; 3163 if (Traits::Is64Bit && Src0RM->getType() == IceType_i32) { 3164 T_1 = makeReg(IceType_i64); 3165 } else { 3166 assert(Src0RM->getType() != IceType_i64); 3167 assert(Traits::Is64Bit || Src0RM->getType() != IceType_i32); 3168 T_1 = makeReg(IceType_i32); 3169 } 3170 Variable *T_2 = makeReg(DestTy); 3171 if (Src0RM->getType() == T_1->getType()) 3172 _mov(T_1, Src0RM); 3173 else 3174 _movzx(T_1, Src0RM)->setMustKeep(); 3175 _cvt(T_2, T_1, Traits::Insts::Cvt::Si2ss); 3176 _mov(Dest, T_2); 3177 } 3178 break; 3179 } 3180 case InstCast::Bitcast: { 3181 Operand *Src0 = Instr->getSrc(0); 3182 if (DestTy == Src0->getType()) { 3183 auto *Assign = InstAssign::create(Func, Dest, Src0); 3184 lowerAssign(Assign); 3185 return; 3186 } 3187 switch (DestTy) { 3188 default: 3189 llvm_unreachable("Unexpected Bitcast dest type"); 3190 case IceType_i8: { 3191 llvm::report_fatal_error("Helper call was expected"); 3192 } break; 3193 case IceType_i16: { 3194 llvm::report_fatal_error("Helper call was expected"); 3195 } break; 3196 case IceType_i32: 3197 case IceType_f32: { 3198 Variable *Src0R = legalizeToReg(Src0); 3199 Variable *T = makeReg(DestTy); 3200 _movd(T, Src0R); 3201 _mov(Dest, T); 3202 } break; 3203 case IceType_i64: { 3204 assert(Src0->getType() == IceType_f64); 3205 if (Traits::Is64Bit) { 3206 Variable *Src0R = legalizeToReg(Src0); 3207 Variable *T = makeReg(IceType_i64); 3208 _movd(T, Src0R); 3209 _mov(Dest, T); 3210 } else { 3211 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 3212 // a.i64 = bitcast b.f64 ==> 3213 // s.f64 = spill b.f64 3214 // t_lo.i32 = lo(s.f64) 3215 // a_lo.i32 = t_lo.i32 3216 // t_hi.i32 = hi(s.f64) 3217 // a_hi.i32 = t_hi.i32 3218 Operand *SpillLo, *SpillHi; 3219 if (auto *Src0Var = llvm::dyn_cast<Variable>(Src0RM)) { 3220 Variable *Spill = Func->makeVariable(IceType_f64); 3221 Spill->setLinkedTo(Src0Var); 3222 Spill->setMustNotHaveReg(); 3223 _movq(Spill, Src0RM); 3224 SpillLo = Traits::VariableSplit::create(Func, Spill, 3225 Traits::VariableSplit::Low); 3226 SpillHi = Traits::VariableSplit::create(Func, Spill, 3227 Traits::VariableSplit::High); 3228 } else { 3229 SpillLo = loOperand(Src0RM); 3230 SpillHi = hiOperand(Src0RM); 3231 } 3232 3233 auto *DestLo = llvm::cast<Variable>(loOperand(Dest)); 3234 auto *DestHi = llvm::cast<Variable>(hiOperand(Dest)); 3235 Variable *T_Lo = makeReg(IceType_i32); 3236 Variable *T_Hi = makeReg(IceType_i32); 3237 3238 _mov(T_Lo, SpillLo); 3239 _mov(DestLo, T_Lo); 3240 _mov(T_Hi, SpillHi); 3241 _mov(DestHi, T_Hi); 3242 } 3243 } break; 3244 case IceType_f64: { 3245 assert(Src0->getType() == IceType_i64); 3246 if (Traits::Is64Bit) { 3247 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 3248 Variable *T = makeReg(IceType_f64); 3249 _movd(T, Src0RM); 3250 _mov(Dest, T); 3251 } else { 3252 Src0 = legalize(Src0); 3253 if (llvm::isa<X86OperandMem>(Src0)) { 3254 Variable *T = makeReg(DestTy); 3255 _movq(T, Src0); 3256 _movq(Dest, T); 3257 break; 3258 } 3259 // a.f64 = bitcast b.i64 ==> 3260 // t_lo.i32 = b_lo.i32 3261 // FakeDef(s.f64) 3262 // lo(s.f64) = t_lo.i32 3263 // t_hi.i32 = b_hi.i32 3264 // hi(s.f64) = t_hi.i32 3265 // a.f64 = s.f64 3266 Variable *Spill = Func->makeVariable(IceType_f64); 3267 Spill->setLinkedTo(Dest); 3268 Spill->setMustNotHaveReg(); 3269 3270 Variable *T_Lo = nullptr, *T_Hi = nullptr; 3271 auto *SpillLo = Traits::VariableSplit::create( 3272 Func, Spill, Traits::VariableSplit::Low); 3273 auto *SpillHi = 
Traits::VariableSplit::create( 3274 Func, Spill, Traits::VariableSplit::High); 3275 _mov(T_Lo, loOperand(Src0)); 3276 // Technically, the Spill is defined after the _store happens, but 3277 // SpillLo is considered a "use" of Spill so define Spill before it is 3278 // used. 3279 Context.insert<InstFakeDef>(Spill); 3280 _store(T_Lo, SpillLo); 3281 _mov(T_Hi, hiOperand(Src0)); 3282 _store(T_Hi, SpillHi); 3283 _movq(Dest, Spill); 3284 } 3285 } break; 3286 case IceType_v8i1: { 3287 llvm::report_fatal_error("Helper call was expected"); 3288 } break; 3289 case IceType_v16i1: { 3290 llvm::report_fatal_error("Helper call was expected"); 3291 } break; 3292 case IceType_v8i16: 3293 case IceType_v16i8: 3294 case IceType_v4i32: 3295 case IceType_v4f32: { 3296 if (Src0->getType() == IceType_i32) { 3297 // Bitcast requires equal type sizes, which isn't strictly the case 3298 // between scalars and vectors, but to emulate v4i8 vectors one has to 3299 // use v16i8 vectors. 3300 assert(getFlags().getApplicationBinaryInterface() != ABI_PNaCl && 3301 "PNaCl only supports real 128-bit vectors"); 3302 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 3303 Variable *T = makeReg(DestTy); 3304 _movd(T, Src0RM); 3305 _mov(Dest, T); 3306 } else { 3307 _movp(Dest, legalizeToReg(Src0)); 3308 } 3309 } break; 3310 } 3311 break; 3312 } 3313 } 3314 } 3315 3316 template <typename TraitsType> 3317 void TargetX86Base<TraitsType>::lowerExtractElement( 3318 const InstExtractElement *Instr) { 3319 Operand *SourceVectNotLegalized = Instr->getSrc(0); 3320 auto *ElementIndex = llvm::dyn_cast<ConstantInteger32>(Instr->getSrc(1)); 3321 // Only constant indices are allowed in PNaCl IR. 3322 assert(ElementIndex); 3323 3324 unsigned Index = ElementIndex->getValue(); 3325 Type Ty = SourceVectNotLegalized->getType(); 3326 Type ElementTy = typeElementType(Ty); 3327 Type InVectorElementTy = Traits::getInVectorElementType(Ty); 3328 3329 // TODO(wala): Determine the best lowering sequences for each type. 3330 bool CanUsePextr = Ty == IceType_v8i16 || Ty == IceType_v8i1 || 3331 (InstructionSet >= Traits::SSE4_1 && Ty != IceType_v4f32); 3332 Variable *ExtractedElementR = 3333 makeReg(CanUsePextr ? IceType_i32 : InVectorElementTy); 3334 if (CanUsePextr) { 3335 // Use pextrb, pextrw, or pextrd. The "b" and "w" versions clear the upper 3336 // bits of the destination register, so we represent this by always 3337 // extracting into an i32 register. The _mov into Dest below will do 3338 // truncation as necessary. 3339 Constant *Mask = Ctx->getConstantInt32(Index); 3340 Variable *SourceVectR = legalizeToReg(SourceVectNotLegalized); 3341 _pextr(ExtractedElementR, SourceVectR, Mask); 3342 } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) { 3343 // Use pshufd and movd/movss. 3344 Variable *T = nullptr; 3345 if (Index) { 3346 // The shuffle only needs to occur if the element to be extracted is not 3347 // at the lowest index. 3348 Constant *Mask = Ctx->getConstantInt32(Index); 3349 T = makeReg(Ty); 3350 _pshufd(T, legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem), Mask); 3351 } else { 3352 T = legalizeToReg(SourceVectNotLegalized); 3353 } 3354 3355 if (InVectorElementTy == IceType_i32) { 3356 _movd(ExtractedElementR, T); 3357 } else { // Ty == IceType_f32 3358 // TODO(wala): _movss is only used here because _mov does not allow a 3359 // vector source and a scalar destination. _mov should be able to be 3360 // used here. 
3361 // _movss is a binary instruction, so the FakeDef is needed to keep the 3362 // live range analysis consistent. 3363 Context.insert<InstFakeDef>(ExtractedElementR); 3364 _movss(ExtractedElementR, T); 3365 } 3366 } else { 3367 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1); 3368 // Spill the value to a stack slot and do the extraction in memory. 3369 // 3370 // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when support 3371 // for legalizing to mem is implemented. 3372 Variable *Slot = Func->makeVariable(Ty); 3373 Slot->setMustNotHaveReg(); 3374 _movp(Slot, legalizeToReg(SourceVectNotLegalized)); 3375 3376 // Compute the location of the element in memory. 3377 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy); 3378 X86OperandMem *Loc = 3379 getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset); 3380 _mov(ExtractedElementR, Loc); 3381 } 3382 3383 if (ElementTy == IceType_i1) { 3384 // Truncate extracted integers to i1s if necessary. 3385 Variable *T = makeReg(IceType_i1); 3386 InstCast *Cast = 3387 InstCast::create(Func, InstCast::Trunc, T, ExtractedElementR); 3388 lowerCast(Cast); 3389 ExtractedElementR = T; 3390 } 3391 3392 // Copy the element to the destination. 3393 Variable *Dest = Instr->getDest(); 3394 _mov(Dest, ExtractedElementR); 3395 } 3396 3397 template <typename TraitsType> 3398 void TargetX86Base<TraitsType>::lowerFcmp(const InstFcmp *Fcmp) { 3399 Variable *Dest = Fcmp->getDest(); 3400 3401 if (isVectorType(Dest->getType())) { 3402 lowerFcmpVector(Fcmp); 3403 } else { 3404 constexpr Inst *Consumer = nullptr; 3405 lowerFcmpAndConsumer(Fcmp, Consumer); 3406 } 3407 } 3408 3409 template <typename TraitsType> 3410 void TargetX86Base<TraitsType>::lowerFcmpAndConsumer(const InstFcmp *Fcmp, 3411 const Inst *Consumer) { 3412 Operand *Src0 = Fcmp->getSrc(0); 3413 Operand *Src1 = Fcmp->getSrc(1); 3414 Variable *Dest = Fcmp->getDest(); 3415 3416 if (Consumer != nullptr) { 3417 if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) { 3418 if (lowerOptimizeFcmpSelect(Fcmp, Select)) 3419 return; 3420 } 3421 } 3422 3423 if (isVectorType(Dest->getType())) { 3424 lowerFcmp(Fcmp); 3425 if (Consumer != nullptr) 3426 lowerSelectVector(llvm::cast<InstSelect>(Consumer)); 3427 return; 3428 } 3429 3430 // Lowering a = fcmp cond, b, c 3431 // ucomiss b, c /* only if C1 != Br_None */ 3432 // /* but swap b,c order if SwapOperands==true */ 3433 // mov a, <default> 3434 // j<C1> label /* only if C1 != Br_None */ 3435 // j<C2> label /* only if C2 != Br_None */ 3436 // FakeUse(a) /* only if C1 != Br_None */ 3437 // mov a, !<default> /* only if C1 != Br_None */ 3438 // label: /* only if C1 != Br_None */ 3439 // 3440 // setcc lowering when C1 != Br_None && C2 == Br_None: 3441 // ucomiss b, c /* but swap b,c order if SwapOperands==true */ 3442 // setcc a, C1 3443 InstFcmp::FCond Condition = Fcmp->getCondition(); 3444 assert(static_cast<size_t>(Condition) < Traits::TableFcmpSize); 3445 if (Traits::TableFcmp[Condition].SwapScalarOperands) 3446 std::swap(Src0, Src1); 3447 const bool HasC1 = (Traits::TableFcmp[Condition].C1 != Traits::Cond::Br_None); 3448 const bool HasC2 = (Traits::TableFcmp[Condition].C2 != Traits::Cond::Br_None); 3449 if (HasC1) { 3450 Src0 = legalize(Src0); 3451 Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); 3452 Variable *T = nullptr; 3453 _mov(T, Src0); 3454 _ucomiss(T, Src1RM); 3455 if (!HasC2) { 3456 assert(Traits::TableFcmp[Condition].Default); 3457 setccOrConsumer(Traits::TableFcmp[Condition].C1, Dest, Consumer); 3458 return; 3459 } 
3460 } 3461 int32_t IntDefault = Traits::TableFcmp[Condition].Default; 3462 if (Consumer == nullptr) { 3463 Constant *Default = Ctx->getConstantInt(Dest->getType(), IntDefault); 3464 _mov(Dest, Default); 3465 if (HasC1) { 3466 InstX86Label *Label = InstX86Label::create(Func, this); 3467 _br(Traits::TableFcmp[Condition].C1, Label); 3468 if (HasC2) { 3469 _br(Traits::TableFcmp[Condition].C2, Label); 3470 } 3471 Constant *NonDefault = Ctx->getConstantInt(Dest->getType(), !IntDefault); 3472 _redefined(_mov(Dest, NonDefault)); 3473 Context.insert(Label); 3474 } 3475 return; 3476 } 3477 if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) { 3478 CfgNode *TrueSucc = Br->getTargetTrue(); 3479 CfgNode *FalseSucc = Br->getTargetFalse(); 3480 if (IntDefault != 0) 3481 std::swap(TrueSucc, FalseSucc); 3482 if (HasC1) { 3483 _br(Traits::TableFcmp[Condition].C1, FalseSucc); 3484 if (HasC2) { 3485 _br(Traits::TableFcmp[Condition].C2, FalseSucc); 3486 } 3487 _br(TrueSucc); 3488 return; 3489 } 3490 _br(FalseSucc); 3491 return; 3492 } 3493 if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) { 3494 Operand *SrcT = Select->getTrueOperand(); 3495 Operand *SrcF = Select->getFalseOperand(); 3496 Variable *SelectDest = Select->getDest(); 3497 if (IntDefault != 0) 3498 std::swap(SrcT, SrcF); 3499 lowerMove(SelectDest, SrcF, false); 3500 if (HasC1) { 3501 InstX86Label *Label = InstX86Label::create(Func, this); 3502 _br(Traits::TableFcmp[Condition].C1, Label); 3503 if (HasC2) { 3504 _br(Traits::TableFcmp[Condition].C2, Label); 3505 } 3506 static constexpr bool IsRedefinition = true; 3507 lowerMove(SelectDest, SrcT, IsRedefinition); 3508 Context.insert(Label); 3509 } 3510 return; 3511 } 3512 llvm::report_fatal_error("Unexpected consumer type"); 3513 } 3514 3515 template <typename TraitsType> 3516 void TargetX86Base<TraitsType>::lowerFcmpVector(const InstFcmp *Fcmp) { 3517 Operand *Src0 = Fcmp->getSrc(0); 3518 Operand *Src1 = Fcmp->getSrc(1); 3519 Variable *Dest = Fcmp->getDest(); 3520 3521 if (!isVectorType(Dest->getType())) 3522 llvm::report_fatal_error("Expected vector compare"); 3523 3524 InstFcmp::FCond Condition = Fcmp->getCondition(); 3525 assert(static_cast<size_t>(Condition) < Traits::TableFcmpSize); 3526 3527 if (Traits::TableFcmp[Condition].SwapVectorOperands) 3528 std::swap(Src0, Src1); 3529 3530 Variable *T = nullptr; 3531 3532 if (Condition == InstFcmp::True) { 3533 // makeVectorOfOnes() requires an integer vector type. 3534 T = makeVectorOfMinusOnes(IceType_v4i32); 3535 } else if (Condition == InstFcmp::False) { 3536 T = makeVectorOfZeros(Dest->getType()); 3537 } else { 3538 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 3539 Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); 3540 if (llvm::isa<X86OperandMem>(Src1RM)) 3541 Src1RM = legalizeToReg(Src1RM); 3542 3543 switch (Condition) { 3544 default: { 3545 const CmppsCond Predicate = Traits::TableFcmp[Condition].Predicate; 3546 assert(Predicate != Traits::Cond::Cmpps_Invalid); 3547 T = makeReg(Src0RM->getType()); 3548 _movp(T, Src0RM); 3549 _cmpps(T, Src1RM, Predicate); 3550 } break; 3551 case InstFcmp::One: { 3552 // Check both unequal and ordered. 3553 T = makeReg(Src0RM->getType()); 3554 Variable *T2 = makeReg(Src0RM->getType()); 3555 _movp(T, Src0RM); 3556 _cmpps(T, Src1RM, Traits::Cond::Cmpps_neq); 3557 _movp(T2, Src0RM); 3558 _cmpps(T2, Src1RM, Traits::Cond::Cmpps_ord); 3559 _pand(T, T2); 3560 } break; 3561 case InstFcmp::Ueq: { 3562 // Check both equal or unordered. 
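// (Sketch of the approach: pre-AVX cmpps has no single "unordered or equal"
// predicate, so Ueq is assembled from cmpeq and cmpunord combined with por,
// mirroring how One above combines cmpneq and cmpord with pand.)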
3563 T = makeReg(Src0RM->getType()); 3564 Variable *T2 = makeReg(Src0RM->getType()); 3565 _movp(T, Src0RM); 3566 _cmpps(T, Src1RM, Traits::Cond::Cmpps_eq); 3567 _movp(T2, Src0RM); 3568 _cmpps(T2, Src1RM, Traits::Cond::Cmpps_unord); 3569 _por(T, T2); 3570 } break; 3571 } 3572 } 3573 3574 assert(T != nullptr); 3575 _movp(Dest, T); 3576 eliminateNextVectorSextInstruction(Dest); 3577 } 3578 3579 inline bool isZero(const Operand *Opnd) { 3580 if (auto *C64 = llvm::dyn_cast<ConstantInteger64>(Opnd)) 3581 return C64->getValue() == 0; 3582 if (auto *C32 = llvm::dyn_cast<ConstantInteger32>(Opnd)) 3583 return C32->getValue() == 0; 3584 return false; 3585 } 3586 3587 template <typename TraitsType> 3588 void TargetX86Base<TraitsType>::lowerIcmpAndConsumer(const InstIcmp *Icmp, 3589 const Inst *Consumer) { 3590 Operand *Src0 = legalize(Icmp->getSrc(0)); 3591 Operand *Src1 = legalize(Icmp->getSrc(1)); 3592 Variable *Dest = Icmp->getDest(); 3593 3594 if (isVectorType(Dest->getType())) { 3595 lowerIcmp(Icmp); 3596 if (Consumer != nullptr) 3597 lowerSelectVector(llvm::cast<InstSelect>(Consumer)); 3598 return; 3599 } 3600 3601 if (!Traits::Is64Bit && Src0->getType() == IceType_i64) { 3602 lowerIcmp64(Icmp, Consumer); 3603 return; 3604 } 3605 3606 // cmp b, c 3607 if (isZero(Src1)) { 3608 switch (Icmp->getCondition()) { 3609 default: 3610 break; 3611 case InstIcmp::Uge: 3612 movOrConsumer(true, Dest, Consumer); 3613 return; 3614 case InstIcmp::Ult: 3615 movOrConsumer(false, Dest, Consumer); 3616 return; 3617 } 3618 } 3619 Operand *Src0RM = legalizeSrc0ForCmp(Src0, Src1); 3620 _cmp(Src0RM, Src1); 3621 setccOrConsumer(Traits::getIcmp32Mapping(Icmp->getCondition()), Dest, 3622 Consumer); 3623 } 3624 3625 template <typename TraitsType> 3626 void TargetX86Base<TraitsType>::lowerIcmpVector(const InstIcmp *Icmp) { 3627 Operand *Src0 = legalize(Icmp->getSrc(0)); 3628 Operand *Src1 = legalize(Icmp->getSrc(1)); 3629 Variable *Dest = Icmp->getDest(); 3630 3631 if (!isVectorType(Dest->getType())) 3632 llvm::report_fatal_error("Expected a vector compare"); 3633 3634 Type Ty = Src0->getType(); 3635 // Promote i1 vectors to 128 bit integer vector types. 3636 if (typeElementType(Ty) == IceType_i1) { 3637 Type NewTy = IceType_NUM; 3638 switch (Ty) { 3639 default: 3640 llvm::report_fatal_error("unexpected type"); 3641 break; 3642 case IceType_v4i1: 3643 NewTy = IceType_v4i32; 3644 break; 3645 case IceType_v8i1: 3646 NewTy = IceType_v8i16; 3647 break; 3648 case IceType_v16i1: 3649 NewTy = IceType_v16i8; 3650 break; 3651 } 3652 Variable *NewSrc0 = Func->makeVariable(NewTy); 3653 Variable *NewSrc1 = Func->makeVariable(NewTy); 3654 lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc0, Src0)); 3655 lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc1, Src1)); 3656 Src0 = NewSrc0; 3657 Src1 = NewSrc1; 3658 Ty = NewTy; 3659 } 3660 3661 InstIcmp::ICond Condition = Icmp->getCondition(); 3662 3663 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 3664 Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); 3665 3666 // SSE2 only has signed comparison operations. Transform unsigned inputs in 3667 // a manner that allows for the use of signed comparison operations by 3668 // flipping the high order bits. 
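// Illustrative sketch (using 8-bit lanes): xor-ing each lane with its sign
// bit maps unsigned 0x00..0xFF onto signed -128..127 while preserving order,
// e.g. unsigned 0xFF > 0x01 becomes signed 0x7F > 0x81 (127 > -127), so the
// signed pcmpgt below gives the unsigned result.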
3669 if (Condition == InstIcmp::Ugt || Condition == InstIcmp::Uge || 3670 Condition == InstIcmp::Ult || Condition == InstIcmp::Ule) { 3671 Variable *T0 = makeReg(Ty); 3672 Variable *T1 = makeReg(Ty); 3673 Variable *HighOrderBits = makeVectorOfHighOrderBits(Ty); 3674 _movp(T0, Src0RM); 3675 _pxor(T0, HighOrderBits); 3676 _movp(T1, Src1RM); 3677 _pxor(T1, HighOrderBits); 3678 Src0RM = T0; 3679 Src1RM = T1; 3680 } 3681 3682 Variable *T = makeReg(Ty); 3683 switch (Condition) { 3684 default: 3685 llvm_unreachable("unexpected condition"); 3686 break; 3687 case InstIcmp::Eq: { 3688 if (llvm::isa<X86OperandMem>(Src1RM)) 3689 Src1RM = legalizeToReg(Src1RM); 3690 _movp(T, Src0RM); 3691 _pcmpeq(T, Src1RM); 3692 } break; 3693 case InstIcmp::Ne: { 3694 if (llvm::isa<X86OperandMem>(Src1RM)) 3695 Src1RM = legalizeToReg(Src1RM); 3696 _movp(T, Src0RM); 3697 _pcmpeq(T, Src1RM); 3698 Variable *MinusOne = makeVectorOfMinusOnes(Ty); 3699 _pxor(T, MinusOne); 3700 } break; 3701 case InstIcmp::Ugt: 3702 case InstIcmp::Sgt: { 3703 if (llvm::isa<X86OperandMem>(Src1RM)) 3704 Src1RM = legalizeToReg(Src1RM); 3705 _movp(T, Src0RM); 3706 _pcmpgt(T, Src1RM); 3707 } break; 3708 case InstIcmp::Uge: 3709 case InstIcmp::Sge: { 3710 // !(Src1RM > Src0RM) 3711 if (llvm::isa<X86OperandMem>(Src0RM)) 3712 Src0RM = legalizeToReg(Src0RM); 3713 _movp(T, Src1RM); 3714 _pcmpgt(T, Src0RM); 3715 Variable *MinusOne = makeVectorOfMinusOnes(Ty); 3716 _pxor(T, MinusOne); 3717 } break; 3718 case InstIcmp::Ult: 3719 case InstIcmp::Slt: { 3720 if (llvm::isa<X86OperandMem>(Src0RM)) 3721 Src0RM = legalizeToReg(Src0RM); 3722 _movp(T, Src1RM); 3723 _pcmpgt(T, Src0RM); 3724 } break; 3725 case InstIcmp::Ule: 3726 case InstIcmp::Sle: { 3727 // !(Src0RM > Src1RM) 3728 if (llvm::isa<X86OperandMem>(Src1RM)) 3729 Src1RM = legalizeToReg(Src1RM); 3730 _movp(T, Src0RM); 3731 _pcmpgt(T, Src1RM); 3732 Variable *MinusOne = makeVectorOfMinusOnes(Ty); 3733 _pxor(T, MinusOne); 3734 } break; 3735 } 3736 3737 _movp(Dest, T); 3738 eliminateNextVectorSextInstruction(Dest); 3739 } 3740 3741 template <typename TraitsType> 3742 template <typename T> 3743 typename std::enable_if<!T::Is64Bit, void>::type 3744 TargetX86Base<TraitsType>::lowerIcmp64(const InstIcmp *Icmp, 3745 const Inst *Consumer) { 3746 // a=icmp cond, b, c ==> cmp b,c; a=1; br cond,L1; FakeUse(a); a=0; L1: 3747 Operand *Src0 = legalize(Icmp->getSrc(0)); 3748 Operand *Src1 = legalize(Icmp->getSrc(1)); 3749 Variable *Dest = Icmp->getDest(); 3750 InstIcmp::ICond Condition = Icmp->getCondition(); 3751 assert(static_cast<size_t>(Condition) < Traits::TableIcmp64Size); 3752 Operand *Src0LoRM = nullptr; 3753 Operand *Src0HiRM = nullptr; 3754 // Legalize the portions of Src0 that are going to be needed. 3755 if (isZero(Src1)) { 3756 switch (Condition) { 3757 default: 3758 llvm_unreachable("unexpected condition"); 3759 break; 3760 // These two are not optimized, so we fall through to the general case, 3761 // which needs the upper and lower halves legalized. 3762 case InstIcmp::Sgt: 3763 case InstIcmp::Sle: 3764 // These four compare after performing an "or" of the high and low half, so 3765 // they need the upper and lower halves legalized. 3766 case InstIcmp::Eq: 3767 case InstIcmp::Ule: 3768 case InstIcmp::Ne: 3769 case InstIcmp::Ugt: 3770 Src0LoRM = legalize(loOperand(Src0), Legal_Reg | Legal_Mem); 3771 // These two test only the high half's sign bit, so they need only 3772 // the upper half legalized. 
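// (For a two's-complement i64 compared against zero, "< 0" and ">= 0" depend
// only on the sign bit of the high word, hence only Src0HiRM is needed.)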
3773 case InstIcmp::Sge: 3774 case InstIcmp::Slt: 3775 Src0HiRM = legalize(hiOperand(Src0), Legal_Reg | Legal_Mem); 3776 break; 3777 3778 // These two move constants and hence need no legalization. 3779 case InstIcmp::Uge: 3780 case InstIcmp::Ult: 3781 break; 3782 } 3783 } else { 3784 Src0LoRM = legalize(loOperand(Src0), Legal_Reg | Legal_Mem); 3785 Src0HiRM = legalize(hiOperand(Src0), Legal_Reg | Legal_Mem); 3786 } 3787 // Optimize comparisons with zero. 3788 if (isZero(Src1)) { 3789 Constant *SignMask = Ctx->getConstantInt32(0x80000000); 3790 Variable *Temp = nullptr; 3791 switch (Condition) { 3792 default: 3793 llvm_unreachable("unexpected condition"); 3794 break; 3795 case InstIcmp::Eq: 3796 case InstIcmp::Ule: 3797 // Mov Src0HiRM first, because it was legalized most recently, and will 3798 // sometimes avoid a move before the OR. 3799 _mov(Temp, Src0HiRM); 3800 _or(Temp, Src0LoRM); 3801 Context.insert<InstFakeUse>(Temp); 3802 setccOrConsumer(Traits::Cond::Br_e, Dest, Consumer); 3803 return; 3804 case InstIcmp::Ne: 3805 case InstIcmp::Ugt: 3806 // Mov Src0HiRM first, because it was legalized most recently, and will 3807 // sometimes avoid a move before the OR. 3808 _mov(Temp, Src0HiRM); 3809 _or(Temp, Src0LoRM); 3810 Context.insert<InstFakeUse>(Temp); 3811 setccOrConsumer(Traits::Cond::Br_ne, Dest, Consumer); 3812 return; 3813 case InstIcmp::Uge: 3814 movOrConsumer(true, Dest, Consumer); 3815 return; 3816 case InstIcmp::Ult: 3817 movOrConsumer(false, Dest, Consumer); 3818 return; 3819 case InstIcmp::Sgt: 3820 break; 3821 case InstIcmp::Sge: 3822 _test(Src0HiRM, SignMask); 3823 setccOrConsumer(Traits::Cond::Br_e, Dest, Consumer); 3824 return; 3825 case InstIcmp::Slt: 3826 _test(Src0HiRM, SignMask); 3827 setccOrConsumer(Traits::Cond::Br_ne, Dest, Consumer); 3828 return; 3829 case InstIcmp::Sle: 3830 break; 3831 } 3832 } 3833 // Handle general compares. 
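// A rough sketch of the sequence this produces when there is no consumer to
// fold into, e.g. for a = icmp ult i64 b, c (the condition codes below are
// illustrative; the real ones come from Traits::TableIcmp64):
//   mov a, 1
//   cmp b_hi, c_hi
//   jb  LabelTrue        ; C1
//   ja  LabelFalse       ; C2
//   cmp b_lo, c_lo
//   jb  LabelTrue        ; C3
// LabelFalse:
//   mov a, 0
// LabelTrue: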
3834 Operand *Src1LoRI = legalize(loOperand(Src1), Legal_Reg | Legal_Imm); 3835 Operand *Src1HiRI = legalize(hiOperand(Src1), Legal_Reg | Legal_Imm); 3836 if (Consumer == nullptr) { 3837 Constant *Zero = Ctx->getConstantInt(Dest->getType(), 0); 3838 Constant *One = Ctx->getConstantInt(Dest->getType(), 1); 3839 InstX86Label *LabelFalse = InstX86Label::create(Func, this); 3840 InstX86Label *LabelTrue = InstX86Label::create(Func, this); 3841 _mov(Dest, One); 3842 _cmp(Src0HiRM, Src1HiRI); 3843 if (Traits::TableIcmp64[Condition].C1 != Traits::Cond::Br_None) 3844 _br(Traits::TableIcmp64[Condition].C1, LabelTrue); 3845 if (Traits::TableIcmp64[Condition].C2 != Traits::Cond::Br_None) 3846 _br(Traits::TableIcmp64[Condition].C2, LabelFalse); 3847 _cmp(Src0LoRM, Src1LoRI); 3848 _br(Traits::TableIcmp64[Condition].C3, LabelTrue); 3849 Context.insert(LabelFalse); 3850 _redefined(_mov(Dest, Zero)); 3851 Context.insert(LabelTrue); 3852 return; 3853 } 3854 if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) { 3855 _cmp(Src0HiRM, Src1HiRI); 3856 if (Traits::TableIcmp64[Condition].C1 != Traits::Cond::Br_None) 3857 _br(Traits::TableIcmp64[Condition].C1, Br->getTargetTrue()); 3858 if (Traits::TableIcmp64[Condition].C2 != Traits::Cond::Br_None) 3859 _br(Traits::TableIcmp64[Condition].C2, Br->getTargetFalse()); 3860 _cmp(Src0LoRM, Src1LoRI); 3861 _br(Traits::TableIcmp64[Condition].C3, Br->getTargetTrue(), 3862 Br->getTargetFalse()); 3863 return; 3864 } 3865 if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) { 3866 Operand *SrcT = Select->getTrueOperand(); 3867 Operand *SrcF = Select->getFalseOperand(); 3868 Variable *SelectDest = Select->getDest(); 3869 InstX86Label *LabelFalse = InstX86Label::create(Func, this); 3870 InstX86Label *LabelTrue = InstX86Label::create(Func, this); 3871 lowerMove(SelectDest, SrcT, false); 3872 _cmp(Src0HiRM, Src1HiRI); 3873 if (Traits::TableIcmp64[Condition].C1 != Traits::Cond::Br_None) 3874 _br(Traits::TableIcmp64[Condition].C1, LabelTrue); 3875 if (Traits::TableIcmp64[Condition].C2 != Traits::Cond::Br_None) 3876 _br(Traits::TableIcmp64[Condition].C2, LabelFalse); 3877 _cmp(Src0LoRM, Src1LoRI); 3878 _br(Traits::TableIcmp64[Condition].C3, LabelTrue); 3879 Context.insert(LabelFalse); 3880 static constexpr bool IsRedefinition = true; 3881 lowerMove(SelectDest, SrcF, IsRedefinition); 3882 Context.insert(LabelTrue); 3883 return; 3884 } 3885 llvm::report_fatal_error("Unexpected consumer type"); 3886 } 3887 3888 template <typename TraitsType> 3889 void TargetX86Base<TraitsType>::setccOrConsumer(BrCond Condition, 3890 Variable *Dest, 3891 const Inst *Consumer) { 3892 if (Consumer == nullptr) { 3893 _setcc(Dest, Condition); 3894 return; 3895 } 3896 if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) { 3897 _br(Condition, Br->getTargetTrue(), Br->getTargetFalse()); 3898 return; 3899 } 3900 if (const auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) { 3901 Operand *SrcT = Select->getTrueOperand(); 3902 Operand *SrcF = Select->getFalseOperand(); 3903 Variable *SelectDest = Select->getDest(); 3904 lowerSelectMove(SelectDest, Condition, SrcT, SrcF); 3905 return; 3906 } 3907 llvm::report_fatal_error("Unexpected consumer type"); 3908 } 3909 3910 template <typename TraitsType> 3911 void TargetX86Base<TraitsType>::movOrConsumer(bool IcmpResult, Variable *Dest, 3912 const Inst *Consumer) { 3913 if (Consumer == nullptr) { 3914 _mov(Dest, Ctx->getConstantInt(Dest->getType(), (IcmpResult ? 
1 : 0))); 3915 return; 3916 } 3917 if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) { 3918 // TODO(sehr,stichnot): This could be done with a single unconditional 3919 // branch instruction, but subzero doesn't know how to handle the resulting 3920 // control flow graph changes now. Make it do so to eliminate mov and cmp. 3921 _mov(Dest, Ctx->getConstantInt(Dest->getType(), (IcmpResult ? 1 : 0))); 3922 _cmp(Dest, Ctx->getConstantInt(Dest->getType(), 0)); 3923 _br(Traits::Cond::Br_ne, Br->getTargetTrue(), Br->getTargetFalse()); 3924 return; 3925 } 3926 if (const auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) { 3927 Operand *Src = nullptr; 3928 if (IcmpResult) { 3929 Src = legalize(Select->getTrueOperand(), Legal_Reg | Legal_Imm); 3930 } else { 3931 Src = legalize(Select->getFalseOperand(), Legal_Reg | Legal_Imm); 3932 } 3933 Variable *SelectDest = Select->getDest(); 3934 lowerMove(SelectDest, Src, false); 3935 return; 3936 } 3937 llvm::report_fatal_error("Unexpected consumer type"); 3938 } 3939 3940 template <typename TraitsType> 3941 void TargetX86Base<TraitsType>::lowerArithAndConsumer( 3942 const InstArithmetic *Arith, const Inst *Consumer) { 3943 Variable *T = nullptr; 3944 Operand *Src0 = legalize(Arith->getSrc(0)); 3945 Operand *Src1 = legalize(Arith->getSrc(1)); 3946 Variable *Dest = Arith->getDest(); 3947 switch (Arith->getOp()) { 3948 default: 3949 llvm_unreachable("arithmetic operator not AND or OR"); 3950 break; 3951 case InstArithmetic::And: 3952 _mov(T, Src0); 3953 // Test cannot have an address in the second position. Since T is 3954 // guaranteed to be a register and Src1 could be a memory load, ensure 3955 // that the second argument is a register. 3956 if (llvm::isa<Constant>(Src1)) 3957 _test(T, Src1); 3958 else 3959 _test(Src1, T); 3960 break; 3961 case InstArithmetic::Or: 3962 _mov(T, Src0); 3963 _or(T, Src1); 3964 break; 3965 } 3966 3967 if (Consumer == nullptr) { 3968 llvm::report_fatal_error("Expected a consumer instruction"); 3969 } 3970 if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) { 3971 Context.insert<InstFakeUse>(T); 3972 Context.insert<InstFakeDef>(Dest); 3973 _br(Traits::Cond::Br_ne, Br->getTargetTrue(), Br->getTargetFalse()); 3974 return; 3975 } 3976 llvm::report_fatal_error("Unexpected consumer type"); 3977 } 3978 3979 template <typename TraitsType> 3980 void TargetX86Base<TraitsType>::lowerInsertElement( 3981 const InstInsertElement *Instr) { 3982 Operand *SourceVectNotLegalized = Instr->getSrc(0); 3983 Operand *ElementToInsertNotLegalized = Instr->getSrc(1); 3984 auto *ElementIndex = llvm::dyn_cast<ConstantInteger32>(Instr->getSrc(2)); 3985 // Only constant indices are allowed in PNaCl IR. 3986 assert(ElementIndex); 3987 unsigned Index = ElementIndex->getValue(); 3988 assert(Index < typeNumElements(SourceVectNotLegalized->getType())); 3989 3990 Type Ty = SourceVectNotLegalized->getType(); 3991 Type ElementTy = typeElementType(Ty); 3992 Type InVectorElementTy = Traits::getInVectorElementType(Ty); 3993 3994 if (ElementTy == IceType_i1) { 3995 // Expand the element to the appropriate size for it to be inserted in the 3996 // vector. 3997 Variable *Expanded = Func->makeVariable(InVectorElementTy); 3998 auto *Cast = InstCast::create(Func, InstCast::Zext, Expanded, 3999 ElementToInsertNotLegalized); 4000 lowerCast(Cast); 4001 ElementToInsertNotLegalized = Expanded; 4002 } 4003 4004 if (Ty == IceType_v8i16 || Ty == IceType_v8i1 || 4005 InstructionSet >= Traits::SSE4_1) { 4006 // Use insertps, pinsrb, pinsrw, or pinsrd. 
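// Note on the immediates used below: pinsrb/pinsrw/pinsrd take the element
// index directly, while insertps encodes the destination lane in bits [5:4]
// of its immediate (hence "Index << 4" for v4f32); bits [7:6] select the
// source lane and bits [3:0] are a zero mask, both left as 0 here.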
4007 Operand *ElementRM = 4008 legalize(ElementToInsertNotLegalized, Legal_Reg | Legal_Mem); 4009 Operand *SourceVectRM = 4010 legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem); 4011 Variable *T = makeReg(Ty); 4012 _movp(T, SourceVectRM); 4013 if (Ty == IceType_v4f32) { 4014 _insertps(T, ElementRM, Ctx->getConstantInt32(Index << 4)); 4015 } else { 4016 // For the pinsrb and pinsrw instructions, when the source operand is a 4017 // register, it must be a full r32 register like eax, and not ax/al/ah. 4018 // For filetype=asm, InstX86Pinsr<TraitsType>::emit() compensates for 4019 // the use 4020 // of r16 and r8 by converting them through getBaseReg(), while emitIAS() 4021 // validates that the original and base register encodings are the same. 4022 if (ElementRM->getType() == IceType_i8 && 4023 llvm::isa<Variable>(ElementRM)) { 4024 // Don't use ah/bh/ch/dh for pinsrb. 4025 ElementRM = copyToReg8(ElementRM); 4026 } 4027 _pinsr(T, ElementRM, Ctx->getConstantInt32(Index)); 4028 } 4029 _movp(Instr->getDest(), T); 4030 } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) { 4031 // Use shufps or movss. 4032 Variable *ElementR = nullptr; 4033 Operand *SourceVectRM = 4034 legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem); 4035 4036 if (InVectorElementTy == IceType_f32) { 4037 // ElementR will be in an XMM register since it is floating point. 4038 ElementR = legalizeToReg(ElementToInsertNotLegalized); 4039 } else { 4040 // Copy an integer to an XMM register. 4041 Operand *T = legalize(ElementToInsertNotLegalized, Legal_Reg | Legal_Mem); 4042 ElementR = makeReg(Ty); 4043 _movd(ElementR, T); 4044 } 4045 4046 if (Index == 0) { 4047 Variable *T = makeReg(Ty); 4048 _movp(T, SourceVectRM); 4049 _movss(T, ElementR); 4050 _movp(Instr->getDest(), T); 4051 return; 4052 } 4053 4054 // shufps treats the source and destination operands as vectors of four 4055 // doublewords. The destination's two high doublewords are selected from 4056 // the source operand and the two low doublewords are selected from the 4057 // (original value of) the destination operand. An insertelement operation 4058 // can be effected with a sequence of two shufps operations with 4059 // appropriate masks. In all cases below, Element[0] is being inserted into 4060 // SourceVectOperand. Indices are ordered from left to right. 
4061 //
4062 // insertelement into index 1 (result is stored in ElementR):
4063 // ElementR := ElementR[0, 0] SourceVectRM[0, 0]
4064 // ElementR := ElementR[3, 0] SourceVectRM[2, 3]
4065 //
4066 // insertelement into index 2 (result is stored in T):
4067 // T := SourceVectRM
4068 // ElementR := ElementR[0, 0] T[0, 3]
4069 // T := T[0, 1] ElementR[0, 3]
4070 //
4071 // insertelement into index 3 (result is stored in T):
4072 // T := SourceVectRM
4073 // ElementR := ElementR[0, 0] T[0, 2]
4074 // T := T[0, 1] ElementR[3, 0]
4075 const unsigned char Mask1[3] = {0, 192, 128};
4076 const unsigned char Mask2[3] = {227, 196, 52};
4077
4078 Constant *Mask1Constant = Ctx->getConstantInt32(Mask1[Index - 1]);
4079 Constant *Mask2Constant = Ctx->getConstantInt32(Mask2[Index - 1]);
4080
4081 if (Index == 1) {
4082 _shufps(ElementR, SourceVectRM, Mask1Constant);
4083 _shufps(ElementR, SourceVectRM, Mask2Constant);
4084 _movp(Instr->getDest(), ElementR);
4085 } else {
4086 Variable *T = makeReg(Ty);
4087 _movp(T, SourceVectRM);
4088 _shufps(ElementR, T, Mask1Constant);
4089 _shufps(T, ElementR, Mask2Constant);
4090 _movp(Instr->getDest(), T);
4091 }
4092 } else {
4093 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
4094 // Spill the value to a stack slot and perform the insertion in memory.
4095 //
4096 // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when support
4097 // for legalizing to mem is implemented.
4098 Variable *Slot = Func->makeVariable(Ty);
4099 Slot->setMustNotHaveReg();
4100 _movp(Slot, legalizeToReg(SourceVectNotLegalized));
4101
4102 // Compute the location of the position to insert in memory.
4103 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);
4104 X86OperandMem *Loc =
4105 getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset);
4106 _store(legalizeToReg(ElementToInsertNotLegalized), Loc);
4107
4108 Variable *T = makeReg(Ty);
4109 _movp(T, Slot);
4110 _movp(Instr->getDest(), T);
4111 }
4112 }
4113
4114 template <typename TraitsType>
4115 void TargetX86Base<TraitsType>::lowerIntrinsic(const InstIntrinsic *Instr) {
4116 switch (Intrinsics::IntrinsicID ID = Instr->getIntrinsicID()) {
4117 case Intrinsics::AtomicCmpxchg: {
4118 if (!Intrinsics::isMemoryOrderValid(
4119 ID, getConstantMemoryOrder(Instr->getArg(3)),
4120 getConstantMemoryOrder(Instr->getArg(4)))) {
4121 Func->setError("Unexpected memory ordering for AtomicCmpxchg");
4122 return;
4123 }
4124 Variable *DestPrev = Instr->getDest();
4125 Operand *PtrToMem = legalize(Instr->getArg(0));
4126 Operand *Expected = legalize(Instr->getArg(1));
4127 Operand *Desired = legalize(Instr->getArg(2));
4128 if (tryOptimizedCmpxchgCmpBr(DestPrev, PtrToMem, Expected, Desired))
4129 return;
4130 lowerAtomicCmpxchg(DestPrev, PtrToMem, Expected, Desired);
4131 return;
4132 }
4133 case Intrinsics::AtomicFence:
4134 if (!Intrinsics::isMemoryOrderValid(
4135 ID, getConstantMemoryOrder(Instr->getArg(0)))) {
4136 Func->setError("Unexpected memory ordering for AtomicFence");
4137 return;
4138 }
4139 _mfence();
4140 return;
4141 case Intrinsics::AtomicFenceAll:
4142 // NOTE: FenceAll should prevent any load/store from being moved across the
4143 // fence (both atomic and non-atomic). The InstX8632Mfence instruction is
4144 // currently marked coarsely as "HasSideEffects".
4145 _mfence();
4146 return;
4147 case Intrinsics::AtomicIsLockFree: {
4148 // X86 is always lock free for 8/16/32/64 bit accesses.
4149 // TODO(jvoung): Since the result is constant when given a constant byte 4150 // size, this opens up DCE opportunities. 4151 Operand *ByteSize = Instr->getArg(0); 4152 Variable *Dest = Instr->getDest(); 4153 if (auto *CI = llvm::dyn_cast<ConstantInteger32>(ByteSize)) { 4154 Constant *Result; 4155 switch (CI->getValue()) { 4156 default: 4157 // Some x86-64 processors support the cmpxchg16b instruction, which can 4158 // make 16-byte operations lock free (when used with the LOCK prefix). 4159 // However, that's not supported in 32-bit mode, so just return 0 even 4160 // for large sizes. 4161 Result = Ctx->getConstantZero(IceType_i32); 4162 break; 4163 case 1: 4164 case 2: 4165 case 4: 4166 case 8: 4167 Result = Ctx->getConstantInt32(1); 4168 break; 4169 } 4170 _mov(Dest, Result); 4171 return; 4172 } 4173 // The PNaCl ABI requires the byte size to be a compile-time constant. 4174 Func->setError("AtomicIsLockFree byte size should be compile-time const"); 4175 return; 4176 } 4177 case Intrinsics::AtomicLoad: { 4178 // We require the memory address to be naturally aligned. Given that is the 4179 // case, then normal loads are atomic. 4180 if (!Intrinsics::isMemoryOrderValid( 4181 ID, getConstantMemoryOrder(Instr->getArg(1)))) { 4182 Func->setError("Unexpected memory ordering for AtomicLoad"); 4183 return; 4184 } 4185 Variable *Dest = Instr->getDest(); 4186 if (!Traits::Is64Bit) { 4187 if (auto *Dest64On32 = llvm::dyn_cast<Variable64On32>(Dest)) { 4188 // Follow what GCC does and use a movq instead of what lowerLoad() 4189 // normally does (split the load into two). Thus, this skips 4190 // load/arithmetic op folding. Load/arithmetic folding can't happen 4191 // anyway, since this is x86-32 and integer arithmetic only happens on 4192 // 32-bit quantities. 4193 Variable *T = makeReg(IceType_f64); 4194 X86OperandMem *Addr = formMemoryOperand(Instr->getArg(0), IceType_f64); 4195 _movq(T, Addr); 4196 // Then cast the bits back out of the XMM register to the i64 Dest. 4197 auto *Cast = InstCast::create(Func, InstCast::Bitcast, Dest, T); 4198 lowerCast(Cast); 4199 // Make sure that the atomic load isn't elided when unused. 4200 Context.insert<InstFakeUse>(Dest64On32->getLo()); 4201 Context.insert<InstFakeUse>(Dest64On32->getHi()); 4202 return; 4203 } 4204 } 4205 auto *Load = InstLoad::create(Func, Dest, Instr->getArg(0)); 4206 lowerLoad(Load); 4207 // Make sure the atomic load isn't elided when unused, by adding a FakeUse. 4208 // Since lowerLoad may fuse the load w/ an arithmetic instruction, insert 4209 // the FakeUse on the last-inserted instruction's dest. 4210 Context.insert<InstFakeUse>(Context.getLastInserted()->getDest()); 4211 return; 4212 } 4213 case Intrinsics::AtomicRMW: 4214 if (!Intrinsics::isMemoryOrderValid( 4215 ID, getConstantMemoryOrder(Instr->getArg(3)))) { 4216 Func->setError("Unexpected memory ordering for AtomicRMW"); 4217 return; 4218 } 4219 lowerAtomicRMW( 4220 Instr->getDest(), 4221 static_cast<uint32_t>( 4222 llvm::cast<ConstantInteger32>(Instr->getArg(0))->getValue()), 4223 Instr->getArg(1), Instr->getArg(2)); 4224 return; 4225 case Intrinsics::AtomicStore: { 4226 if (!Intrinsics::isMemoryOrderValid( 4227 ID, getConstantMemoryOrder(Instr->getArg(2)))) { 4228 Func->setError("Unexpected memory ordering for AtomicStore"); 4229 return; 4230 } 4231 // We require the memory address to be naturally aligned. Given that is the 4232 // case, then normal stores are atomic. Add a fence after the store to make 4233 // it visible. 
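// A rough sketch of what gets emitted here: a plain (naturally aligned) store
// of Value to [Ptr], followed by mfence; the i64-on-x86-32 case first moves
// the value into an XMM register so the 8-byte store is a single movq rather
// than two 4-byte stores.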
4234 Operand *Value = Instr->getArg(0); 4235 Operand *Ptr = Instr->getArg(1); 4236 if (!Traits::Is64Bit && Value->getType() == IceType_i64) { 4237 // Use a movq instead of what lowerStore() normally does (split the store 4238 // into two), following what GCC does. Cast the bits from int -> to an 4239 // xmm register first. 4240 Variable *T = makeReg(IceType_f64); 4241 auto *Cast = InstCast::create(Func, InstCast::Bitcast, T, Value); 4242 lowerCast(Cast); 4243 // Then store XMM w/ a movq. 4244 X86OperandMem *Addr = formMemoryOperand(Ptr, IceType_f64); 4245 _storeq(T, Addr); 4246 _mfence(); 4247 return; 4248 } 4249 auto *Store = InstStore::create(Func, Value, Ptr); 4250 lowerStore(Store); 4251 _mfence(); 4252 return; 4253 } 4254 case Intrinsics::Bswap: { 4255 Variable *Dest = Instr->getDest(); 4256 Operand *Val = Instr->getArg(0); 4257 // In 32-bit mode, bswap only works on 32-bit arguments, and the argument 4258 // must be a register. Use rotate left for 16-bit bswap. 4259 if (!Traits::Is64Bit && Val->getType() == IceType_i64) { 4260 Val = legalizeUndef(Val); 4261 Variable *T_Lo = legalizeToReg(loOperand(Val)); 4262 Variable *T_Hi = legalizeToReg(hiOperand(Val)); 4263 auto *DestLo = llvm::cast<Variable>(loOperand(Dest)); 4264 auto *DestHi = llvm::cast<Variable>(hiOperand(Dest)); 4265 _bswap(T_Lo); 4266 _bswap(T_Hi); 4267 _mov(DestLo, T_Hi); 4268 _mov(DestHi, T_Lo); 4269 } else if ((Traits::Is64Bit && Val->getType() == IceType_i64) || 4270 Val->getType() == IceType_i32) { 4271 Variable *T = legalizeToReg(Val); 4272 _bswap(T); 4273 _mov(Dest, T); 4274 } else { 4275 assert(Val->getType() == IceType_i16); 4276 Constant *Eight = Ctx->getConstantInt16(8); 4277 Variable *T = nullptr; 4278 Val = legalize(Val); 4279 _mov(T, Val); 4280 _rol(T, Eight); 4281 _mov(Dest, T); 4282 } 4283 return; 4284 } 4285 case Intrinsics::Ctpop: { 4286 Variable *Dest = Instr->getDest(); 4287 Variable *T = nullptr; 4288 Operand *Val = Instr->getArg(0); 4289 Type ValTy = Val->getType(); 4290 assert(ValTy == IceType_i32 || ValTy == IceType_i64); 4291 4292 if (!Traits::Is64Bit) { 4293 T = Dest; 4294 } else { 4295 T = makeReg(IceType_i64); 4296 if (ValTy == IceType_i32) { 4297 // in x86-64, __popcountsi2 is not defined, so we cheat a bit by 4298 // converting it to a 64-bit value, and using ctpop_i64. _movzx should 4299 // ensure we will not have any bits set on Val's upper 32 bits. 4300 Variable *V = makeReg(IceType_i64); 4301 Operand *ValRM = legalize(Val, Legal_Reg | Legal_Mem); 4302 _movzx(V, ValRM); 4303 Val = V; 4304 } 4305 ValTy = IceType_i64; 4306 } 4307 4308 InstCall *Call = 4309 makeHelperCall(ValTy == IceType_i32 ? RuntimeHelper::H_call_ctpop_i32 4310 : RuntimeHelper::H_call_ctpop_i64, 4311 T, 1); 4312 Call->addArg(Val); 4313 lowerCall(Call); 4314 // The popcount helpers always return 32-bit values, while the intrinsic's 4315 // signature matches the native POPCNT instruction and fills a 64-bit reg 4316 // (in 64-bit mode). Thus, clear the upper bits of the dest just in case 4317 // the user doesn't do that in the IR. If the user does that in the IR, 4318 // then this zero'ing instruction is dead and gets optimized out. 4319 if (!Traits::Is64Bit) { 4320 assert(T == Dest); 4321 if (Val->getType() == IceType_i64) { 4322 auto *DestHi = llvm::cast<Variable>(hiOperand(Dest)); 4323 Constant *Zero = Ctx->getConstantZero(IceType_i32); 4324 _mov(DestHi, Zero); 4325 } 4326 } else { 4327 assert(Val->getType() == IceType_i64); 4328 // T is 64 bit. It needs to be copied to dest. 
We need to: 4329 // 4330 // T_1.32 = trunc T.64 to i32 4331 // T_2.64 = zext T_1.32 to i64 4332 // Dest.<<right_size>> = T_2.<<right_size>> 4333 // 4334 // which ensures the upper 32 bits will always be cleared. Just doing a 4335 // 4336 // mov Dest.32 = trunc T.32 to i32 4337 // 4338 // is dangerous because there's a chance the compiler will optimize this 4339 // copy out. To use _movzx we need two new registers (one 32-, and 4340 // another 64-bit wide.) 4341 Variable *T_1 = makeReg(IceType_i32); 4342 _mov(T_1, T); 4343 Variable *T_2 = makeReg(IceType_i64); 4344 _movzx(T_2, T_1); 4345 _mov(Dest, T_2); 4346 } 4347 return; 4348 } 4349 case Intrinsics::Ctlz: { 4350 // The "is zero undef" parameter is ignored and we always return a 4351 // well-defined value. 4352 Operand *Val = legalize(Instr->getArg(0)); 4353 Operand *FirstVal; 4354 Operand *SecondVal = nullptr; 4355 if (!Traits::Is64Bit && Val->getType() == IceType_i64) { 4356 FirstVal = loOperand(Val); 4357 SecondVal = hiOperand(Val); 4358 } else { 4359 FirstVal = Val; 4360 } 4361 constexpr bool IsCttz = false; 4362 lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal, 4363 SecondVal); 4364 return; 4365 } 4366 case Intrinsics::Cttz: { 4367 // The "is zero undef" parameter is ignored and we always return a 4368 // well-defined value. 4369 Operand *Val = legalize(Instr->getArg(0)); 4370 Operand *FirstVal; 4371 Operand *SecondVal = nullptr; 4372 if (!Traits::Is64Bit && Val->getType() == IceType_i64) { 4373 FirstVal = hiOperand(Val); 4374 SecondVal = loOperand(Val); 4375 } else { 4376 FirstVal = Val; 4377 } 4378 constexpr bool IsCttz = true; 4379 lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal, 4380 SecondVal); 4381 return; 4382 } 4383 case Intrinsics::Fabs: { 4384 Operand *Src = legalize(Instr->getArg(0)); 4385 Type Ty = Src->getType(); 4386 Variable *Dest = Instr->getDest(); 4387 Variable *T = makeVectorOfFabsMask(Ty); 4388 // The pand instruction operates on an m128 memory operand, so if Src is an 4389 // f32 or f64, we need to make sure it's in a register. 
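// (For reference: makeVectorOfFabsMask() produces a mask with every element's
// sign bit cleared and all other bits set, so the pand below clears just the
// sign bit, which is exactly fabs; scalar f32/f64 inputs simply occupy the
// low lane of an XMM register here.)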
4390 if (isVectorType(Ty)) { 4391 if (llvm::isa<X86OperandMem>(Src)) 4392 Src = legalizeToReg(Src); 4393 } else { 4394 Src = legalizeToReg(Src); 4395 } 4396 _pand(T, Src); 4397 if (isVectorType(Ty)) 4398 _movp(Dest, T); 4399 else 4400 _mov(Dest, T); 4401 return; 4402 } 4403 case Intrinsics::Longjmp: { 4404 InstCall *Call = makeHelperCall(RuntimeHelper::H_call_longjmp, nullptr, 2); 4405 Call->addArg(Instr->getArg(0)); 4406 Call->addArg(Instr->getArg(1)); 4407 lowerCall(Call); 4408 return; 4409 } 4410 case Intrinsics::Memcpy: { 4411 lowerMemcpy(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2)); 4412 return; 4413 } 4414 case Intrinsics::Memmove: { 4415 lowerMemmove(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2)); 4416 return; 4417 } 4418 case Intrinsics::Memset: { 4419 lowerMemset(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2)); 4420 return; 4421 } 4422 case Intrinsics::NaClReadTP: { 4423 if (NeedSandboxing) { 4424 Operand *Src = 4425 dispatchToConcrete(&ConcreteTarget::createNaClReadTPSrcOperand); 4426 Variable *Dest = Instr->getDest(); 4427 Variable *T = nullptr; 4428 _mov(T, Src); 4429 _mov(Dest, T); 4430 } else { 4431 InstCall *Call = 4432 makeHelperCall(RuntimeHelper::H_call_read_tp, Instr->getDest(), 0); 4433 lowerCall(Call); 4434 } 4435 return; 4436 } 4437 case Intrinsics::Setjmp: { 4438 InstCall *Call = 4439 makeHelperCall(RuntimeHelper::H_call_setjmp, Instr->getDest(), 1); 4440 Call->addArg(Instr->getArg(0)); 4441 lowerCall(Call); 4442 return; 4443 } 4444 case Intrinsics::Sqrt: { 4445 assert(isScalarFloatingType(Instr->getDest()->getType()) || 4446 getFlags().getApplicationBinaryInterface() != ::Ice::ABI_PNaCl); 4447 Operand *Src = legalize(Instr->getArg(0)); 4448 Variable *Dest = Instr->getDest(); 4449 Variable *T = makeReg(Dest->getType()); 4450 _sqrt(T, Src); 4451 if (isVectorType(Dest->getType())) { 4452 _movp(Dest, T); 4453 } else { 4454 _mov(Dest, T); 4455 } 4456 return; 4457 } 4458 case Intrinsics::Stacksave: { 4459 if (!Traits::Is64Bit || !NeedSandboxing) { 4460 Variable *esp = Func->getTarget()->getPhysicalRegister(getStackReg(), 4461 Traits::WordType); 4462 Variable *Dest = Instr->getDest(); 4463 _mov(Dest, esp); 4464 return; 4465 } 4466 Variable *esp = Func->getTarget()->getPhysicalRegister( 4467 Traits::RegisterSet::Reg_esp, IceType_i32); 4468 Variable *Dest = Instr->getDest(); 4469 _mov(Dest, esp); 4470 4471 return; 4472 } 4473 case Intrinsics::Stackrestore: { 4474 Operand *Src = Instr->getArg(0); 4475 _mov_sp(Src); 4476 return; 4477 } 4478 4479 case Intrinsics::Trap: 4480 _ud2(); 4481 return; 4482 case Intrinsics::LoadSubVector: { 4483 assert(llvm::isa<ConstantInteger32>(Instr->getArg(1)) && 4484 "LoadSubVector second argument must be a constant"); 4485 Variable *Dest = Instr->getDest(); 4486 Type Ty = Dest->getType(); 4487 auto *SubVectorSize = llvm::cast<ConstantInteger32>(Instr->getArg(1)); 4488 Operand *Addr = Instr->getArg(0); 4489 X86OperandMem *Src = formMemoryOperand(Addr, Ty); 4490 doMockBoundsCheck(Src); 4491 4492 if (Dest->isRematerializable()) { 4493 Context.insert<InstFakeDef>(Dest); 4494 return; 4495 } 4496 4497 auto *T = makeReg(Ty); 4498 switch (SubVectorSize->getValue()) { 4499 case 4: 4500 _movd(T, Src); 4501 break; 4502 case 8: 4503 _movq(T, Src); 4504 break; 4505 default: 4506 Func->setError("Unexpected size for LoadSubVector"); 4507 return; 4508 } 4509 _movp(Dest, T); 4510 return; 4511 } 4512 case Intrinsics::StoreSubVector: { 4513 assert(llvm::isa<ConstantInteger32>(Instr->getArg(2)) && 4514 "StoreSubVector third argument must be a 
constant"); 4515 auto *SubVectorSize = llvm::cast<ConstantInteger32>(Instr->getArg(2)); 4516 Operand *Value = Instr->getArg(0); 4517 Operand *Addr = Instr->getArg(1); 4518 X86OperandMem *NewAddr = formMemoryOperand(Addr, Value->getType()); 4519 doMockBoundsCheck(NewAddr); 4520 4521 Value = legalizeToReg(Value); 4522 4523 switch (SubVectorSize->getValue()) { 4524 case 4: 4525 _stored(Value, NewAddr); 4526 break; 4527 case 8: 4528 _storeq(Value, NewAddr); 4529 break; 4530 default: 4531 Func->setError("Unexpected size for StoreSubVector"); 4532 return; 4533 } 4534 return; 4535 } 4536 case Intrinsics::VectorPackSigned: { 4537 Operand *Src0 = Instr->getArg(0); 4538 Operand *Src1 = Instr->getArg(1); 4539 Variable *Dest = Instr->getDest(); 4540 auto *T = makeReg(Src0->getType()); 4541 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 4542 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); 4543 _movp(T, Src0RM); 4544 _packss(T, Src1RM); 4545 _movp(Dest, T); 4546 return; 4547 } 4548 case Intrinsics::VectorPackUnsigned: { 4549 Operand *Src0 = Instr->getArg(0); 4550 Operand *Src1 = Instr->getArg(1); 4551 Variable *Dest = Instr->getDest(); 4552 auto *T = makeReg(Src0->getType()); 4553 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 4554 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); 4555 _movp(T, Src0RM); 4556 _packus(T, Src1RM); 4557 _movp(Dest, T); 4558 return; 4559 } 4560 case Intrinsics::SignMask: { 4561 Operand *SrcReg = legalizeToReg(Instr->getArg(0)); 4562 Variable *Dest = Instr->getDest(); 4563 Variable *T = makeReg(IceType_i32); 4564 if (SrcReg->getType() == IceType_v4f32 || 4565 SrcReg->getType() == IceType_v4i32 || 4566 SrcReg->getType() == IceType_v16i8) { 4567 _movmsk(T, SrcReg); 4568 } else { 4569 // TODO(capn): We could implement v8i16 sign mask using packsswb/pmovmskb 4570 llvm::report_fatal_error("Invalid type for SignMask intrinsic"); 4571 } 4572 _mov(Dest, T); 4573 return; 4574 } 4575 case Intrinsics::MultiplyHighSigned: { 4576 Operand *Src0 = Instr->getArg(0); 4577 Operand *Src1 = Instr->getArg(1); 4578 Variable *Dest = Instr->getDest(); 4579 auto *T = makeReg(Dest->getType()); 4580 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 4581 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); 4582 _movp(T, Src0RM); 4583 _pmulhw(T, Src1RM); 4584 _movp(Dest, T); 4585 return; 4586 } 4587 case Intrinsics::MultiplyHighUnsigned: { 4588 Operand *Src0 = Instr->getArg(0); 4589 Operand *Src1 = Instr->getArg(1); 4590 Variable *Dest = Instr->getDest(); 4591 auto *T = makeReg(Dest->getType()); 4592 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 4593 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); 4594 _movp(T, Src0RM); 4595 _pmulhuw(T, Src1RM); 4596 _movp(Dest, T); 4597 return; 4598 } 4599 case Intrinsics::MultiplyAddPairs: { 4600 Operand *Src0 = Instr->getArg(0); 4601 Operand *Src1 = Instr->getArg(1); 4602 Variable *Dest = Instr->getDest(); 4603 auto *T = makeReg(Dest->getType()); 4604 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 4605 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); 4606 _movp(T, Src0RM); 4607 _pmaddwd(T, Src1RM); 4608 _movp(Dest, T); 4609 return; 4610 } 4611 case Intrinsics::AddSaturateSigned: { 4612 Operand *Src0 = Instr->getArg(0); 4613 Operand *Src1 = Instr->getArg(1); 4614 Variable *Dest = Instr->getDest(); 4615 auto *T = makeReg(Dest->getType()); 4616 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 4617 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); 4618 _movp(T, Src0RM); 4619 _padds(T, Src1RM); 4620 
_movp(Dest, T); 4621 return; 4622 } 4623 case Intrinsics::SubtractSaturateSigned: { 4624 Operand *Src0 = Instr->getArg(0); 4625 Operand *Src1 = Instr->getArg(1); 4626 Variable *Dest = Instr->getDest(); 4627 auto *T = makeReg(Dest->getType()); 4628 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 4629 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); 4630 _movp(T, Src0RM); 4631 _psubs(T, Src1RM); 4632 _movp(Dest, T); 4633 return; 4634 } 4635 case Intrinsics::AddSaturateUnsigned: { 4636 Operand *Src0 = Instr->getArg(0); 4637 Operand *Src1 = Instr->getArg(1); 4638 Variable *Dest = Instr->getDest(); 4639 auto *T = makeReg(Dest->getType()); 4640 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 4641 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); 4642 _movp(T, Src0RM); 4643 _paddus(T, Src1RM); 4644 _movp(Dest, T); 4645 return; 4646 } 4647 case Intrinsics::SubtractSaturateUnsigned: { 4648 Operand *Src0 = Instr->getArg(0); 4649 Operand *Src1 = Instr->getArg(1); 4650 Variable *Dest = Instr->getDest(); 4651 auto *T = makeReg(Dest->getType()); 4652 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 4653 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); 4654 _movp(T, Src0RM); 4655 _psubus(T, Src1RM); 4656 _movp(Dest, T); 4657 return; 4658 } 4659 case Intrinsics::Nearbyint: { 4660 Operand *Src = Instr->getArg(0); 4661 Variable *Dest = Instr->getDest(); 4662 Type DestTy = Dest->getType(); 4663 if (isVectorType(DestTy)) { 4664 assert(DestTy == IceType_v4i32); 4665 assert(Src->getType() == IceType_v4f32); 4666 Operand *Src0R = legalizeToReg(Src); 4667 Variable *T = makeReg(DestTy); 4668 _cvt(T, Src0R, Traits::Insts::Cvt::Ps2dq); 4669 _movp(Dest, T); 4670 } else if (!Traits::Is64Bit && DestTy == IceType_i64) { 4671 llvm::report_fatal_error("Helper call was expected"); 4672 } else { 4673 Operand *Src0RM = legalize(Src, Legal_Reg | Legal_Mem); 4674 // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type 4675 Variable *T_1 = nullptr; 4676 if (Traits::Is64Bit && DestTy == IceType_i64) { 4677 T_1 = makeReg(IceType_i64); 4678 } else { 4679 assert(DestTy != IceType_i64); 4680 T_1 = makeReg(IceType_i32); 4681 } 4682 // cvt() requires its integer argument to be a GPR. 
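// (The scalar path relies on cvtss2si / cvtps2dq honoring the current MXCSR
// rounding mode -- round-to-nearest-even by default -- which is what gives
// this lowering nearbyint rather than truncation semantics; the truncating
// forms are the cvtt* variants used for fptosi.)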
4683 Variable *T_2 = makeReg(DestTy); 4684 if (isByteSizedType(DestTy)) { 4685 assert(T_1->getType() == IceType_i32); 4686 T_1->setRegClass(RCX86_Is32To8); 4687 T_2->setRegClass(RCX86_IsTrunc8Rcvr); 4688 } 4689 _cvt(T_1, Src0RM, Traits::Insts::Cvt::Ss2si); 4690 _mov(T_2, T_1); // T_1 and T_2 may have different integer types 4691 if (DestTy == IceType_i1) 4692 _and(T_2, Ctx->getConstantInt1(1)); 4693 _mov(Dest, T_2); 4694 } 4695 return; 4696 } 4697 case Intrinsics::Round: { 4698 assert(InstructionSet >= Traits::SSE4_1); 4699 Variable *Dest = Instr->getDest(); 4700 Operand *Src = Instr->getArg(0); 4701 Operand *Mode = Instr->getArg(1); 4702 assert(llvm::isa<ConstantInteger32>(Mode) && 4703 "Round last argument must be a constant"); 4704 auto *SrcRM = legalize(Src, Legal_Reg | Legal_Mem); 4705 int32_t Imm = llvm::cast<ConstantInteger32>(Mode)->getValue(); 4706 (void)Imm; 4707 assert(Imm >= 0 && Imm < 4 && "Invalid rounding mode"); 4708 auto *T = makeReg(Dest->getType()); 4709 _round(T, SrcRM, Mode); 4710 _movp(Dest, T); 4711 return; 4712 } 4713 default: // UnknownIntrinsic 4714 Func->setError("Unexpected intrinsic"); 4715 return; 4716 } 4717 return; 4718 } 4719 4720 template <typename TraitsType> 4721 void TargetX86Base<TraitsType>::lowerAtomicCmpxchg(Variable *DestPrev, 4722 Operand *Ptr, 4723 Operand *Expected, 4724 Operand *Desired) { 4725 Type Ty = Expected->getType(); 4726 if (!Traits::Is64Bit && Ty == IceType_i64) { 4727 // Reserve the pre-colored registers first, before adding any more 4728 // infinite-weight variables from formMemoryOperand's legalization. 4729 Variable *T_edx = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx); 4730 Variable *T_eax = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax); 4731 Variable *T_ecx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ecx); 4732 Variable *T_ebx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ebx); 4733 _mov(T_eax, loOperand(Expected)); 4734 _mov(T_edx, hiOperand(Expected)); 4735 _mov(T_ebx, loOperand(Desired)); 4736 _mov(T_ecx, hiOperand(Desired)); 4737 X86OperandMem *Addr = formMemoryOperand(Ptr, Ty); 4738 constexpr bool Locked = true; 4739 _cmpxchg8b(Addr, T_edx, T_eax, T_ecx, T_ebx, Locked); 4740 auto *DestLo = llvm::cast<Variable>(loOperand(DestPrev)); 4741 auto *DestHi = llvm::cast<Variable>(hiOperand(DestPrev)); 4742 _mov(DestLo, T_eax); 4743 _mov(DestHi, T_edx); 4744 return; 4745 } 4746 RegNumT Eax; 4747 switch (Ty) { 4748 default: 4749 llvm::report_fatal_error("Bad type for cmpxchg"); 4750 case IceType_i64: 4751 Eax = Traits::getRaxOrDie(); 4752 break; 4753 case IceType_i32: 4754 Eax = Traits::RegisterSet::Reg_eax; 4755 break; 4756 case IceType_i16: 4757 Eax = Traits::RegisterSet::Reg_ax; 4758 break; 4759 case IceType_i8: 4760 Eax = Traits::RegisterSet::Reg_al; 4761 break; 4762 } 4763 Variable *T_eax = makeReg(Ty, Eax); 4764 _mov(T_eax, Expected); 4765 X86OperandMem *Addr = formMemoryOperand(Ptr, Ty); 4766 Variable *DesiredReg = legalizeToReg(Desired); 4767 constexpr bool Locked = true; 4768 _cmpxchg(Addr, T_eax, DesiredReg, Locked); 4769 _mov(DestPrev, T_eax); 4770 } 4771 4772 template <typename TraitsType> 4773 bool TargetX86Base<TraitsType>::tryOptimizedCmpxchgCmpBr(Variable *Dest, 4774 Operand *PtrToMem, 4775 Operand *Expected, 4776 Operand *Desired) { 4777 if (Func->getOptLevel() == Opt_m1) 4778 return false; 4779 // Peek ahead a few instructions and see how Dest is used. 4780 // It's very common to have: 4781 // 4782 // %x = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* ptr, i32 %expected, ...) 4783 // [%y_phi = ...] 
// list of phi stores
4784 // %p = icmp eq i32 %x, %expected
4785 // br i1 %p, label %l1, label %l2
4786 //
4787 // which we can optimize into:
4788 //
4789 // %x = <cmpxchg code>
4790 // [%y_phi = ...] // list of phi stores
4791 // br eq, %l1, %l2
4792 InstList::iterator I = Context.getCur();
4793 // I is currently the InstIntrinsic. Peek past that.
4794 // This assumes that the atomic cmpxchg has not been lowered yet,
4795 // so that the instructions seen in the scan from "Cur" are simple.
4796 assert(llvm::isa<InstIntrinsic>(*I));
4797 Inst *NextInst = Context.getNextInst(I);
4798 if (!NextInst)
4799 return false;
4800 // There might be phi assignments right before the compare+branch, since this
4801 // could be a backward branch for a loop. This placement of assignments is
4802 // determined by placePhiStores().
4803 CfgVector<InstAssign *> PhiAssigns;
4804 while (auto *PhiAssign = llvm::dyn_cast<InstAssign>(NextInst)) {
4805 if (PhiAssign->getDest() == Dest)
4806 return false;
4807 PhiAssigns.push_back(PhiAssign);
4808 NextInst = Context.getNextInst(I);
4809 if (!NextInst)
4810 return false;
4811 }
4812 if (auto *NextCmp = llvm::dyn_cast<InstIcmp>(NextInst)) {
4813 if (!(NextCmp->getCondition() == InstIcmp::Eq &&
4814 ((NextCmp->getSrc(0) == Dest && NextCmp->getSrc(1) == Expected) ||
4815 (NextCmp->getSrc(1) == Dest && NextCmp->getSrc(0) == Expected)))) {
4816 return false;
4817 }
4818 NextInst = Context.getNextInst(I);
4819 if (!NextInst)
4820 return false;
4821 if (auto *NextBr = llvm::dyn_cast<InstBr>(NextInst)) {
4822 if (!NextBr->isUnconditional() &&
4823 NextCmp->getDest() == NextBr->getCondition() &&
4824 NextBr->isLastUse(NextCmp->getDest())) {
4825 lowerAtomicCmpxchg(Dest, PtrToMem, Expected, Desired);
4826 for (size_t i = 0; i < PhiAssigns.size(); ++i) {
4827 // Lower the phi assignments now, before the branch (same placement
4828 // as before).
4829 InstAssign *PhiAssign = PhiAssigns[i];
4830 PhiAssign->setDeleted();
4831 lowerAssign(PhiAssign);
4832 Context.advanceNext();
4833 }
4834 _br(Traits::Cond::Br_e, NextBr->getTargetTrue(),
4835 NextBr->getTargetFalse());
4836 // Skip over the old compare and branch, by deleting them.
4837 NextCmp->setDeleted();
4838 NextBr->setDeleted();
4839 Context.advanceNext();
4840 Context.advanceNext();
4841 return true;
4842 }
4843 }
4844 }
4845 return false;
4846 }
4847
4848 template <typename TraitsType>
4849 void TargetX86Base<TraitsType>::lowerAtomicRMW(Variable *Dest,
4850 uint32_t Operation, Operand *Ptr,
4851 Operand *Val) {
4852 bool NeedsCmpxchg = false;
4853 LowerBinOp Op_Lo = nullptr;
4854 LowerBinOp Op_Hi = nullptr;
4855 switch (Operation) {
4856 default:
4857 Func->setError("Unknown AtomicRMW operation");
4858 return;
4859 case Intrinsics::AtomicAdd: {
4860 if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
4861 // All the fall-through paths must set this to true, but it is only used
4862 // for asserting.
4863 NeedsCmpxchg = true;
4864 Op_Lo = &TargetX86Base<TraitsType>::_add;
4865 Op_Hi = &TargetX86Base<TraitsType>::_adc;
4866 break;
4867 }
4868 X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
4869 constexpr bool Locked = true;
4870 Variable *T = nullptr;
4871 _mov(T, Val);
4872 _xadd(Addr, T, Locked);
4873 _mov(Dest, T);
4874 return;
4875 }
4876 case Intrinsics::AtomicSub: {
4877 if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
4878 NeedsCmpxchg = true;
4879 Op_Lo = &TargetX86Base<TraitsType>::_sub;
4880 Op_Hi = &TargetX86Base<TraitsType>::_sbb;
4881 break;
4882 }
4883 X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
4884 constexpr bool Locked = true;
4885 Variable *T = nullptr;
4886 _mov(T, Val);
4887 _neg(T);
4888 _xadd(Addr, T, Locked);
4889 _mov(Dest, T);
4890 return;
4891 }
4892 case Intrinsics::AtomicOr:
4893 // TODO(jvoung): If Dest is null or dead, then some of these
4894 // operations do not need an "exchange", but just a locked op.
4895 // That appears to be "worth" it for sub, or, and, and xor.
4896 // xadd is probably fine vs lock add for add, and xchg is fine
4897 // vs an atomic store.
4898 NeedsCmpxchg = true;
4899 Op_Lo = &TargetX86Base<TraitsType>::_or;
4900 Op_Hi = &TargetX86Base<TraitsType>::_or;
4901 break;
4902 case Intrinsics::AtomicAnd:
4903 NeedsCmpxchg = true;
4904 Op_Lo = &TargetX86Base<TraitsType>::_and;
4905 Op_Hi = &TargetX86Base<TraitsType>::_and;
4906 break;
4907 case Intrinsics::AtomicXor:
4908 NeedsCmpxchg = true;
4909 Op_Lo = &TargetX86Base<TraitsType>::_xor;
4910 Op_Hi = &TargetX86Base<TraitsType>::_xor;
4911 break;
4912 case Intrinsics::AtomicExchange:
4913 if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
4914 NeedsCmpxchg = true;
4915 // NeedsCmpxchg, but no real Op_Lo/Op_Hi need to be done. The values
4916 // just need to be moved to the ecx and ebx registers.
4917 Op_Lo = nullptr;
4918 Op_Hi = nullptr;
4919 break;
4920 }
4921 X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
4922 Variable *T = nullptr;
4923 _mov(T, Val);
4924 _xchg(Addr, T);
4925 _mov(Dest, T);
4926 return;
4927 }
4928 // Otherwise, we need a cmpxchg loop.
4929 (void)NeedsCmpxchg;
4930 assert(NeedsCmpxchg);
4931 expandAtomicRMWAsCmpxchg(Op_Lo, Op_Hi, Dest, Ptr, Val);
4932 }
4933
4934 template <typename TraitsType>
4935 void TargetX86Base<TraitsType>::expandAtomicRMWAsCmpxchg(LowerBinOp Op_Lo,
4936 LowerBinOp Op_Hi,
4937 Variable *Dest,
4938 Operand *Ptr,
4939 Operand *Val) {
4940 // Expand a more complex RMW operation as a cmpxchg loop:
4941 // For 64-bit:
4942 // mov eax, [ptr]
4943 // mov edx, [ptr + 4]
4944 // .LABEL:
4945 // mov ebx, eax
4946 // <Op_Lo> ebx, <desired_adj_lo>
4947 // mov ecx, edx
4948 // <Op_Hi> ecx, <desired_adj_hi>
4949 // lock cmpxchg8b [ptr]
4950 // jne .LABEL
4951 // mov <dest_lo>, eax
4952 // mov <dest_hi>, edx
4953 //
4954 // For 32-bit:
4955 // mov eax, [ptr]
4956 // .LABEL:
4957 // mov <reg>, eax
4958 // op <reg>, [desired_adj]
4959 // lock cmpxchg [ptr], <reg>
4960 // jne .LABEL
4961 // mov <dest>, eax
4962 //
4963 // If Op_{Lo,Hi} are nullptr, then just copy the value.
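// For instance, a 64-bit AtomicAdd on x86-32 reaches this expansion with
// Op_Lo = _add and Op_Hi = _adc, so each iteration of the loop rebuilds the
// candidate value in ecx:ebx (high:low) before lock cmpxchg8b retries the
// update.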
4964 Val = legalize(Val); 4965 Type Ty = Val->getType(); 4966 if (!Traits::Is64Bit && Ty == IceType_i64) { 4967 Variable *T_edx = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx); 4968 Variable *T_eax = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax); 4969 X86OperandMem *Addr = formMemoryOperand(Ptr, Ty); 4970 _mov(T_eax, loOperand(Addr)); 4971 _mov(T_edx, hiOperand(Addr)); 4972 Variable *T_ecx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ecx); 4973 Variable *T_ebx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ebx); 4974 InstX86Label *Label = InstX86Label::create(Func, this); 4975 const bool IsXchg8b = Op_Lo == nullptr && Op_Hi == nullptr; 4976 if (!IsXchg8b) { 4977 Context.insert(Label); 4978 _mov(T_ebx, T_eax); 4979 (this->*Op_Lo)(T_ebx, loOperand(Val)); 4980 _mov(T_ecx, T_edx); 4981 (this->*Op_Hi)(T_ecx, hiOperand(Val)); 4982 } else { 4983 // This is for xchg, which doesn't need an actual Op_Lo/Op_Hi. 4984 // It just needs the Val loaded into ebx and ecx. 4985 // That can also be done before the loop. 4986 _mov(T_ebx, loOperand(Val)); 4987 _mov(T_ecx, hiOperand(Val)); 4988 Context.insert(Label); 4989 } 4990 constexpr bool Locked = true; 4991 _cmpxchg8b(Addr, T_edx, T_eax, T_ecx, T_ebx, Locked); 4992 _br(Traits::Cond::Br_ne, Label); 4993 if (!IsXchg8b) { 4994 // If Val is a variable, model the extended live range of Val through 4995 // the end of the loop, since it will be re-used by the loop. 4996 if (auto *ValVar = llvm::dyn_cast<Variable>(Val)) { 4997 auto *ValLo = llvm::cast<Variable>(loOperand(ValVar)); 4998 auto *ValHi = llvm::cast<Variable>(hiOperand(ValVar)); 4999 Context.insert<InstFakeUse>(ValLo); 5000 Context.insert<InstFakeUse>(ValHi); 5001 } 5002 } else { 5003 // For xchg, the loop is slightly smaller and ebx/ecx are used. 5004 Context.insert<InstFakeUse>(T_ebx); 5005 Context.insert<InstFakeUse>(T_ecx); 5006 } 5007 // The address base (if any) is also reused in the loop. 5008 if (Variable *Base = Addr->getBase()) 5009 Context.insert<InstFakeUse>(Base); 5010 auto *DestLo = llvm::cast<Variable>(loOperand(Dest)); 5011 auto *DestHi = llvm::cast<Variable>(hiOperand(Dest)); 5012 _mov(DestLo, T_eax); 5013 _mov(DestHi, T_edx); 5014 return; 5015 } 5016 X86OperandMem *Addr = formMemoryOperand(Ptr, Ty); 5017 RegNumT Eax; 5018 switch (Ty) { 5019 default: 5020 llvm::report_fatal_error("Bad type for atomicRMW"); 5021 case IceType_i64: 5022 Eax = Traits::getRaxOrDie(); 5023 break; 5024 case IceType_i32: 5025 Eax = Traits::RegisterSet::Reg_eax; 5026 break; 5027 case IceType_i16: 5028 Eax = Traits::RegisterSet::Reg_ax; 5029 break; 5030 case IceType_i8: 5031 Eax = Traits::RegisterSet::Reg_al; 5032 break; 5033 } 5034 Variable *T_eax = makeReg(Ty, Eax); 5035 _mov(T_eax, Addr); 5036 auto *Label = Context.insert<InstX86Label>(this); 5037 // We want to pick a different register for T than Eax, so don't use 5038 // _mov(T == nullptr, T_eax). 5039 Variable *T = makeReg(Ty); 5040 _mov(T, T_eax); 5041 (this->*Op_Lo)(T, Val); 5042 constexpr bool Locked = true; 5043 _cmpxchg(Addr, T_eax, T, Locked); 5044 _br(Traits::Cond::Br_ne, Label); 5045 // If Val is a variable, model the extended live range of Val through 5046 // the end of the loop, since it will be re-used by the loop. 5047 if (auto *ValVar = llvm::dyn_cast<Variable>(Val)) { 5048 Context.insert<InstFakeUse>(ValVar); 5049 } 5050 // The address base (if any) is also reused in the loop. 
5051 if (Variable *Base = Addr->getBase()) 5052 Context.insert<InstFakeUse>(Base); 5053 _mov(Dest, T_eax); 5054 } 5055 5056 /// Lowers count {trailing, leading} zeros intrinsic. 5057 /// 5058 /// We could do constant folding here, but that should have 5059 /// been done by the front-end/middle-end optimizations. 5060 template <typename TraitsType> 5061 void TargetX86Base<TraitsType>::lowerCountZeros(bool Cttz, Type Ty, 5062 Variable *Dest, 5063 Operand *FirstVal, 5064 Operand *SecondVal) { 5065 // TODO(jvoung): Determine if the user CPU supports LZCNT (BMI). 5066 // Then the instructions will handle the Val == 0 case much more simply 5067 // and won't require conversion from bit position to number of zeros. 5068 // 5069 // Otherwise: 5070 // bsr IF_NOT_ZERO, Val 5071 // mov T_DEST, ((Ty == i32) ? 63 : 127) 5072 // cmovne T_DEST, IF_NOT_ZERO 5073 // xor T_DEST, ((Ty == i32) ? 31 : 63) 5074 // mov DEST, T_DEST 5075 // 5076 // NOTE: T_DEST must be a register because cmov requires its dest to be a 5077 // register. Also, bsf and bsr require their dest to be a register. 5078 // 5079 // The xor DEST, C(31|63) converts a bit position to # of leading zeroes. 5080 // E.g., for 000... 00001100, bsr will say that the most significant bit 5081 // set is at position 3, while the number of leading zeros is 28. Xor is 5082 // like (M - N) for N <= M, and converts 63 to 32, and 127 to 64 (for the 5083 // all-zeros case). 5084 // 5085 // X8632 only: Similar for 64-bit, but start w/ speculating that the upper 32 5086 // bits are all zero, and compute the result for that case (checking the 5087 // lower 32 bits). Then actually compute the result for the upper bits and 5088 // cmov in the result from the lower computation if the earlier speculation 5089 // was correct. 5090 // 5091 // Cttz, is similar, but uses bsf instead, and doesn't require the xor 5092 // bit position conversion, and the speculation is reversed. 5093 5094 // TODO(jpp): refactor this method. 5095 assert(Ty == IceType_i32 || Ty == IceType_i64); 5096 const Type DestTy = Traits::Is64Bit ? Dest->getType() : IceType_i32; 5097 Variable *T = makeReg(DestTy); 5098 Operand *FirstValRM = legalize(FirstVal, Legal_Mem | Legal_Reg); 5099 if (Cttz) { 5100 _bsf(T, FirstValRM); 5101 } else { 5102 _bsr(T, FirstValRM); 5103 } 5104 Variable *T_Dest = makeReg(DestTy); 5105 Constant *_31 = Ctx->getConstantInt32(31); 5106 Constant *_32 = Ctx->getConstantInt(DestTy, 32); 5107 Constant *_63 = Ctx->getConstantInt(DestTy, 63); 5108 Constant *_64 = Ctx->getConstantInt(DestTy, 64); 5109 if (Cttz) { 5110 if (DestTy == IceType_i64) { 5111 _mov(T_Dest, _64); 5112 } else { 5113 _mov(T_Dest, _32); 5114 } 5115 } else { 5116 Constant *_127 = Ctx->getConstantInt(DestTy, 127); 5117 if (DestTy == IceType_i64) { 5118 _mov(T_Dest, _127); 5119 } else { 5120 _mov(T_Dest, _63); 5121 } 5122 } 5123 _cmov(T_Dest, T, Traits::Cond::Br_ne); 5124 if (!Cttz) { 5125 if (DestTy == IceType_i64) { 5126 // Even though there's a _63 available at this point, that constant might 5127 // not be an i32, which will cause the xor emission to fail. 5128 Constant *_63 = Ctx->getConstantInt32(63); 5129 _xor(T_Dest, _63); 5130 } else { 5131 _xor(T_Dest, _31); 5132 } 5133 } 5134 if (Traits::Is64Bit || Ty == IceType_i32) { 5135 _mov(Dest, T_Dest); 5136 return; 5137 } 5138 _add(T_Dest, _32); 5139 auto *DestLo = llvm::cast<Variable>(loOperand(Dest)); 5140 auto *DestHi = llvm::cast<Variable>(hiOperand(Dest)); 5141 // Will be using "test" on this, so we need a registerized variable. 
5142 Variable *SecondVar = legalizeToReg(SecondVal); 5143 Variable *T_Dest2 = makeReg(IceType_i32); 5144 if (Cttz) { 5145 _bsf(T_Dest2, SecondVar); 5146 } else { 5147 _bsr(T_Dest2, SecondVar); 5148 _xor(T_Dest2, _31); 5149 } 5150 _test(SecondVar, SecondVar); 5151 _cmov(T_Dest2, T_Dest, Traits::Cond::Br_e); 5152 _mov(DestLo, T_Dest2); 5153 _mov(DestHi, Ctx->getConstantZero(IceType_i32)); 5154 } 5155 5156 template <typename TraitsType> 5157 void TargetX86Base<TraitsType>::typedLoad(Type Ty, Variable *Dest, 5158 Variable *Base, Constant *Offset) { 5159 // If Offset is a ConstantRelocatable in Non-SFI mode, we will need to 5160 // legalize Mem properly. 5161 if (Offset) 5162 assert(!llvm::isa<ConstantRelocatable>(Offset)); 5163 5164 auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset); 5165 5166 if (isVectorType(Ty)) 5167 _movp(Dest, Mem); 5168 else if (Ty == IceType_f64) 5169 _movq(Dest, Mem); 5170 else 5171 _mov(Dest, Mem); 5172 } 5173 5174 template <typename TraitsType> 5175 void TargetX86Base<TraitsType>::typedStore(Type Ty, Variable *Value, 5176 Variable *Base, Constant *Offset) { 5177 // If Offset is a ConstantRelocatable in Non-SFI mode, we will need to 5178 // legalize Mem properly. 5179 if (Offset) 5180 assert(!llvm::isa<ConstantRelocatable>(Offset)); 5181 5182 auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset); 5183 5184 if (isVectorType(Ty)) 5185 _storep(Value, Mem); 5186 else if (Ty == IceType_f64) 5187 _storeq(Value, Mem); 5188 else 5189 _store(Value, Mem); 5190 } 5191 5192 template <typename TraitsType> 5193 void TargetX86Base<TraitsType>::copyMemory(Type Ty, Variable *Dest, 5194 Variable *Src, int32_t OffsetAmt) { 5195 Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr; 5196 // TODO(ascull): this or add nullptr test to _movp, _movq 5197 Variable *Data = makeReg(Ty); 5198 5199 typedLoad(Ty, Data, Src, Offset); 5200 typedStore(Ty, Data, Dest, Offset); 5201 } 5202 5203 template <typename TraitsType> 5204 void TargetX86Base<TraitsType>::lowerMemcpy(Operand *Dest, Operand *Src, 5205 Operand *Count) { 5206 // There is a load and store for each chunk in the unroll 5207 constexpr uint32_t BytesPerStorep = 16; 5208 5209 // Check if the operands are constants 5210 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count); 5211 const bool IsCountConst = CountConst != nullptr; 5212 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0; 5213 5214 if (shouldOptimizeMemIntrins() && IsCountConst && 5215 CountValue <= BytesPerStorep * Traits::MEMCPY_UNROLL_LIMIT) { 5216 // Unlikely, but nothing to do if it does happen 5217 if (CountValue == 0) 5218 return; 5219 5220 Variable *SrcBase = legalizeToReg(Src); 5221 Variable *DestBase = legalizeToReg(Dest); 5222 5223 // Find the largest type that can be used and use it as much as possible in 5224 // reverse order. Then handle any remainder with overlapping copies. Since 5225 // the remainder will be at the end, there will be reduced pressure on the 5226 // memory unit as the accesses to the same memory are far apart. 5227 Type Ty = largestTypeInSize(CountValue); 5228 uint32_t TyWidth = typeWidthInBytes(Ty); 5229 5230 uint32_t RemainingBytes = CountValue; 5231 int32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth; 5232 while (RemainingBytes >= TyWidth) { 5233 copyMemory(Ty, DestBase, SrcBase, Offset); 5234 RemainingBytes -= TyWidth; 5235 Offset -= TyWidth; 5236 } 5237 5238 if (RemainingBytes == 0) 5239 return; 5240 5241 // Lower the remaining bytes. 
Adjust to larger types in order to make use 5242 // of overlaps in the copies. 5243 Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes); 5244 Offset = CountValue - typeWidthInBytes(LeftOverTy); 5245 copyMemory(LeftOverTy, DestBase, SrcBase, Offset); 5246 return; 5247 } 5248 5249 // Fall back on a function call 5250 InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memcpy, nullptr, 3); 5251 Call->addArg(Dest); 5252 Call->addArg(Src); 5253 Call->addArg(Count); 5254 lowerCall(Call); 5255 } 5256 5257 template <typename TraitsType> 5258 void TargetX86Base<TraitsType>::lowerMemmove(Operand *Dest, Operand *Src, 5259 Operand *Count) { 5260 // There is a load and store for each chunk in the unroll 5261 constexpr uint32_t BytesPerStorep = 16; 5262 5263 // Check if the operands are constants 5264 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count); 5265 const bool IsCountConst = CountConst != nullptr; 5266 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0; 5267 5268 if (shouldOptimizeMemIntrins() && IsCountConst && 5269 CountValue <= BytesPerStorep * Traits::MEMMOVE_UNROLL_LIMIT) { 5270 // Unlikely, but nothing to do if it does happen 5271 if (CountValue == 0) 5272 return; 5273 5274 Variable *SrcBase = legalizeToReg(Src); 5275 Variable *DestBase = legalizeToReg(Dest); 5276 5277 std::tuple<Type, Constant *, Variable *> 5278 Moves[Traits::MEMMOVE_UNROLL_LIMIT]; 5279 Constant *Offset; 5280 Variable *Reg; 5281 5282 // Copy the data into registers as the source and destination could overlap 5283 // so make sure not to clobber the memory. This also means overlapping 5284 // moves can be used as we are taking a safe snapshot of the memory. 5285 Type Ty = largestTypeInSize(CountValue); 5286 uint32_t TyWidth = typeWidthInBytes(Ty); 5287 5288 uint32_t RemainingBytes = CountValue; 5289 int32_t OffsetAmt = (CountValue & ~(TyWidth - 1)) - TyWidth; 5290 size_t N = 0; 5291 while (RemainingBytes >= TyWidth) { 5292 assert(N <= Traits::MEMMOVE_UNROLL_LIMIT); 5293 Offset = Ctx->getConstantInt32(OffsetAmt); 5294 Reg = makeReg(Ty); 5295 typedLoad(Ty, Reg, SrcBase, Offset); 5296 RemainingBytes -= TyWidth; 5297 OffsetAmt -= TyWidth; 5298 Moves[N++] = std::make_tuple(Ty, Offset, Reg); 5299 } 5300 5301 if (RemainingBytes != 0) { 5302 // Lower the remaining bytes. Adjust to larger types in order to make use 5303 // of overlaps in the copies. 
5304 assert(N <= Traits::MEMMOVE_UNROLL_LIMIT);
5305 Ty = firstTypeThatFitsSize(RemainingBytes);
5306 Offset = Ctx->getConstantInt32(CountValue - typeWidthInBytes(Ty));
5307 Reg = makeReg(Ty);
5308 typedLoad(Ty, Reg, SrcBase, Offset);
5309 Moves[N++] = std::make_tuple(Ty, Offset, Reg);
5310 }
5311
5312 // Copy the data out into the destination memory
5313 for (size_t i = 0; i < N; ++i) {
5314 std::tie(Ty, Offset, Reg) = Moves[i];
5315 typedStore(Ty, Reg, DestBase, Offset);
5316 }
5317
5318 return;
5319 }
5320
5321 // Fall back on a function call
5322 InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memmove, nullptr, 3);
5323 Call->addArg(Dest);
5324 Call->addArg(Src);
5325 Call->addArg(Count);
5326 lowerCall(Call);
5327 }
5328
5329 template <typename TraitsType>
5330 void TargetX86Base<TraitsType>::lowerMemset(Operand *Dest, Operand *Val,
5331 Operand *Count) {
5332 constexpr uint32_t BytesPerStorep = 16;
5333 constexpr uint32_t BytesPerStoreq = 8;
5334 constexpr uint32_t BytesPerStorei32 = 4;
5335 assert(Val->getType() == IceType_i8);
5336
5337 // Check if the operands are constants
5338 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
5339 const auto *ValConst = llvm::dyn_cast<const ConstantInteger32>(Val);
5340 const bool IsCountConst = CountConst != nullptr;
5341 const bool IsValConst = ValConst != nullptr;
5342 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
5343 const uint32_t ValValue = IsValConst ? ValConst->getValue() : 0;
5344
5345 // Unlikely, but nothing to do if it does happen
5346 if (IsCountConst && CountValue == 0)
5347 return;
5348
5349 // TODO(ascull): if the count is constant but val is not it would be possible
5350 // to inline by spreading the value across 4 bytes and accessing subregs e.g.
5351 // eax, ax and al.
5352 if (shouldOptimizeMemIntrins() && IsCountConst && IsValConst) {
5353 Variable *Base = nullptr;
5354 Variable *VecReg = nullptr;
5355 const uint32_t MaskValue = (ValValue & 0xff);
5356 const uint32_t SpreadValue =
5357 (MaskValue << 24) | (MaskValue << 16) | (MaskValue << 8) | MaskValue;
5358
5359 auto lowerSet = [this, &Base, SpreadValue, &VecReg](Type Ty,
5360 uint32_t OffsetAmt) {
5361 assert(Base != nullptr);
5362 Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
5363
5364 // TODO(ascull): is 64-bit better with vector or scalar movq?
5365 auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);
5366 if (isVectorType(Ty)) {
5367 assert(VecReg != nullptr);
5368 _storep(VecReg, Mem);
5369 } else if (Ty == IceType_f64) {
5370 assert(VecReg != nullptr);
5371 _storeq(VecReg, Mem);
5372 } else {
5373 assert(Ty != IceType_i64);
5374 _store(Ctx->getConstantInt(Ty, SpreadValue), Mem);
5375 }
5376 };
5377
5378 // Find the largest type that can be used and use it as much as possible in
5379 // reverse order. Then handle any remainder with overlapping copies. Since
5380 // the remainder will be at the end, there will be reduced pressure on the
5381 // memory unit as the accesses to the same memory are far apart.
5382 Type Ty = IceType_void;
5383 if (ValValue == 0 && CountValue >= BytesPerStoreq &&
5384 CountValue <= BytesPerStorep * Traits::MEMSET_UNROLL_LIMIT) {
5385 // When the value is zero it can be loaded into a vector register cheaply
5386 // using the xor trick.
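// Worked example (a sketch; it assumes largestTypeInSize/firstTypeThatFitsSize
// both select the 16-byte vector type here, and that the unroll limit allows
// it): a constant memset(p, 0, 30) becomes one 16-byte store of the zeroed
// xmm register at [p + 0] plus one overlapping 16-byte store at [p + 14],
// rather than a longer chain of narrower stores.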
5387 Base = legalizeToReg(Dest); 5388 VecReg = makeVectorOfZeros(IceType_v16i8); 5389 Ty = largestTypeInSize(CountValue); 5390 } else if (CountValue <= BytesPerStorei32 * Traits::MEMSET_UNROLL_LIMIT) { 5391 // When the value is non-zero or the count is small we can't use vector 5392 // instructions so are limited to 32-bit stores. 5393 Base = legalizeToReg(Dest); 5394 constexpr uint32_t MaxSize = 4; 5395 Ty = largestTypeInSize(CountValue, MaxSize); 5396 } 5397 5398 if (Base) { 5399 uint32_t TyWidth = typeWidthInBytes(Ty); 5400 5401 uint32_t RemainingBytes = CountValue; 5402 uint32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth; 5403 while (RemainingBytes >= TyWidth) { 5404 lowerSet(Ty, Offset); 5405 RemainingBytes -= TyWidth; 5406 Offset -= TyWidth; 5407 } 5408 5409 if (RemainingBytes == 0) 5410 return; 5411 5412 // Lower the remaining bytes. Adjust to larger types in order to make use 5413 // of overlaps in the copies. 5414 Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes); 5415 Offset = CountValue - typeWidthInBytes(LeftOverTy); 5416 lowerSet(LeftOverTy, Offset); 5417 return; 5418 } 5419 } 5420 5421 // Fall back on calling the memset function. The value operand needs to be 5422 // extended to a stack slot size because the PNaCl ABI requires arguments to 5423 // be at least 32 bits wide. 5424 Operand *ValExt; 5425 if (IsValConst) { 5426 ValExt = Ctx->getConstantInt(stackSlotType(), ValValue); 5427 } else { 5428 Variable *ValExtVar = Func->makeVariable(stackSlotType()); 5429 lowerCast(InstCast::create(Func, InstCast::Zext, ValExtVar, Val)); 5430 ValExt = ValExtVar; 5431 } 5432 InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memset, nullptr, 3); 5433 Call->addArg(Dest); 5434 Call->addArg(ValExt); 5435 Call->addArg(Count); 5436 lowerCall(Call); 5437 } 5438 5439 class AddressOptimizer { 5440 AddressOptimizer() = delete; 5441 AddressOptimizer(const AddressOptimizer &) = delete; 5442 AddressOptimizer &operator=(const AddressOptimizer &) = delete; 5443 5444 public: 5445 explicit AddressOptimizer(const Cfg *Func) 5446 : Func(Func), VMetadata(Func->getVMetadata()) {} 5447 5448 inline void dumpAddressOpt(const ConstantRelocatable *const Relocatable, 5449 int32_t Offset, const Variable *Base, 5450 const Variable *Index, uint16_t Shift, 5451 const Inst *Reason) const; 5452 5453 inline const Inst *matchAssign(Variable **Var, 5454 ConstantRelocatable **Relocatable, 5455 int32_t *Offset); 5456 5457 inline const Inst *matchCombinedBaseIndex(Variable **Base, Variable **Index, 5458 uint16_t *Shift); 5459 5460 inline const Inst *matchShiftedIndex(Variable **Index, uint16_t *Shift); 5461 5462 inline const Inst *matchOffsetIndexOrBase(Variable **IndexOrBase, 5463 const uint16_t Shift, 5464 ConstantRelocatable **Relocatable, 5465 int32_t *Offset); 5466 5467 private: 5468 const Cfg *const Func; 5469 const VariablesMetadata *const VMetadata; 5470 5471 static bool isAdd(const Inst *Instr) { 5472 if (auto *Arith = llvm::dyn_cast_or_null<const InstArithmetic>(Instr)) { 5473 return (Arith->getOp() == InstArithmetic::Add); 5474 } 5475 return false; 5476 } 5477 }; 5478 5479 void AddressOptimizer::dumpAddressOpt( 5480 const ConstantRelocatable *const Relocatable, int32_t Offset, 5481 const Variable *Base, const Variable *Index, uint16_t Shift, 5482 const Inst *Reason) const { 5483 if (!BuildDefs::dump()) 5484 return; 5485 if (!Func->isVerbose(IceV_AddrOpt)) 5486 return; 5487 OstreamLocker L(Func->getContext()); 5488 Ostream &Str = Func->getContext()->getStrDump(); 5489 Str << "Instruction: "; 5490 
Reason->dumpDecorated(Func); 5491 Str << " results in Base="; 5492 if (Base) 5493 Base->dump(Func); 5494 else 5495 Str << "<null>"; 5496 Str << ", Index="; 5497 if (Index) 5498 Index->dump(Func); 5499 else 5500 Str << "<null>"; 5501 Str << ", Shift=" << Shift << ", Offset=" << Offset 5502 << ", Relocatable=" << Relocatable << "\n"; 5503 } 5504 5505 const Inst *AddressOptimizer::matchAssign(Variable **Var, 5506 ConstantRelocatable **Relocatable, 5507 int32_t *Offset) { 5508 // Var originates from Var=SrcVar ==> set Var:=SrcVar 5509 if (*Var == nullptr) 5510 return nullptr; 5511 if (const Inst *VarAssign = VMetadata->getSingleDefinition(*Var)) { 5512 assert(!VMetadata->isMultiDef(*Var)); 5513 if (llvm::isa<InstAssign>(VarAssign)) { 5514 Operand *SrcOp = VarAssign->getSrc(0); 5515 assert(SrcOp); 5516 if (auto *SrcVar = llvm::dyn_cast<Variable>(SrcOp)) { 5517 if (!VMetadata->isMultiDef(SrcVar) && 5518 // TODO: ensure SrcVar stays single-BB 5519 true) { 5520 *Var = SrcVar; 5521 return VarAssign; 5522 } 5523 } else if (auto *Const = llvm::dyn_cast<ConstantInteger32>(SrcOp)) { 5524 int32_t MoreOffset = Const->getValue(); 5525 if (Utils::WouldOverflowAdd(*Offset, MoreOffset)) 5526 return nullptr; 5527 *Var = nullptr; 5528 *Offset += MoreOffset; 5529 return VarAssign; 5530 } else if (auto *AddReloc = llvm::dyn_cast<ConstantRelocatable>(SrcOp)) { 5531 if (*Relocatable == nullptr) { 5532 // It is always safe to fold a relocatable through assignment -- the 5533 // assignment frees a slot in the address operand that can be used to 5534 // hold the Sandbox Pointer -- if any. 5535 *Var = nullptr; 5536 *Relocatable = AddReloc; 5537 return VarAssign; 5538 } 5539 } 5540 } 5541 } 5542 return nullptr; 5543 } 5544 5545 const Inst *AddressOptimizer::matchCombinedBaseIndex(Variable **Base, 5546 Variable **Index, 5547 uint16_t *Shift) { 5548 // Index==nullptr && Base is Base=Var1+Var2 ==> 5549 // set Base=Var1, Index=Var2, Shift=0 5550 if (*Base == nullptr) 5551 return nullptr; 5552 if (*Index != nullptr) 5553 return nullptr; 5554 auto *BaseInst = VMetadata->getSingleDefinition(*Base); 5555 if (BaseInst == nullptr) 5556 return nullptr; 5557 assert(!VMetadata->isMultiDef(*Base)); 5558 if (BaseInst->getSrcSize() < 2) 5559 return nullptr; 5560 if (auto *Var1 = llvm::dyn_cast<Variable>(BaseInst->getSrc(0))) { 5561 if (VMetadata->isMultiDef(Var1)) 5562 return nullptr; 5563 if (auto *Var2 = llvm::dyn_cast<Variable>(BaseInst->getSrc(1))) { 5564 if (VMetadata->isMultiDef(Var2)) 5565 return nullptr; 5566 if (isAdd(BaseInst) && 5567 // TODO: ensure Var1 and Var2 stay single-BB 5568 true) { 5569 *Base = Var1; 5570 *Index = Var2; 5571 *Shift = 0; // should already have been 0 5572 return BaseInst; 5573 } 5574 } 5575 } 5576 return nullptr; 5577 } 5578 5579 const Inst *AddressOptimizer::matchShiftedIndex(Variable **Index, 5580 uint16_t *Shift) { 5581 // Index is Index=Var*Const && log2(Const)+Shift<=3 ==> 5582 // Index=Var, Shift+=log2(Const) 5583 if (*Index == nullptr) 5584 return nullptr; 5585 auto *IndexInst = VMetadata->getSingleDefinition(*Index); 5586 if (IndexInst == nullptr) 5587 return nullptr; 5588 assert(!VMetadata->isMultiDef(*Index)); 5589 5590 // When using an unsigned 32-bit array index on x64, it gets zero-extended 5591 // before the shift & add. The explicit zero extension can be eliminated 5592 // because x86 32-bit operations automatically get zero-extended into the 5593 // corresponding 64-bit register. 
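// Illustrative pattern (hypothetical variable names):
//   %i   = mul i32 %a, 4          ; or: shl i32 %a, 2
//   %idx = zext i32 %i to i64     ; %idx is the current Index
// The matcher peeks through the zext to %i's defining mul/shl, since the
// 32-bit definition already zero-fills the upper half of the register.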
5594 if (auto *CastInst = llvm::dyn_cast<InstCast>(IndexInst)) { 5595 if (CastInst->getCastKind() == InstCast::Zext) { 5596 if (auto *Var = llvm::dyn_cast<Variable>(CastInst->getSrc(0))) { 5597 if (Var->getType() == IceType_i32 && 5598 CastInst->getDest()->getType() == IceType_i64) { 5599 IndexInst = VMetadata->getSingleDefinition(Var); 5600 } 5601 } 5602 } 5603 } 5604 5605 if (IndexInst->getSrcSize() < 2) 5606 return nullptr; 5607 if (auto *ArithInst = llvm::dyn_cast<InstArithmetic>(IndexInst)) { 5608 if (auto *Var = llvm::dyn_cast<Variable>(ArithInst->getSrc(0))) { 5609 if (auto *Const = 5610 llvm::dyn_cast<ConstantInteger32>(ArithInst->getSrc(1))) { 5611 if (VMetadata->isMultiDef(Var) || Const->getType() != IceType_i32) 5612 return nullptr; 5613 switch (ArithInst->getOp()) { 5614 default: 5615 return nullptr; 5616 case InstArithmetic::Mul: { 5617 uint32_t Mult = Const->getValue(); 5618 uint32_t LogMult; 5619 switch (Mult) { 5620 case 1: 5621 LogMult = 0; 5622 break; 5623 case 2: 5624 LogMult = 1; 5625 break; 5626 case 4: 5627 LogMult = 2; 5628 break; 5629 case 8: 5630 LogMult = 3; 5631 break; 5632 default: 5633 return nullptr; 5634 } 5635 if (*Shift + LogMult <= 3) { 5636 *Index = Var; 5637 *Shift += LogMult; 5638 return IndexInst; 5639 } 5640 } 5641 case InstArithmetic::Shl: { 5642 uint32_t ShiftAmount = Const->getValue(); 5643 switch (ShiftAmount) { 5644 case 0: 5645 case 1: 5646 case 2: 5647 case 3: 5648 break; 5649 default: 5650 return nullptr; 5651 } 5652 if (*Shift + ShiftAmount <= 3) { 5653 *Index = Var; 5654 *Shift += ShiftAmount; 5655 return IndexInst; 5656 } 5657 } 5658 } 5659 } 5660 } 5661 } 5662 return nullptr; 5663 } 5664 5665 const Inst *AddressOptimizer::matchOffsetIndexOrBase( 5666 Variable **IndexOrBase, const uint16_t Shift, 5667 ConstantRelocatable **Relocatable, int32_t *Offset) { 5668 // Base is Base=Var+Const || Base is Base=Const+Var ==> 5669 // set Base=Var, Offset+=Const 5670 // Base is Base=Var-Const ==> 5671 // set Base=Var, Offset-=Const 5672 // Index is Index=Var+Const ==> 5673 // set Index=Var, Offset+=(Const<<Shift) 5674 // Index is Index=Const+Var ==> 5675 // set Index=Var, Offset+=(Const<<Shift) 5676 // Index is Index=Var-Const ==> 5677 // set Index=Var, Offset-=(Const<<Shift) 5678 // Treat Index=Var Or Const as Index=Var + Const 5679 // when Var = Var' << N and log2(Const) <= N 5680 // or when Var = (2^M) * (2^N) and log2(Const) <= (M+N) 5681 5682 if (*IndexOrBase == nullptr) { 5683 return nullptr; 5684 } 5685 const Inst *Definition = VMetadata->getSingleDefinition(*IndexOrBase); 5686 if (Definition == nullptr) { 5687 return nullptr; 5688 } 5689 assert(!VMetadata->isMultiDef(*IndexOrBase)); 5690 if (auto *ArithInst = llvm::dyn_cast<const InstArithmetic>(Definition)) { 5691 switch (ArithInst->getOp()) { 5692 case InstArithmetic::Add: 5693 case InstArithmetic::Sub: 5694 case InstArithmetic::Or: 5695 break; 5696 default: 5697 return nullptr; 5698 } 5699 5700 Operand *Src0 = ArithInst->getSrc(0); 5701 Operand *Src1 = ArithInst->getSrc(1); 5702 auto *Var0 = llvm::dyn_cast<Variable>(Src0); 5703 auto *Var1 = llvm::dyn_cast<Variable>(Src1); 5704 auto *Const0 = llvm::dyn_cast<ConstantInteger32>(Src0); 5705 auto *Const1 = llvm::dyn_cast<ConstantInteger32>(Src1); 5706 auto *Reloc0 = llvm::dyn_cast<ConstantRelocatable>(Src0); 5707 auto *Reloc1 = llvm::dyn_cast<ConstantRelocatable>(Src1); 5708 5709 bool IsAdd = false; 5710 if (ArithInst->getOp() == InstArithmetic::Or) { 5711 Variable *Var = nullptr; 5712 ConstantInteger32 *Const = nullptr; 5713 if (Var0 && Const1) { 
5714 Var = Var0; 5715 Const = Const1; 5716 } else if (Const0 && Var1) { 5717 Var = Var1; 5718 Const = Const0; 5719 } else { 5720 return nullptr; 5721 } 5722 auto *VarDef = 5723 llvm::dyn_cast<InstArithmetic>(VMetadata->getSingleDefinition(Var)); 5724 if (VarDef == nullptr) 5725 return nullptr; 5726 5727 SizeT ZeroesAvailable = 0; 5728 if (VarDef->getOp() == InstArithmetic::Shl) { 5729 if (auto *ConstInt = 5730 llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(1))) { 5731 ZeroesAvailable = ConstInt->getValue(); 5732 } 5733 } else if (VarDef->getOp() == InstArithmetic::Mul) { 5734 SizeT PowerOfTwo = 0; 5735 if (auto *MultConst = 5736 llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(0))) { 5737 if (llvm::isPowerOf2_32(MultConst->getValue())) { 5738 PowerOfTwo += MultConst->getValue(); 5739 } 5740 } 5741 if (auto *MultConst = 5742 llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(1))) { 5743 if (llvm::isPowerOf2_32(MultConst->getValue())) { 5744 PowerOfTwo += MultConst->getValue(); 5745 } 5746 } 5747 ZeroesAvailable = llvm::Log2_32(PowerOfTwo) + 1; 5748 } 5749 SizeT ZeroesNeeded = llvm::Log2_32(Const->getValue()) + 1; 5750 if (ZeroesNeeded == 0 || ZeroesNeeded > ZeroesAvailable) 5751 return nullptr; 5752 IsAdd = true; // treat it as an add if the above conditions hold 5753 } else { 5754 IsAdd = ArithInst->getOp() == InstArithmetic::Add; 5755 } 5756 5757 Variable *NewIndexOrBase = nullptr; 5758 int32_t NewOffset = 0; 5759 ConstantRelocatable *NewRelocatable = *Relocatable; 5760 if (Var0 && Var1) 5761 // TODO(sehr): merge base/index splitting into here. 5762 return nullptr; 5763 if (!IsAdd && Var1) 5764 return nullptr; 5765 if (Var0) 5766 NewIndexOrBase = Var0; 5767 else if (Var1) 5768 NewIndexOrBase = Var1; 5769 // Don't know how to add/subtract two relocatables. 5770 if ((*Relocatable && (Reloc0 || Reloc1)) || (Reloc0 && Reloc1)) 5771 return nullptr; 5772 // Don't know how to subtract a relocatable. 5773 if (!IsAdd && Reloc1) 5774 return nullptr; 5775 // Incorporate ConstantRelocatables. 5776 if (Reloc0) 5777 NewRelocatable = Reloc0; 5778 else if (Reloc1) 5779 NewRelocatable = Reloc1; 5780 // Compute the updated constant offset. 5781 if (Const0) { 5782 const int32_t MoreOffset = 5783 IsAdd ? Const0->getValue() : -Const0->getValue(); 5784 if (Utils::WouldOverflowAdd(*Offset + NewOffset, MoreOffset)) 5785 return nullptr; 5786 NewOffset += MoreOffset; 5787 } 5788 if (Const1) { 5789 const int32_t MoreOffset = 5790 IsAdd ? 
Const1->getValue() : -Const1->getValue(); 5791 if (Utils::WouldOverflowAdd(*Offset + NewOffset, MoreOffset)) 5792 return nullptr; 5793 NewOffset += MoreOffset; 5794 } 5795 if (Utils::WouldOverflowAdd(*Offset, NewOffset << Shift)) 5796 return nullptr; 5797 *IndexOrBase = NewIndexOrBase; 5798 *Offset += (NewOffset << Shift); 5799 // Shift is always zero if this is called with the base 5800 *Relocatable = NewRelocatable; 5801 return Definition; 5802 } 5803 return nullptr; 5804 } 5805 5806 template <typename TypeTraits> 5807 typename TargetX86Base<TypeTraits>::X86OperandMem * 5808 TargetX86Base<TypeTraits>::computeAddressOpt(const Inst *Instr, Type MemType, 5809 Operand *Addr) { 5810 Func->resetCurrentNode(); 5811 if (Func->isVerbose(IceV_AddrOpt)) { 5812 OstreamLocker L(Func->getContext()); 5813 Ostream &Str = Func->getContext()->getStrDump(); 5814 Str << "\nStarting computeAddressOpt for instruction:\n "; 5815 Instr->dumpDecorated(Func); 5816 } 5817 5818 OptAddr NewAddr; 5819 NewAddr.Base = llvm::dyn_cast<Variable>(Addr); 5820 if (NewAddr.Base == nullptr) 5821 return nullptr; 5822 5823 // If the Base has more than one use or is live across multiple blocks, then 5824 // don't go further. Alternatively (?), never consider a transformation that 5825 // would change a variable that is currently *not* live across basic block 5826 // boundaries into one that *is*. 5827 if (!getFlags().getLoopInvariantCodeMotion()) { 5828 // Need multi block address opt when licm is enabled. 5829 // Might make sense to restrict to current node and loop header. 5830 if (Func->getVMetadata()->isMultiBlock( 5831 NewAddr.Base) /* || Base->getUseCount() > 1*/) 5832 return nullptr; 5833 } 5834 AddressOptimizer AddrOpt(Func); 5835 const bool MockBounds = getFlags().getMockBoundsCheck(); 5836 const Inst *Reason = nullptr; 5837 bool AddressWasOptimized = false; 5838 // The following unnamed struct identifies the address mode formation steps 5839 // that could potentially create an invalid memory operand (i.e., no free 5840 // slots for RebasePtr.) We add all those variables to this struct so that we 5841 // can use memset() to reset all members to false. 5842 struct { 5843 bool AssignBase = false; 5844 bool AssignIndex = false; 5845 bool OffsetFromBase = false; 5846 bool OffsetFromIndex = false; 5847 bool CombinedBaseIndex = false; 5848 } Skip; 5849 // This points to the boolean in Skip that represents the last folding 5850 // performed. This is used to disable a pattern match that generated an 5851 // invalid address. Without this, the algorithm would never finish. 5852 bool *SkipLastFolding = nullptr; 5853 // NewAddrCheckpoint is used to rollback the address being formed in case an 5854 // invalid address is formed. 5855 OptAddr NewAddrCheckpoint; 5856 Reason = Instr; 5857 do { 5858 if (SandboxingType != ST_None) { 5859 // When sandboxing, we defer the sandboxing of NewAddr to the Concrete 5860 // Target. If our optimization was overly aggressive, then we simply undo 5861 // what the previous iteration did, and set the previous pattern's skip 5862 // bit to true. 
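// (Rollback sketch, restating the comments above: if, say, the AssignIndex
// fold just produced an address with no free slot for RebasePtr, NewAddr is
// restored from NewAddrCheckpoint and Skip.AssignIndex is set via
// SkipLastFolding so that particular fold is not retried, letting the loop
// still make progress on the other patterns.)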
5863 if (!legalizeOptAddrForSandbox(&NewAddr)) { 5864 *SkipLastFolding = true; 5865 SkipLastFolding = nullptr; 5866 NewAddr = NewAddrCheckpoint; 5867 Reason = nullptr; 5868 } 5869 } 5870 5871 if (Reason) { 5872 AddrOpt.dumpAddressOpt(NewAddr.Relocatable, NewAddr.Offset, NewAddr.Base, 5873 NewAddr.Index, NewAddr.Shift, Reason); 5874 AddressWasOptimized = true; 5875 Reason = nullptr; 5876 SkipLastFolding = nullptr; 5877 memset(reinterpret_cast<void *>(&Skip), 0, sizeof(Skip)); 5878 } 5879 5880 NewAddrCheckpoint = NewAddr; 5881 5882 // Update Base and Index to follow through assignments to definitions. 5883 if (!Skip.AssignBase && 5884 (Reason = AddrOpt.matchAssign(&NewAddr.Base, &NewAddr.Relocatable, 5885 &NewAddr.Offset))) { 5886 SkipLastFolding = &Skip.AssignBase; 5887 // Assignments of Base from a Relocatable or ConstantInt32 can result 5888 // in Base becoming nullptr. To avoid code duplication in this loop we 5889 // prefer that Base be non-nullptr if possible. 5890 if ((NewAddr.Base == nullptr) && (NewAddr.Index != nullptr) && 5891 NewAddr.Shift == 0) { 5892 std::swap(NewAddr.Base, NewAddr.Index); 5893 } 5894 continue; 5895 } 5896 if (!Skip.AssignBase && 5897 (Reason = AddrOpt.matchAssign(&NewAddr.Index, &NewAddr.Relocatable, 5898 &NewAddr.Offset))) { 5899 SkipLastFolding = &Skip.AssignIndex; 5900 continue; 5901 } 5902 5903 if (!MockBounds) { 5904 // Transition from: 5905 // <Relocatable + Offset>(Base) to 5906 // <Relocatable + Offset>(Base, Index) 5907 if (!Skip.CombinedBaseIndex && 5908 (Reason = AddrOpt.matchCombinedBaseIndex( 5909 &NewAddr.Base, &NewAddr.Index, &NewAddr.Shift))) { 5910 SkipLastFolding = &Skip.CombinedBaseIndex; 5911 continue; 5912 } 5913 5914 // Recognize multiply/shift and update Shift amount. 5915 // Index becomes Index=Var<<Const && Const+Shift<=3 ==> 5916 // Index=Var, Shift+=Const 5917 // Index becomes Index=Const*Var && log2(Const)+Shift<=3 ==> 5918 // Index=Var, Shift+=log2(Const) 5919 if ((Reason = 5920 AddrOpt.matchShiftedIndex(&NewAddr.Index, &NewAddr.Shift))) { 5921 continue; 5922 } 5923 5924 // If Shift is zero, the choice of Base and Index was purely arbitrary. 5925 // Recognize multiply/shift and set Shift amount. 5926 // Shift==0 && Base is Base=Var*Const && log2(Const)+Shift<=3 ==> 5927 // swap(Index,Base) 5928 // Similar for Base=Const*Var and Base=Var<<Const 5929 if (NewAddr.Shift == 0 && 5930 (Reason = AddrOpt.matchShiftedIndex(&NewAddr.Base, &NewAddr.Shift))) { 5931 std::swap(NewAddr.Base, NewAddr.Index); 5932 continue; 5933 } 5934 } 5935 5936 // Update Offset to reflect additions/subtractions with constants and 5937 // relocatables. 5938 // TODO: consider overflow issues with respect to Offset. 5939 if (!Skip.OffsetFromBase && (Reason = AddrOpt.matchOffsetIndexOrBase( 5940 &NewAddr.Base, /*Shift =*/0, 5941 &NewAddr.Relocatable, &NewAddr.Offset))) { 5942 SkipLastFolding = &Skip.OffsetFromBase; 5943 continue; 5944 } 5945 if (!Skip.OffsetFromIndex && (Reason = AddrOpt.matchOffsetIndexOrBase( 5946 &NewAddr.Index, NewAddr.Shift, 5947 &NewAddr.Relocatable, &NewAddr.Offset))) { 5948 SkipLastFolding = &Skip.OffsetFromIndex; 5949 continue; 5950 } 5951 5952 break; 5953 } while (Reason); 5954 5955 if (!AddressWasOptimized) { 5956 return nullptr; 5957 } 5958 5959 // Undo any addition of RebasePtr. It will be added back when the mem 5960 // operand is sandboxed. 
5961 if (NewAddr.Base == RebasePtr) { 5962 NewAddr.Base = nullptr; 5963 } 5964 5965 if (NewAddr.Index == RebasePtr) { 5966 NewAddr.Index = nullptr; 5967 NewAddr.Shift = 0; 5968 } 5969 5970 Constant *OffsetOp = nullptr; 5971 if (NewAddr.Relocatable == nullptr) { 5972 OffsetOp = Ctx->getConstantInt32(NewAddr.Offset); 5973 } else { 5974 OffsetOp = 5975 Ctx->getConstantSym(NewAddr.Relocatable->getOffset() + NewAddr.Offset, 5976 NewAddr.Relocatable->getName()); 5977 } 5978 // Vanilla ICE load instructions should not use the segment registers, and 5979 // computeAddressOpt only works at the level of Variables and Constants, not 5980 // other X86OperandMem, so there should be no mention of segment 5981 // registers there either. 5982 static constexpr auto SegmentReg = 5983 X86OperandMem::SegmentRegisters::DefaultSegment; 5984 5985 return X86OperandMem::create(Func, MemType, NewAddr.Base, OffsetOp, 5986 NewAddr.Index, NewAddr.Shift, SegmentReg); 5987 } 5988 5989 /// Add a mock bounds check on the memory address before using it as a load or 5990 /// store operand. The basic idea is that given a memory operand [reg], we 5991 /// would first add bounds-check code something like: 5992 /// 5993 /// cmp reg, <lb> 5994 /// jl out_of_line_error 5995 /// cmp reg, <ub> 5996 /// jg out_of_line_error 5997 /// 5998 /// In reality, the specific code will depend on how <lb> and <ub> are 5999 /// represented, e.g. an immediate, a global, or a function argument. 6000 /// 6001 /// As such, we need to enforce that the memory operand does not have the form 6002 /// [reg1+reg2], because then there is no simple cmp instruction that would 6003 /// suffice. However, we consider [reg+offset] to be OK because the offset is 6004 /// usually small, and so <ub> could have a safety buffer built in and then we 6005 /// could instead branch to a custom out_of_line_error that does the precise 6006 /// check and jumps back if it turns out OK. 6007 /// 6008 /// For the purpose of mocking the bounds check, we'll do something like this: 6009 /// 6010 /// cmp reg, 0 6011 /// je label 6012 /// cmp reg, 1 6013 /// je label 6014 /// label: 6015 /// 6016 /// Also note that we don't need to add a bounds check to a dereference of a 6017 /// simple global variable address. 6018 template <typename TraitsType> 6019 void TargetX86Base<TraitsType>::doMockBoundsCheck(Operand *Opnd) { 6020 if (!getFlags().getMockBoundsCheck()) 6021 return; 6022 if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Opnd)) { 6023 if (Mem->getIndex()) { 6024 llvm::report_fatal_error("doMockBoundsCheck: Opnd contains index reg"); 6025 } 6026 Opnd = Mem->getBase(); 6027 } 6028 // At this point Opnd could be nullptr, or Variable, or Constant, or perhaps 6029 // something else. We only care if it is Variable. 6030 auto *Var = llvm::dyn_cast_or_null<Variable>(Opnd); 6031 if (Var == nullptr) 6032 return; 6033 // We use lowerStore() to copy out-args onto the stack. This creates a memory 6034 // operand with the stack pointer as the base register. Don't do bounds 6035 // checks on that. 
6036 if (Var->getRegNum() == getStackReg()) 6037 return; 6038 6039 auto *Label = InstX86Label::create(Func, this); 6040 _cmp(Opnd, Ctx->getConstantZero(IceType_i32)); 6041 _br(Traits::Cond::Br_e, Label); 6042 _cmp(Opnd, Ctx->getConstantInt32(1)); 6043 _br(Traits::Cond::Br_e, Label); 6044 Context.insert(Label); 6045 } 6046 6047 template <typename TraitsType> 6048 void TargetX86Base<TraitsType>::lowerLoad(const InstLoad *Load) { 6049 // A Load instruction can be treated the same as an Assign instruction, after 6050 // the source operand is transformed into an X86OperandMem operand. Note that 6051 // the address mode optimization already creates an X86OperandMem operand, so 6052 // it doesn't need another level of transformation. 6053 Variable *DestLoad = Load->getDest(); 6054 Type Ty = DestLoad->getType(); 6055 Operand *Src0 = formMemoryOperand(Load->getLoadAddress(), Ty); 6056 doMockBoundsCheck(Src0); 6057 auto *Assign = InstAssign::create(Func, DestLoad, Src0); 6058 lowerAssign(Assign); 6059 } 6060 6061 template <typename TraitsType> 6062 void TargetX86Base<TraitsType>::doAddressOptOther() { 6063 // Inverts some Icmp instructions which helps doAddressOptLoad later. 6064 // TODO(manasijm): Refactor to unify the conditions for Var0 and Var1 6065 Inst *Instr = iteratorToInst(Context.getCur()); 6066 auto *VMetadata = Func->getVMetadata(); 6067 if (auto *Icmp = llvm::dyn_cast<InstIcmp>(Instr)) { 6068 if (llvm::isa<Constant>(Icmp->getSrc(0)) || 6069 llvm::isa<Constant>(Icmp->getSrc(1))) 6070 return; 6071 auto *Var0 = llvm::dyn_cast<Variable>(Icmp->getSrc(0)); 6072 if (Var0 == nullptr) 6073 return; 6074 if (!VMetadata->isTracked(Var0)) 6075 return; 6076 auto *Op0Def = VMetadata->getFirstDefinitionSingleBlock(Var0); 6077 if (Op0Def == nullptr || !llvm::isa<InstLoad>(Op0Def)) 6078 return; 6079 if (VMetadata->getLocalUseNode(Var0) != Context.getNode()) 6080 return; 6081 6082 auto *Var1 = llvm::dyn_cast<Variable>(Icmp->getSrc(1)); 6083 if (Var1 != nullptr && VMetadata->isTracked(Var1)) { 6084 auto *Op1Def = VMetadata->getFirstDefinitionSingleBlock(Var1); 6085 if (Op1Def != nullptr && !VMetadata->isMultiBlock(Var1) && 6086 llvm::isa<InstLoad>(Op1Def)) { 6087 return; // Both are loads 6088 } 6089 } 6090 Icmp->reverseConditionAndOperands(); 6091 } 6092 } 6093 6094 template <typename TraitsType> 6095 void TargetX86Base<TraitsType>::doAddressOptLoad() { 6096 Inst *Instr = iteratorToInst(Context.getCur()); 6097 Operand *Addr = Instr->getSrc(0); 6098 Variable *Dest = Instr->getDest(); 6099 if (auto *OptAddr = computeAddressOpt(Instr, Dest->getType(), Addr)) { 6100 Instr->setDeleted(); 6101 Context.insert<InstLoad>(Dest, OptAddr); 6102 } 6103 } 6104 6105 template <typename TraitsType> 6106 void TargetX86Base<TraitsType>::doAddressOptLoadSubVector() { 6107 auto *Intrinsic = llvm::cast<InstIntrinsic>(Context.getCur()); 6108 Operand *Addr = Intrinsic->getArg(0); 6109 Variable *Dest = Intrinsic->getDest(); 6110 if (auto *OptAddr = computeAddressOpt(Intrinsic, Dest->getType(), Addr)) { 6111 Intrinsic->setDeleted(); 6112 const Ice::Intrinsics::IntrinsicInfo Info = { 6113 Ice::Intrinsics::LoadSubVector, Ice::Intrinsics::SideEffects_F, 6114 Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F}; 6115 auto *NewLoad = Context.insert<InstIntrinsic>(2, Dest, Info); 6116 NewLoad->addArg(OptAddr); 6117 NewLoad->addArg(Intrinsic->getArg(1)); 6118 } 6119 } 6120 6121 template <typename TraitsType> 6122 void TargetX86Base<TraitsType>::lowerPhi(const InstPhi * /*Instr*/) { 6123 Func->setError("Phi found in regular 
instruction list"); 6124 } 6125 6126 template <typename TraitsType> 6127 void TargetX86Base<TraitsType>::lowerRet(const InstRet *Instr) { 6128 Variable *Reg = nullptr; 6129 if (Instr->hasRetValue()) { 6130 Operand *RetValue = legalize(Instr->getRetValue()); 6131 const Type ReturnType = RetValue->getType(); 6132 assert(isVectorType(ReturnType) || isScalarFloatingType(ReturnType) || 6133 (ReturnType == IceType_i32) || (ReturnType == IceType_i64)); 6134 Reg = moveReturnValueToRegister(RetValue, ReturnType); 6135 } 6136 // Add a ret instruction even if sandboxing is enabled, because addEpilog 6137 // explicitly looks for a ret instruction as a marker for where to insert the 6138 // frame removal instructions. 6139 _ret(Reg); 6140 // Add a fake use of esp to make sure esp stays alive for the entire 6141 // function. Otherwise post-call esp adjustments get dead-code eliminated. 6142 keepEspLiveAtExit(); 6143 } 6144 6145 inline uint32_t makePshufdMask(SizeT Index0, SizeT Index1, SizeT Index2, 6146 SizeT Index3) { 6147 const SizeT Mask = (Index0 & 0x3) | ((Index1 & 0x3) << 2) | 6148 ((Index2 & 0x3) << 4) | ((Index3 & 0x3) << 6); 6149 assert(Mask < 256); 6150 return Mask; 6151 } 6152 6153 template <typename TraitsType> 6154 Variable *TargetX86Base<TraitsType>::lowerShuffleVector_AllFromSameSrc( 6155 Operand *Src, SizeT Index0, SizeT Index1, SizeT Index2, SizeT Index3) { 6156 constexpr SizeT SrcBit = 1 << 2; 6157 assert((Index0 & SrcBit) == (Index1 & SrcBit)); 6158 assert((Index0 & SrcBit) == (Index2 & SrcBit)); 6159 assert((Index0 & SrcBit) == (Index3 & SrcBit)); 6160 (void)SrcBit; 6161 6162 const Type SrcTy = Src->getType(); 6163 auto *T = makeReg(SrcTy); 6164 auto *SrcRM = legalize(Src, Legal_Reg | Legal_Mem); 6165 auto *Mask = 6166 Ctx->getConstantInt32(makePshufdMask(Index0, Index1, Index2, Index3)); 6167 _pshufd(T, SrcRM, Mask); 6168 return T; 6169 } 6170 6171 template <typename TraitsType> 6172 Variable *TargetX86Base<TraitsType>::lowerShuffleVector_TwoFromSameSrc( 6173 Operand *Src0, SizeT Index0, SizeT Index1, Operand *Src1, SizeT Index2, 6174 SizeT Index3) { 6175 constexpr SizeT SrcBit = 1 << 2; 6176 assert((Index0 & SrcBit) == (Index1 & SrcBit) || (Index1 == IGNORE_INDEX)); 6177 assert((Index2 & SrcBit) == (Index3 & SrcBit) || (Index3 == IGNORE_INDEX)); 6178 (void)SrcBit; 6179 6180 const Type SrcTy = Src0->getType(); 6181 assert(Src1->getType() == SrcTy); 6182 auto *T = makeReg(SrcTy); 6183 auto *Src0R = legalizeToReg(Src0); 6184 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); 6185 auto *Mask = 6186 Ctx->getConstantInt32(makePshufdMask(Index0, Index1, Index2, Index3)); 6187 _movp(T, Src0R); 6188 _shufps(T, Src1RM, Mask); 6189 return T; 6190 } 6191 6192 template <typename TraitsType> 6193 Variable *TargetX86Base<TraitsType>::lowerShuffleVector_UnifyFromDifferentSrcs( 6194 Operand *Src0, SizeT Index0, Operand *Src1, SizeT Index1) { 6195 return lowerShuffleVector_TwoFromSameSrc(Src0, Index0, IGNORE_INDEX, Src1, 6196 Index1, IGNORE_INDEX); 6197 } 6198 6199 inline SizeT makeSrcSwitchMask(SizeT Index0, SizeT Index1, SizeT Index2, 6200 SizeT Index3) { 6201 constexpr SizeT SrcBit = 1 << 2; 6202 const SizeT Index0Bits = ((Index0 & SrcBit) == 0) ? 0 : (1 << 0); 6203 const SizeT Index1Bits = ((Index1 & SrcBit) == 0) ? 0 : (1 << 1); 6204 const SizeT Index2Bits = ((Index2 & SrcBit) == 0) ? 0 : (1 << 2); 6205 const SizeT Index3Bits = ((Index3 & SrcBit) == 0) ? 
0 : (1 << 3); 6206 return Index0Bits | Index1Bits | Index2Bits | Index3Bits; 6207 } 6208 6209 template <typename TraitsType> 6210 GlobalString TargetX86Base<TraitsType>::lowerShuffleVector_NewMaskName() { 6211 GlobalString FuncName = Func->getFunctionName(); 6212 const SizeT Id = PshufbMaskCount++; 6213 if (!BuildDefs::dump() || !FuncName.hasStdString()) { 6214 return GlobalString::createWithString( 6215 Ctx, 6216 "$PS" + std::to_string(FuncName.getID()) + "_" + std::to_string(Id)); 6217 } 6218 return GlobalString::createWithString( 6219 Ctx, "Pshufb$" + Func->getFunctionName() + "$" + std::to_string(Id)); 6220 } 6221 6222 template <typename TraitsType> 6223 ConstantRelocatable * 6224 TargetX86Base<TraitsType>::lowerShuffleVector_CreatePshufbMask( 6225 int8_t Idx0, int8_t Idx1, int8_t Idx2, int8_t Idx3, int8_t Idx4, 6226 int8_t Idx5, int8_t Idx6, int8_t Idx7, int8_t Idx8, int8_t Idx9, 6227 int8_t Idx10, int8_t Idx11, int8_t Idx12, int8_t Idx13, int8_t Idx14, 6228 int8_t Idx15) { 6229 static constexpr uint8_t NumElements = 16; 6230 const char Initializer[NumElements] = { 6231 Idx0, Idx1, Idx2, Idx3, Idx4, Idx5, Idx6, Idx7, 6232 Idx8, Idx9, Idx10, Idx11, Idx12, Idx13, Idx14, Idx15, 6233 }; 6234 6235 static constexpr Type V4VectorType = IceType_v4i32; 6236 const uint32_t MaskAlignment = typeWidthInBytesOnStack(V4VectorType); 6237 auto *Mask = VariableDeclaration::create(Func->getGlobalPool()); 6238 GlobalString MaskName = lowerShuffleVector_NewMaskName(); 6239 Mask->setIsConstant(true); 6240 Mask->addInitializer(VariableDeclaration::DataInitializer::create( 6241 Func->getGlobalPool(), Initializer, NumElements)); 6242 Mask->setName(MaskName); 6243 // Mask needs to be 16-byte aligned, or pshufb will seg fault. 6244 Mask->setAlignment(MaskAlignment); 6245 Func->addGlobal(Mask); 6246 6247 constexpr RelocOffsetT Offset = 0; 6248 return llvm::cast<ConstantRelocatable>(Ctx->getConstantSym(Offset, MaskName)); 6249 } 6250 6251 template <typename TraitsType> 6252 void TargetX86Base<TraitsType>::lowerShuffleVector_UsingPshufb( 6253 Variable *Dest, Operand *Src0, Operand *Src1, int8_t Idx0, int8_t Idx1, 6254 int8_t Idx2, int8_t Idx3, int8_t Idx4, int8_t Idx5, int8_t Idx6, 6255 int8_t Idx7, int8_t Idx8, int8_t Idx9, int8_t Idx10, int8_t Idx11, 6256 int8_t Idx12, int8_t Idx13, int8_t Idx14, int8_t Idx15) { 6257 const Type DestTy = Dest->getType(); 6258 static constexpr bool NotRebased = false; 6259 static constexpr Variable *NoBase = nullptr; 6260 // We use void for the memory operand instead of DestTy because using the 6261 // latter causes a validation failure: the X86 Inst layer complains that 6262 // vector mem operands could be under aligned. Thus, using void we avoid the 6263 // validation error. Note that the mask global declaration is aligned, so it 6264 // can be used as an XMM mem operand. 6265 static constexpr Type MaskType = IceType_void; 6266 #define IDX_IN_SRC(N, S) \ 6267 ((((N) & (1 << 4)) == (S << 4)) ? 
((N)&0xf) : CLEAR_ALL_BITS) 6268 auto *Mask0M = X86OperandMem::create( 6269 Func, MaskType, NoBase, 6270 lowerShuffleVector_CreatePshufbMask( 6271 IDX_IN_SRC(Idx0, 0), IDX_IN_SRC(Idx1, 0), IDX_IN_SRC(Idx2, 0), 6272 IDX_IN_SRC(Idx3, 0), IDX_IN_SRC(Idx4, 0), IDX_IN_SRC(Idx5, 0), 6273 IDX_IN_SRC(Idx6, 0), IDX_IN_SRC(Idx7, 0), IDX_IN_SRC(Idx8, 0), 6274 IDX_IN_SRC(Idx9, 0), IDX_IN_SRC(Idx10, 0), IDX_IN_SRC(Idx11, 0), 6275 IDX_IN_SRC(Idx12, 0), IDX_IN_SRC(Idx13, 0), IDX_IN_SRC(Idx14, 0), 6276 IDX_IN_SRC(Idx15, 0)), 6277 NotRebased); 6278 6279 auto *T0 = makeReg(DestTy); 6280 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 6281 _movp(T0, Src0RM); 6282 6283 _pshufb(T0, Mask0M); 6284 6285 if (Idx0 >= 16 || Idx1 >= 16 || Idx2 >= 16 || Idx3 >= 16 || Idx4 >= 16 || 6286 Idx5 >= 16 || Idx6 >= 16 || Idx7 >= 16 || Idx8 >= 16 || Idx9 >= 16 || 6287 Idx10 >= 16 || Idx11 >= 16 || Idx12 >= 16 || Idx13 >= 16 || Idx14 >= 16 || 6288 Idx15 >= 16) { 6289 auto *Mask1M = X86OperandMem::create( 6290 Func, MaskType, NoBase, 6291 lowerShuffleVector_CreatePshufbMask( 6292 IDX_IN_SRC(Idx0, 1), IDX_IN_SRC(Idx1, 1), IDX_IN_SRC(Idx2, 1), 6293 IDX_IN_SRC(Idx3, 1), IDX_IN_SRC(Idx4, 1), IDX_IN_SRC(Idx5, 1), 6294 IDX_IN_SRC(Idx6, 1), IDX_IN_SRC(Idx7, 1), IDX_IN_SRC(Idx8, 1), 6295 IDX_IN_SRC(Idx9, 1), IDX_IN_SRC(Idx10, 1), IDX_IN_SRC(Idx11, 1), 6296 IDX_IN_SRC(Idx12, 1), IDX_IN_SRC(Idx13, 1), IDX_IN_SRC(Idx14, 1), 6297 IDX_IN_SRC(Idx15, 1)), 6298 NotRebased); 6299 #undef IDX_IN_SRC 6300 auto *T1 = makeReg(DestTy); 6301 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); 6302 _movp(T1, Src1RM); 6303 _pshufb(T1, Mask1M); 6304 _por(T0, T1); 6305 } 6306 6307 _movp(Dest, T0); 6308 } 6309 6310 template <typename TraitsType> 6311 void TargetX86Base<TraitsType>::lowerShuffleVector( 6312 const InstShuffleVector *Instr) { 6313 auto *Dest = Instr->getDest(); 6314 const Type DestTy = Dest->getType(); 6315 auto *Src0 = Instr->getSrc(0); 6316 auto *Src1 = Instr->getSrc(1); 6317 const SizeT NumElements = typeNumElements(DestTy); 6318 6319 auto *T = makeReg(DestTy); 6320 6321 switch (DestTy) { 6322 default: 6323 llvm::report_fatal_error("Unexpected vector type."); 6324 case IceType_v16i1: 6325 case IceType_v16i8: { 6326 static constexpr SizeT ExpectedNumElements = 16; 6327 assert(ExpectedNumElements == Instr->getNumIndexes()); 6328 (void)ExpectedNumElements; 6329 6330 if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7)) { 6331 auto *T = makeReg(DestTy); 6332 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 6333 _movp(T, Src0RM); 6334 _punpckl(T, Src0RM); 6335 _movp(Dest, T); 6336 return; 6337 } 6338 6339 if (Instr->indexesAre(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 6340 23)) { 6341 auto *T = makeReg(DestTy); 6342 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 6343 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); 6344 _movp(T, Src0RM); 6345 _punpckl(T, Src1RM); 6346 _movp(Dest, T); 6347 return; 6348 } 6349 6350 if (Instr->indexesAre(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 6351 15, 15)) { 6352 auto *T = makeReg(DestTy); 6353 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 6354 _movp(T, Src0RM); 6355 _punpckh(T, Src0RM); 6356 _movp(Dest, T); 6357 return; 6358 } 6359 6360 if (Instr->indexesAre(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 6361 15, 31)) { 6362 auto *T = makeReg(DestTy); 6363 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 6364 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); 6365 _movp(T, Src0RM); 6366 _punpckh(T, Src1RM); 6367 _movp(Dest, T); 
6368 return; 6369 } 6370 6371 if (InstructionSet < Traits::SSE4_1) { 6372 // TODO(jpp): figure out how to lower with sse2. 6373 break; 6374 } 6375 6376 const SizeT Index0 = Instr->getIndexValue(0); 6377 const SizeT Index1 = Instr->getIndexValue(1); 6378 const SizeT Index2 = Instr->getIndexValue(2); 6379 const SizeT Index3 = Instr->getIndexValue(3); 6380 const SizeT Index4 = Instr->getIndexValue(4); 6381 const SizeT Index5 = Instr->getIndexValue(5); 6382 const SizeT Index6 = Instr->getIndexValue(6); 6383 const SizeT Index7 = Instr->getIndexValue(7); 6384 const SizeT Index8 = Instr->getIndexValue(8); 6385 const SizeT Index9 = Instr->getIndexValue(9); 6386 const SizeT Index10 = Instr->getIndexValue(10); 6387 const SizeT Index11 = Instr->getIndexValue(11); 6388 const SizeT Index12 = Instr->getIndexValue(12); 6389 const SizeT Index13 = Instr->getIndexValue(13); 6390 const SizeT Index14 = Instr->getIndexValue(14); 6391 const SizeT Index15 = Instr->getIndexValue(15); 6392 6393 lowerShuffleVector_UsingPshufb(Dest, Src0, Src1, Index0, Index1, Index2, 6394 Index3, Index4, Index5, Index6, Index7, 6395 Index8, Index9, Index10, Index11, Index12, 6396 Index13, Index14, Index15); 6397 return; 6398 } 6399 case IceType_v8i1: 6400 case IceType_v8i16: { 6401 static constexpr SizeT ExpectedNumElements = 8; 6402 assert(ExpectedNumElements == Instr->getNumIndexes()); 6403 (void)ExpectedNumElements; 6404 6405 if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3)) { 6406 auto *T = makeReg(DestTy); 6407 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 6408 _movp(T, Src0RM); 6409 _punpckl(T, Src0RM); 6410 _movp(Dest, T); 6411 return; 6412 } 6413 6414 if (Instr->indexesAre(0, 8, 1, 9, 2, 10, 3, 11)) { 6415 auto *T = makeReg(DestTy); 6416 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 6417 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); 6418 _movp(T, Src0RM); 6419 _punpckl(T, Src1RM); 6420 _movp(Dest, T); 6421 return; 6422 } 6423 6424 if (Instr->indexesAre(4, 4, 5, 5, 6, 6, 7, 7)) { 6425 auto *T = makeReg(DestTy); 6426 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 6427 _movp(T, Src0RM); 6428 _punpckh(T, Src0RM); 6429 _movp(Dest, T); 6430 return; 6431 } 6432 6433 if (Instr->indexesAre(4, 12, 5, 13, 6, 14, 7, 15)) { 6434 auto *T = makeReg(DestTy); 6435 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 6436 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); 6437 _movp(T, Src0RM); 6438 _punpckh(T, Src1RM); 6439 _movp(Dest, T); 6440 return; 6441 } 6442 6443 if (InstructionSet < Traits::SSE4_1) { 6444 // TODO(jpp): figure out how to lower with sse2. 
6445 break; 6446 } 6447 6448 const SizeT Index0 = Instr->getIndexValue(0); 6449 const SizeT Index1 = Instr->getIndexValue(1); 6450 const SizeT Index2 = Instr->getIndexValue(2); 6451 const SizeT Index3 = Instr->getIndexValue(3); 6452 const SizeT Index4 = Instr->getIndexValue(4); 6453 const SizeT Index5 = Instr->getIndexValue(5); 6454 const SizeT Index6 = Instr->getIndexValue(6); 6455 const SizeT Index7 = Instr->getIndexValue(7); 6456 6457 #define TO_BYTE_INDEX(I) ((I) << 1) 6458 lowerShuffleVector_UsingPshufb( 6459 Dest, Src0, Src1, TO_BYTE_INDEX(Index0), TO_BYTE_INDEX(Index0) + 1, 6460 TO_BYTE_INDEX(Index1), TO_BYTE_INDEX(Index1) + 1, TO_BYTE_INDEX(Index2), 6461 TO_BYTE_INDEX(Index2) + 1, TO_BYTE_INDEX(Index3), 6462 TO_BYTE_INDEX(Index3) + 1, TO_BYTE_INDEX(Index4), 6463 TO_BYTE_INDEX(Index4) + 1, TO_BYTE_INDEX(Index5), 6464 TO_BYTE_INDEX(Index5) + 1, TO_BYTE_INDEX(Index6), 6465 TO_BYTE_INDEX(Index6) + 1, TO_BYTE_INDEX(Index7), 6466 TO_BYTE_INDEX(Index7) + 1); 6467 #undef TO_BYTE_INDEX 6468 return; 6469 } 6470 case IceType_v4i1: 6471 case IceType_v4i32: 6472 case IceType_v4f32: { 6473 static constexpr SizeT ExpectedNumElements = 4; 6474 assert(ExpectedNumElements == Instr->getNumIndexes()); 6475 const SizeT Index0 = Instr->getIndexValue(0); 6476 const SizeT Index1 = Instr->getIndexValue(1); 6477 const SizeT Index2 = Instr->getIndexValue(2); 6478 const SizeT Index3 = Instr->getIndexValue(3); 6479 Variable *T = nullptr; 6480 switch (makeSrcSwitchMask(Index0, Index1, Index2, Index3)) { 6481 #define CASE_SRCS_IN(S0, S1, S2, S3) \ 6482 case (((S0) << 0) | ((S1) << 1) | ((S2) << 2) | ((S3) << 3)) 6483 CASE_SRCS_IN(0, 0, 0, 0) : { 6484 T = lowerShuffleVector_AllFromSameSrc(Src0, Index0, Index1, Index2, 6485 Index3); 6486 } 6487 break; 6488 CASE_SRCS_IN(0, 0, 0, 1) : { 6489 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index2, 6490 Src1, Index3); 6491 T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Unified, 6492 UNIFIED_INDEX_0, UNIFIED_INDEX_1); 6493 } 6494 break; 6495 CASE_SRCS_IN(0, 0, 1, 0) : { 6496 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index2, 6497 Src0, Index3); 6498 T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Unified, 6499 UNIFIED_INDEX_0, UNIFIED_INDEX_1); 6500 } 6501 break; 6502 CASE_SRCS_IN(0, 0, 1, 1) : { 6503 T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Src1, 6504 Index2, Index3); 6505 } 6506 break; 6507 CASE_SRCS_IN(0, 1, 0, 0) : { 6508 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index0, 6509 Src1, Index1); 6510 T = lowerShuffleVector_TwoFromSameSrc( 6511 Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src0, Index2, Index3); 6512 } 6513 break; 6514 CASE_SRCS_IN(0, 1, 0, 1) : { 6515 if (Index0 == 0 && (Index1 - ExpectedNumElements) == 0 && Index2 == 1 && 6516 (Index3 - ExpectedNumElements) == 1) { 6517 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); 6518 auto *Src0R = legalizeToReg(Src0); 6519 T = makeReg(DestTy); 6520 _movp(T, Src0R); 6521 _punpckl(T, Src1RM); 6522 } else if (Index0 == Index2 && Index1 == Index3) { 6523 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs( 6524 Src0, Index0, Src1, Index1); 6525 T = lowerShuffleVector_AllFromSameSrc( 6526 Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_0, 6527 UNIFIED_INDEX_1); 6528 } else { 6529 auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs( 6530 Src0, Index0, Src1, Index1); 6531 auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs( 6532 Src0, Index2, Src1, Index3); 6533 T = 
lowerShuffleVector_TwoFromSameSrc( 6534 Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1, 6535 UNIFIED_INDEX_0, UNIFIED_INDEX_1); 6536 } 6537 } 6538 break; 6539 CASE_SRCS_IN(0, 1, 1, 0) : { 6540 if (Index0 == Index3 && Index1 == Index2) { 6541 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs( 6542 Src0, Index0, Src1, Index1); 6543 T = lowerShuffleVector_AllFromSameSrc( 6544 Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_1, 6545 UNIFIED_INDEX_0); 6546 } else { 6547 auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs( 6548 Src0, Index0, Src1, Index1); 6549 auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs( 6550 Src1, Index2, Src0, Index3); 6551 T = lowerShuffleVector_TwoFromSameSrc( 6552 Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1, 6553 UNIFIED_INDEX_0, UNIFIED_INDEX_1); 6554 } 6555 } 6556 break; 6557 CASE_SRCS_IN(0, 1, 1, 1) : { 6558 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index0, 6559 Src1, Index1); 6560 T = lowerShuffleVector_TwoFromSameSrc( 6561 Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src1, Index2, Index3); 6562 } 6563 break; 6564 CASE_SRCS_IN(1, 0, 0, 0) : { 6565 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index0, 6566 Src0, Index1); 6567 T = lowerShuffleVector_TwoFromSameSrc( 6568 Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src0, Index2, Index3); 6569 } 6570 break; 6571 CASE_SRCS_IN(1, 0, 0, 1) : { 6572 if (Index0 == Index3 && Index1 == Index2) { 6573 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs( 6574 Src1, Index0, Src0, Index1); 6575 T = lowerShuffleVector_AllFromSameSrc( 6576 Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_1, 6577 UNIFIED_INDEX_0); 6578 } else { 6579 auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs( 6580 Src1, Index0, Src0, Index1); 6581 auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs( 6582 Src0, Index2, Src1, Index3); 6583 T = lowerShuffleVector_TwoFromSameSrc( 6584 Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1, 6585 UNIFIED_INDEX_0, UNIFIED_INDEX_1); 6586 } 6587 } 6588 break; 6589 CASE_SRCS_IN(1, 0, 1, 0) : { 6590 if ((Index0 - ExpectedNumElements) == 0 && Index1 == 0 && 6591 (Index2 - ExpectedNumElements) == 1 && Index3 == 1) { 6592 auto *Src1RM = legalize(Src0, Legal_Reg | Legal_Mem); 6593 auto *Src0R = legalizeToReg(Src1); 6594 T = makeReg(DestTy); 6595 _movp(T, Src0R); 6596 _punpckl(T, Src1RM); 6597 } else if (Index0 == Index2 && Index1 == Index3) { 6598 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs( 6599 Src1, Index0, Src0, Index1); 6600 T = lowerShuffleVector_AllFromSameSrc( 6601 Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_0, 6602 UNIFIED_INDEX_1); 6603 } else { 6604 auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs( 6605 Src1, Index0, Src0, Index1); 6606 auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs( 6607 Src1, Index2, Src0, Index3); 6608 T = lowerShuffleVector_TwoFromSameSrc( 6609 Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1, 6610 UNIFIED_INDEX_0, UNIFIED_INDEX_1); 6611 } 6612 } 6613 break; 6614 CASE_SRCS_IN(1, 0, 1, 1) : { 6615 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index0, 6616 Src0, Index1); 6617 T = lowerShuffleVector_TwoFromSameSrc( 6618 Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src1, Index2, Index3); 6619 } 6620 break; 6621 CASE_SRCS_IN(1, 1, 0, 0) : { 6622 T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Src0, 6623 Index2, Index3); 6624 } 6625 break; 6626 CASE_SRCS_IN(1, 1, 0, 1) : { 6627 auto *Unified = 
lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index2, 6628 Src1, Index3); 6629 T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Unified, 6630 UNIFIED_INDEX_0, UNIFIED_INDEX_1); 6631 } 6632 break; 6633 CASE_SRCS_IN(1, 1, 1, 0) : { 6634 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index2, 6635 Src0, Index3); 6636 T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Unified, 6637 UNIFIED_INDEX_0, UNIFIED_INDEX_1); 6638 } 6639 break; 6640 CASE_SRCS_IN(1, 1, 1, 1) : { 6641 T = lowerShuffleVector_AllFromSameSrc(Src1, Index0, Index1, Index2, 6642 Index3); 6643 } 6644 break; 6645 #undef CASE_SRCS_IN 6646 } 6647 6648 assert(T != nullptr); 6649 assert(T->getType() == DestTy); 6650 _movp(Dest, T); 6651 return; 6652 } break; 6653 } 6654 6655 // Unoptimized shuffle. Perform a series of inserts and extracts. 6656 Context.insert<InstFakeDef>(T); 6657 const Type ElementType = typeElementType(DestTy); 6658 for (SizeT I = 0; I < Instr->getNumIndexes(); ++I) { 6659 auto *Index = Instr->getIndex(I); 6660 const SizeT Elem = Index->getValue(); 6661 auto *ExtElmt = makeReg(ElementType); 6662 if (Elem < NumElements) { 6663 lowerExtractElement( 6664 InstExtractElement::create(Func, ExtElmt, Src0, Index)); 6665 } else { 6666 lowerExtractElement(InstExtractElement::create( 6667 Func, ExtElmt, Src1, Ctx->getConstantInt32(Elem - NumElements))); 6668 } 6669 auto *NewT = makeReg(DestTy); 6670 lowerInsertElement(InstInsertElement::create(Func, NewT, T, ExtElmt, 6671 Ctx->getConstantInt32(I))); 6672 T = NewT; 6673 } 6674 _movp(Dest, T); 6675 } 6676 6677 template <typename TraitsType> 6678 void TargetX86Base<TraitsType>::lowerSelect(const InstSelect *Select) { 6679 Variable *Dest = Select->getDest(); 6680 6681 Operand *Condition = Select->getCondition(); 6682 // Handle folding opportunities. 6683 if (const Inst *Producer = FoldingInfo.getProducerFor(Condition)) { 6684 assert(Producer->isDeleted()); 6685 switch (BoolFolding<Traits>::getProducerKind(Producer)) { 6686 default: 6687 break; 6688 case BoolFolding<Traits>::PK_Icmp32: 6689 case BoolFolding<Traits>::PK_Icmp64: { 6690 lowerIcmpAndConsumer(llvm::cast<InstIcmp>(Producer), Select); 6691 return; 6692 } 6693 case BoolFolding<Traits>::PK_Fcmp: { 6694 lowerFcmpAndConsumer(llvm::cast<InstFcmp>(Producer), Select); 6695 return; 6696 } 6697 } 6698 } 6699 6700 if (isVectorType(Dest->getType())) { 6701 lowerSelectVector(Select); 6702 return; 6703 } 6704 6705 Operand *CmpResult = legalize(Condition, Legal_Reg | Legal_Mem); 6706 Operand *Zero = Ctx->getConstantZero(IceType_i32); 6707 _cmp(CmpResult, Zero); 6708 Operand *SrcT = Select->getTrueOperand(); 6709 Operand *SrcF = Select->getFalseOperand(); 6710 const BrCond Cond = Traits::Cond::Br_ne; 6711 lowerSelectMove(Dest, Cond, SrcT, SrcF); 6712 } 6713 6714 template <typename TraitsType> 6715 void TargetX86Base<TraitsType>::lowerSelectMove(Variable *Dest, BrCond Cond, 6716 Operand *SrcT, Operand *SrcF) { 6717 Type DestTy = Dest->getType(); 6718 if (typeWidthInBytes(DestTy) == 1 || isFloatingType(DestTy)) { 6719 // The cmov instruction doesn't allow 8-bit or FP operands, so we need 6720 // explicit control flow. 
6721 // d=cmp e,f; a=d?b:c ==> cmp e,f; a=b; jne L1; a=c; L1: 6722 auto *Label = InstX86Label::create(Func, this); 6723 SrcT = legalize(SrcT, Legal_Reg | Legal_Imm); 6724 _mov(Dest, SrcT); 6725 _br(Cond, Label); 6726 SrcF = legalize(SrcF, Legal_Reg | Legal_Imm); 6727 _redefined(_mov(Dest, SrcF)); 6728 Context.insert(Label); 6729 return; 6730 } 6731 // mov t, SrcF; cmov_cond t, SrcT; mov dest, t 6732 // But if SrcT is immediate, we might be able to do better, as the cmov 6733 // instruction doesn't allow an immediate operand: 6734 // mov t, SrcT; cmov_!cond t, SrcF; mov dest, t 6735 if (llvm::isa<Constant>(SrcT) && !llvm::isa<Constant>(SrcF)) { 6736 std::swap(SrcT, SrcF); 6737 Cond = InstImpl<TraitsType>::InstX86Base::getOppositeCondition(Cond); 6738 } 6739 if (!Traits::Is64Bit && DestTy == IceType_i64) { 6740 SrcT = legalizeUndef(SrcT); 6741 SrcF = legalizeUndef(SrcF); 6742 // Set the low portion. 6743 auto *DestLo = llvm::cast<Variable>(loOperand(Dest)); 6744 lowerSelectIntMove(DestLo, Cond, loOperand(SrcT), loOperand(SrcF)); 6745 // Set the high portion. 6746 auto *DestHi = llvm::cast<Variable>(hiOperand(Dest)); 6747 lowerSelectIntMove(DestHi, Cond, hiOperand(SrcT), hiOperand(SrcF)); 6748 return; 6749 } 6750 6751 assert(DestTy == IceType_i16 || DestTy == IceType_i32 || 6752 (Traits::Is64Bit && DestTy == IceType_i64)); 6753 lowerSelectIntMove(Dest, Cond, SrcT, SrcF); 6754 } 6755 6756 template <typename TraitsType> 6757 void TargetX86Base<TraitsType>::lowerSelectIntMove(Variable *Dest, BrCond Cond, 6758 Operand *SrcT, 6759 Operand *SrcF) { 6760 Variable *T = nullptr; 6761 SrcF = legalize(SrcF); 6762 _mov(T, SrcF); 6763 SrcT = legalize(SrcT, Legal_Reg | Legal_Mem); 6764 _cmov(T, SrcT, Cond); 6765 _mov(Dest, T); 6766 } 6767 6768 template <typename TraitsType> 6769 void TargetX86Base<TraitsType>::lowerMove(Variable *Dest, Operand *Src, 6770 bool IsRedefinition) { 6771 assert(Dest->getType() == Src->getType()); 6772 assert(!Dest->isRematerializable()); 6773 if (!Traits::Is64Bit && Dest->getType() == IceType_i64) { 6774 Src = legalize(Src); 6775 Operand *SrcLo = loOperand(Src); 6776 Operand *SrcHi = hiOperand(Src); 6777 auto *DestLo = llvm::cast<Variable>(loOperand(Dest)); 6778 auto *DestHi = llvm::cast<Variable>(hiOperand(Dest)); 6779 Variable *T_Lo = nullptr, *T_Hi = nullptr; 6780 _mov(T_Lo, SrcLo); 6781 _redefined(_mov(DestLo, T_Lo), IsRedefinition); 6782 _mov(T_Hi, SrcHi); 6783 _redefined(_mov(DestHi, T_Hi), IsRedefinition); 6784 } else { 6785 Operand *SrcLegal; 6786 if (Dest->hasReg()) { 6787 // If Dest already has a physical register, then only basic legalization 6788 // is needed, as the source operand can be a register, immediate, or 6789 // memory. 6790 SrcLegal = legalize(Src, Legal_Reg, Dest->getRegNum()); 6791 } else { 6792 // If Dest could be a stack operand, then RI must be a physical register 6793 // or a scalar integer immediate. 
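// (x86 mov has no memory-to-memory form, so restricting the source to
// Legal_Reg | Legal_Imm here guarantees the final mov has at most one memory
// operand if Dest is later assigned a stack slot.)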
6794 SrcLegal = legalize(Src, Legal_Reg | Legal_Imm); 6795 } 6796 if (isVectorType(Dest->getType())) { 6797 _redefined(_movp(Dest, SrcLegal), IsRedefinition); 6798 } else { 6799 _redefined(_mov(Dest, SrcLegal), IsRedefinition); 6800 } 6801 } 6802 } 6803 6804 template <typename TraitsType> 6805 bool TargetX86Base<TraitsType>::lowerOptimizeFcmpSelect( 6806 const InstFcmp *Fcmp, const InstSelect *Select) { 6807 Operand *CmpSrc0 = Fcmp->getSrc(0); 6808 Operand *CmpSrc1 = Fcmp->getSrc(1); 6809 Operand *SelectSrcT = Select->getTrueOperand(); 6810 Operand *SelectSrcF = Select->getFalseOperand(); 6811 Variable *SelectDest = Select->getDest(); 6812 6813 // TODO(capn): also handle swapped compare/select operand order. 6814 if (CmpSrc0 != SelectSrcT || CmpSrc1 != SelectSrcF) 6815 return false; 6816 6817 // TODO(sehr, stichnot): fcmp/select patterns (e.g., minsd/maxss) go here. 6818 InstFcmp::FCond Condition = Fcmp->getCondition(); 6819 switch (Condition) { 6820 default: 6821 return false; 6822 case InstFcmp::True: 6823 break; 6824 case InstFcmp::False: 6825 break; 6826 case InstFcmp::Ogt: { 6827 Variable *T = makeReg(SelectDest->getType()); 6828 if (isScalarFloatingType(SelectSrcT->getType())) { 6829 _mov(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem)); 6830 _maxss(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem)); 6831 _mov(SelectDest, T); 6832 } else { 6833 _movp(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem)); 6834 _maxps(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem)); 6835 _movp(SelectDest, T); 6836 } 6837 return true; 6838 } break; 6839 case InstFcmp::Olt: { 6840 Variable *T = makeReg(SelectSrcT->getType()); 6841 if (isScalarFloatingType(SelectSrcT->getType())) { 6842 _mov(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem)); 6843 _minss(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem)); 6844 _mov(SelectDest, T); 6845 } else { 6846 _movp(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem)); 6847 _minps(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem)); 6848 _movp(SelectDest, T); 6849 } 6850 return true; 6851 } break; 6852 } 6853 return false; 6854 } 6855 6856 template <typename TraitsType> 6857 void TargetX86Base<TraitsType>::lowerIcmp(const InstIcmp *Icmp) { 6858 Variable *Dest = Icmp->getDest(); 6859 if (isVectorType(Dest->getType())) { 6860 lowerIcmpVector(Icmp); 6861 } else { 6862 constexpr Inst *Consumer = nullptr; 6863 lowerIcmpAndConsumer(Icmp, Consumer); 6864 } 6865 } 6866 6867 template <typename TraitsType> 6868 void TargetX86Base<TraitsType>::lowerSelectVector(const InstSelect *Instr) { 6869 Variable *Dest = Instr->getDest(); 6870 Type DestTy = Dest->getType(); 6871 Operand *SrcT = Instr->getTrueOperand(); 6872 Operand *SrcF = Instr->getFalseOperand(); 6873 Operand *Condition = Instr->getCondition(); 6874 6875 if (!isVectorType(DestTy)) 6876 llvm::report_fatal_error("Expected a vector select"); 6877 6878 Type SrcTy = SrcT->getType(); 6879 Variable *T = makeReg(SrcTy); 6880 Operand *SrcTRM = legalize(SrcT, Legal_Reg | Legal_Mem); 6881 Operand *SrcFRM = legalize(SrcF, Legal_Reg | Legal_Mem); 6882 6883 if (InstructionSet >= Traits::SSE4_1) { 6884 // TODO(wala): If the condition operand is a constant, use blendps or 6885 // pblendw. 6886 // 6887 // Use blendvps or pblendvb to implement select. 
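// Sketch of the sequence produced below for the 4-element case, written in
// terms of this file's pseudo-ops (the exact mov variants are chosen later):
//   movp     xmm0, Cond
//   psll     xmm0, 31       ; move each lane's i1 bit into the sign bit
//   movp     T, SrcF
//   blendvps T, SrcT, xmm0  ; lanes whose sign bit is set take SrcT
//   movp     Dest, T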
6888 if (SrcTy == IceType_v4i1 || SrcTy == IceType_v4i32 || 6889 SrcTy == IceType_v4f32) { 6890 Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem); 6891 Variable *xmm0 = makeReg(IceType_v4i32, Traits::RegisterSet::Reg_xmm0); 6892 _movp(xmm0, ConditionRM); 6893 _psll(xmm0, Ctx->getConstantInt8(31)); 6894 _movp(T, SrcFRM); 6895 _blendvps(T, SrcTRM, xmm0); 6896 _movp(Dest, T); 6897 } else { 6898 assert(typeNumElements(SrcTy) == 8 || typeNumElements(SrcTy) == 16); 6899 Type SignExtTy = 6900 Condition->getType() == IceType_v8i1 ? IceType_v8i16 : IceType_v16i8; 6901 Variable *xmm0 = makeReg(SignExtTy, Traits::RegisterSet::Reg_xmm0); 6902 lowerCast(InstCast::create(Func, InstCast::Sext, xmm0, Condition)); 6903 _movp(T, SrcFRM); 6904 _pblendvb(T, SrcTRM, xmm0); 6905 _movp(Dest, T); 6906 } 6907 return; 6908 } 6909 // Lower select without Traits::SSE4.1: 6910 // a=d?b:c ==> 6911 // if elementtype(d) != i1: 6912 // d=sext(d); 6913 // a=(b&d)|(c&~d); 6914 Variable *T2 = makeReg(SrcTy); 6915 // Sign extend the condition operand if applicable. 6916 if (SrcTy == IceType_v4f32) { 6917 // The sext operation takes only integer arguments. 6918 Variable *T3 = Func->makeVariable(IceType_v4i32); 6919 lowerCast(InstCast::create(Func, InstCast::Sext, T3, Condition)); 6920 _movp(T, T3); 6921 } else if (typeElementType(SrcTy) != IceType_i1) { 6922 lowerCast(InstCast::create(Func, InstCast::Sext, T, Condition)); 6923 } else { 6924 Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem); 6925 _movp(T, ConditionRM); 6926 } 6927 _movp(T2, T); 6928 _pand(T, SrcTRM); 6929 _pandn(T2, SrcFRM); 6930 _por(T, T2); 6931 _movp(Dest, T); 6932 6933 return; 6934 } 6935 6936 template <typename TraitsType> 6937 void TargetX86Base<TraitsType>::lowerStore(const InstStore *Instr) { 6938 Operand *Value = Instr->getData(); 6939 Operand *Addr = Instr->getStoreAddress(); 6940 X86OperandMem *NewAddr = formMemoryOperand(Addr, Value->getType()); 6941 doMockBoundsCheck(NewAddr); 6942 Type Ty = NewAddr->getType(); 6943 6944 if (!Traits::Is64Bit && Ty == IceType_i64) { 6945 Value = legalizeUndef(Value); 6946 Operand *ValueHi = legalize(hiOperand(Value), Legal_Reg | Legal_Imm); 6947 _store(ValueHi, llvm::cast<X86OperandMem>(hiOperand(NewAddr))); 6948 Operand *ValueLo = legalize(loOperand(Value), Legal_Reg | Legal_Imm); 6949 _store(ValueLo, llvm::cast<X86OperandMem>(loOperand(NewAddr))); 6950 } else if (isVectorType(Ty)) { 6951 _storep(legalizeToReg(Value), NewAddr); 6952 } else { 6953 Value = legalize(Value, Legal_Reg | Legal_Imm); 6954 _store(Value, NewAddr); 6955 } 6956 } 6957 6958 template <typename TraitsType> 6959 void TargetX86Base<TraitsType>::doAddressOptStore() { 6960 auto *Instr = llvm::cast<InstStore>(Context.getCur()); 6961 Operand *Addr = Instr->getStoreAddress(); 6962 Operand *Data = Instr->getData(); 6963 if (auto *OptAddr = computeAddressOpt(Instr, Data->getType(), Addr)) { 6964 Instr->setDeleted(); 6965 auto *NewStore = Context.insert<InstStore>(Data, OptAddr); 6966 if (Instr->getDest()) 6967 NewStore->setRmwBeacon(Instr->getRmwBeacon()); 6968 } 6969 } 6970 6971 template <typename TraitsType> 6972 void TargetX86Base<TraitsType>::doAddressOptStoreSubVector() { 6973 auto *Intrinsic = llvm::cast<InstIntrinsic>(Context.getCur()); 6974 Operand *Addr = Intrinsic->getArg(1); 6975 Operand *Data = Intrinsic->getArg(0); 6976 if (auto *OptAddr = computeAddressOpt(Intrinsic, Data->getType(), Addr)) { 6977 Intrinsic->setDeleted(); 6978 const Ice::Intrinsics::IntrinsicInfo Info = { 6979 Ice::Intrinsics::StoreSubVector, 
Ice::Intrinsics::SideEffects_T, 6980 Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_T}; 6981 auto *NewStore = Context.insert<InstIntrinsic>(3, nullptr, Info); 6982 NewStore->addArg(Data); 6983 NewStore->addArg(OptAddr); 6984 NewStore->addArg(Intrinsic->getArg(2)); 6985 } 6986 } 6987 6988 template <typename TraitsType> 6989 Operand *TargetX86Base<TraitsType>::lowerCmpRange(Operand *Comparison, 6990 uint64_t Min, uint64_t Max) { 6991 // TODO(ascull): 64-bit should not reach here but only because it is not 6992 // implemented yet. This should be able to handle the 64-bit case. 6993 assert(Traits::Is64Bit || Comparison->getType() != IceType_i64); 6994 // Subtracting 0 is a nop so don't do it 6995 if (Min != 0) { 6996 // Avoid clobbering the comparison by copying it 6997 Variable *T = nullptr; 6998 _mov(T, Comparison); 6999 _sub(T, Ctx->getConstantInt32(Min)); 7000 Comparison = T; 7001 } 7002 7003 _cmp(Comparison, Ctx->getConstantInt32(Max - Min)); 7004 7005 return Comparison; 7006 } 7007 7008 template <typename TraitsType> 7009 void TargetX86Base<TraitsType>::lowerCaseCluster(const CaseCluster &Case, 7010 Operand *Comparison, 7011 bool DoneCmp, 7012 CfgNode *DefaultTarget) { 7013 switch (Case.getKind()) { 7014 case CaseCluster::JumpTable: { 7015 InstX86Label *SkipJumpTable; 7016 7017 Operand *RangeIndex = 7018 lowerCmpRange(Comparison, Case.getLow(), Case.getHigh()); 7019 if (DefaultTarget == nullptr) { 7020 // Skip over jump table logic if comparison not in range and no default 7021 SkipJumpTable = InstX86Label::create(Func, this); 7022 _br(Traits::Cond::Br_a, SkipJumpTable); 7023 } else { 7024 _br(Traits::Cond::Br_a, DefaultTarget); 7025 } 7026 7027 InstJumpTable *JumpTable = Case.getJumpTable(); 7028 Context.insert(JumpTable); 7029 7030 // Make sure the index is a register of the same width as the base 7031 Variable *Index; 7032 const Type PointerType = getPointerType(); 7033 if (RangeIndex->getType() != PointerType) { 7034 Index = makeReg(PointerType); 7035 if (RangeIndex->getType() == IceType_i64) { 7036 assert(Traits::Is64Bit); 7037 _mov(Index, RangeIndex); // trunc 7038 } else { 7039 Operand *RangeIndexRM = legalize(RangeIndex, Legal_Reg | Legal_Mem); 7040 _movzx(Index, RangeIndexRM); 7041 } 7042 } else { 7043 Index = legalizeToReg(RangeIndex); 7044 } 7045 7046 constexpr RelocOffsetT RelocOffset = 0; 7047 constexpr Variable *NoBase = nullptr; 7048 constexpr Constant *NoOffset = nullptr; 7049 auto JTName = GlobalString::createWithString(Ctx, JumpTable->getName()); 7050 Constant *Offset = Ctx->getConstantSym(RelocOffset, JTName); 7051 uint16_t Shift = typeWidthInBytesLog2(PointerType); 7052 constexpr auto Segment = X86OperandMem::SegmentRegisters::DefaultSegment; 7053 7054 Variable *Target = nullptr; 7055 if (Traits::Is64Bit && NeedSandboxing) { 7056 assert(Index != nullptr && Index->getType() == IceType_i32); 7057 } 7058 7059 if (PointerType == IceType_i32) { 7060 _mov(Target, X86OperandMem::create(Func, PointerType, NoBase, Offset, 7061 Index, Shift, Segment)); 7062 } else { 7063 auto *Base = makeReg(IceType_i64); 7064 _lea(Base, X86OperandMem::create(Func, IceType_void, NoBase, Offset)); 7065 _mov(Target, X86OperandMem::create(Func, PointerType, Base, NoOffset, 7066 Index, Shift, Segment)); 7067 } 7068 7069 lowerIndirectJump(Target); 7070 7071 if (DefaultTarget == nullptr) 7072 Context.insert(SkipJumpTable); 7073 return; 7074 } 7075 case CaseCluster::Range: { 7076 if (Case.isUnitRange()) { 7077 // Single item 7078 if (!DoneCmp) { 7079 Constant *Value = 
Ctx->getConstantInt32(Case.getLow()); 7080 _cmp(Comparison, Value); 7081 } 7082 _br(Traits::Cond::Br_e, Case.getTarget()); 7083 } else if (DoneCmp && Case.isPairRange()) { 7084 // Range of two items with first item already compared against 7085 _br(Traits::Cond::Br_e, Case.getTarget()); 7086 Constant *Value = Ctx->getConstantInt32(Case.getHigh()); 7087 _cmp(Comparison, Value); 7088 _br(Traits::Cond::Br_e, Case.getTarget()); 7089 } else { 7090 // Range 7091 lowerCmpRange(Comparison, Case.getLow(), Case.getHigh()); 7092 _br(Traits::Cond::Br_be, Case.getTarget()); 7093 } 7094 if (DefaultTarget != nullptr) 7095 _br(DefaultTarget); 7096 return; 7097 } 7098 } 7099 } 7100 7101 template <typename TraitsType> 7102 void TargetX86Base<TraitsType>::lowerSwitch(const InstSwitch *Instr) { 7103 // Group cases together and navigate through them with a binary search 7104 CaseClusterArray CaseClusters = CaseCluster::clusterizeSwitch(Func, Instr); 7105 Operand *Src0 = Instr->getComparison(); 7106 CfgNode *DefaultTarget = Instr->getLabelDefault(); 7107 7108 assert(CaseClusters.size() != 0); // Should always be at least one 7109 7110 if (!Traits::Is64Bit && Src0->getType() == IceType_i64) { 7111 Src0 = legalize(Src0); // get Base/Index into physical registers 7112 Operand *Src0Lo = loOperand(Src0); 7113 Operand *Src0Hi = hiOperand(Src0); 7114 if (CaseClusters.back().getHigh() > UINT32_MAX) { 7115 // TODO(ascull): handle 64-bit case properly (currently naive version) 7116 // This might be handled by a higher level lowering of switches. 7117 SizeT NumCases = Instr->getNumCases(); 7118 if (NumCases >= 2) { 7119 Src0Lo = legalizeToReg(Src0Lo); 7120 Src0Hi = legalizeToReg(Src0Hi); 7121 } else { 7122 Src0Lo = legalize(Src0Lo, Legal_Reg | Legal_Mem); 7123 Src0Hi = legalize(Src0Hi, Legal_Reg | Legal_Mem); 7124 } 7125 for (SizeT I = 0; I < NumCases; ++I) { 7126 Constant *ValueLo = Ctx->getConstantInt32(Instr->getValue(I)); 7127 Constant *ValueHi = Ctx->getConstantInt32(Instr->getValue(I) >> 32); 7128 InstX86Label *Label = InstX86Label::create(Func, this); 7129 _cmp(Src0Lo, ValueLo); 7130 _br(Traits::Cond::Br_ne, Label); 7131 _cmp(Src0Hi, ValueHi); 7132 _br(Traits::Cond::Br_e, Instr->getLabel(I)); 7133 Context.insert(Label); 7134 } 7135 _br(Instr->getLabelDefault()); 7136 return; 7137 } else { 7138 // All the case values are 32-bit, so just check that the operand is too 7139 // and then fall through to the 32-bit implementation. This is a common case. 7140 Src0Hi = legalize(Src0Hi, Legal_Reg | Legal_Mem); 7141 Constant *Zero = Ctx->getConstantInt32(0); 7142 _cmp(Src0Hi, Zero); 7143 _br(Traits::Cond::Br_ne, DefaultTarget); 7144 Src0 = Src0Lo; 7145 } 7146 } 7147 7148 // 32-bit lowering 7149 7150 if (CaseClusters.size() == 1) { 7151 // Jump straight to default if needed. Currently a common case as jump 7152 // tables occur on their own.
7153 constexpr bool DoneCmp = false; 7154 lowerCaseCluster(CaseClusters.front(), Src0, DoneCmp, DefaultTarget); 7155 return; 7156 } 7157 7158 // Going to be used multiple times so get it in a register early 7159 Variable *Comparison = legalizeToReg(Src0); 7160 7161 // A span covers a contiguous range of clusters 7162 struct SearchSpan { 7163 SearchSpan(SizeT Begin, SizeT Size, InstX86Label *Label) 7164 : Begin(Begin), Size(Size), Label(Label) {} 7165 7166 SizeT Begin; 7167 SizeT Size; 7168 InstX86Label *Label; 7169 }; 7170 // The stack will only grow to the height of the tree so 12 should be plenty 7171 std::stack<SearchSpan, llvm::SmallVector<SearchSpan, 12>> SearchSpanStack; 7172 SearchSpanStack.emplace(0, CaseClusters.size(), nullptr); 7173 bool DoneCmp = false; 7174 7175 while (!SearchSpanStack.empty()) { 7176 SearchSpan Span = SearchSpanStack.top(); 7177 SearchSpanStack.pop(); 7178 7179 if (Span.Label != nullptr) 7180 Context.insert(Span.Label); 7181 7182 switch (Span.Size) { 7183 case 0: 7184 llvm::report_fatal_error("Invalid SearchSpan size"); 7185 break; 7186 7187 case 1: 7188 lowerCaseCluster(CaseClusters[Span.Begin], Comparison, DoneCmp, 7189 SearchSpanStack.empty() ? nullptr : DefaultTarget); 7190 DoneCmp = false; 7191 break; 7192 7193 case 2: { 7194 const CaseCluster *CaseA = &CaseClusters[Span.Begin]; 7195 const CaseCluster *CaseB = &CaseClusters[Span.Begin + 1]; 7196 7197 // Placing a range last may allow register clobbering during the range 7198 // test. That means there is no need to clone the register. If it is a 7199 // unit range the comparison may have already been done in the binary 7200 // search (DoneCmp) and so it should be placed first. If this is a range 7201 // of two items and the comparison with the low value has already been 7202 // done, comparing with the other element is cheaper than a range test. 7203 // If the low end of the range is zero then there is no subtraction and 7204 // nothing to be gained. 7205 if (!CaseA->isUnitRange() && 7206 !(CaseA->getLow() == 0 || (DoneCmp && CaseA->isPairRange()))) { 7207 std::swap(CaseA, CaseB); 7208 DoneCmp = false; 7209 } 7210 7211 lowerCaseCluster(*CaseA, Comparison, DoneCmp); 7212 DoneCmp = false; 7213 lowerCaseCluster(*CaseB, Comparison, DoneCmp, 7214 SearchSpanStack.empty() ? nullptr : DefaultTarget); 7215 } break; 7216 7217 default: 7218 // Pick the middle item and branch b or ae 7219 SizeT PivotIndex = Span.Begin + (Span.Size / 2); 7220 const CaseCluster &Pivot = CaseClusters[PivotIndex]; 7221 Constant *Value = Ctx->getConstantInt32(Pivot.getLow()); 7222 InstX86Label *Label = InstX86Label::create(Func, this); 7223 _cmp(Comparison, Value); 7224 // TODO(ascull): does it always have to be far? 7225 _br(Traits::Cond::Br_b, Label, InstX86Br::Far); 7226 // Lower the left and (pivot+right) sides, falling through to the right 7227 SearchSpanStack.emplace(Span.Begin, Span.Size / 2, Label); 7228 SearchSpanStack.emplace(PivotIndex, Span.Size - (Span.Size / 2), nullptr); 7229 DoneCmp = true; 7230 break; 7231 } 7232 } 7233 7234 _br(DefaultTarget); 7235 } 7236 7237 /// The following pattern occurs often in lowered C and C++ code: 7238 /// 7239 /// %cmp = fcmp/icmp pred <n x ty> %src0, %src1 7240 /// %cmp.ext = sext <n x i1> %cmp to <n x ty> 7241 /// 7242 /// We can eliminate the sext operation by copying the result of pcmpeqd, 7243 /// pcmpgtd, or cmpps (which produce sign extended results) to the result of the 7244 /// sext operation.
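/// For example, given
///   %cmp = icmp sgt <4 x i32> %a, %b
///   %cmp.ext = sext <4 x i1> %cmp to <4 x i32>
/// the pcmpgtd emitted for %cmp already leaves each lane all-ones or all-zeros,
/// so the sext reduces to a plain register copy (movp) into %cmp.ext's register.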
7245 template <typename TraitsType> 7246 void TargetX86Base<TraitsType>::eliminateNextVectorSextInstruction( 7247 Variable *SignExtendedResult) { 7248 if (auto *NextCast = 7249 llvm::dyn_cast_or_null<InstCast>(Context.getNextInst())) { 7250 if (NextCast->getCastKind() == InstCast::Sext && 7251 NextCast->getSrc(0) == SignExtendedResult) { 7252 NextCast->setDeleted(); 7253 _movp(NextCast->getDest(), legalizeToReg(SignExtendedResult)); 7254 // Skip over the instruction. 7255 Context.advanceNext(); 7256 } 7257 } 7258 } 7259 7260 template <typename TraitsType> 7261 void TargetX86Base<TraitsType>::lowerUnreachable( 7262 const InstUnreachable * /*Instr*/) { 7263 _ud2(); 7264 // Add a fake use of esp to make sure esp adjustments after the unreachable 7265 // do not get dead-code eliminated. 7266 keepEspLiveAtExit(); 7267 } 7268 7269 template <typename TraitsType> 7270 void TargetX86Base<TraitsType>::lowerBreakpoint( 7271 const InstBreakpoint * /*Instr*/) { 7272 _int3(); 7273 } 7274 7275 template <typename TraitsType> 7276 void TargetX86Base<TraitsType>::lowerRMW(const InstX86FakeRMW *RMW) { 7277 // If the beacon variable's live range does not end in this instruction, then 7278 // it must end in the modified Store instruction that follows. This means 7279 // that the original Store instruction is still there, either because the 7280 // value being stored is used beyond the Store instruction, or because dead 7281 // code elimination did not happen. In either case, we cancel RMW lowering 7282 // (and the caller deletes the RMW instruction). 7283 if (!RMW->isLastUse(RMW->getBeacon())) 7284 return; 7285 Operand *Src = RMW->getData(); 7286 Type Ty = Src->getType(); 7287 X86OperandMem *Addr = formMemoryOperand(RMW->getAddr(), Ty); 7288 doMockBoundsCheck(Addr); 7289 if (!Traits::Is64Bit && Ty == IceType_i64) { 7290 Src = legalizeUndef(Src); 7291 Operand *SrcLo = legalize(loOperand(Src), Legal_Reg | Legal_Imm); 7292 Operand *SrcHi = legalize(hiOperand(Src), Legal_Reg | Legal_Imm); 7293 auto *AddrLo = llvm::cast<X86OperandMem>(loOperand(Addr)); 7294 auto *AddrHi = llvm::cast<X86OperandMem>(hiOperand(Addr)); 7295 switch (RMW->getOp()) { 7296 default: 7297 // TODO(stichnot): Implement other arithmetic operators. 7298 break; 7299 case InstArithmetic::Add: 7300 _add_rmw(AddrLo, SrcLo); 7301 _adc_rmw(AddrHi, SrcHi); 7302 return; 7303 case InstArithmetic::Sub: 7304 _sub_rmw(AddrLo, SrcLo); 7305 _sbb_rmw(AddrHi, SrcHi); 7306 return; 7307 case InstArithmetic::And: 7308 _and_rmw(AddrLo, SrcLo); 7309 _and_rmw(AddrHi, SrcHi); 7310 return; 7311 case InstArithmetic::Or: 7312 _or_rmw(AddrLo, SrcLo); 7313 _or_rmw(AddrHi, SrcHi); 7314 return; 7315 case InstArithmetic::Xor: 7316 _xor_rmw(AddrLo, SrcLo); 7317 _xor_rmw(AddrHi, SrcHi); 7318 return; 7319 } 7320 } else { 7321 // x86-32: i8, i16, i32 7322 // x86-64: i8, i16, i32, i64 7323 switch (RMW->getOp()) { 7324 default: 7325 // TODO(stichnot): Implement other arithmetic operators. 
7326 break; 7327 case InstArithmetic::Add: 7328 Src = legalize(Src, Legal_Reg | Legal_Imm); 7329 _add_rmw(Addr, Src); 7330 return; 7331 case InstArithmetic::Sub: 7332 Src = legalize(Src, Legal_Reg | Legal_Imm); 7333 _sub_rmw(Addr, Src); 7334 return; 7335 case InstArithmetic::And: 7336 Src = legalize(Src, Legal_Reg | Legal_Imm); 7337 _and_rmw(Addr, Src); 7338 return; 7339 case InstArithmetic::Or: 7340 Src = legalize(Src, Legal_Reg | Legal_Imm); 7341 _or_rmw(Addr, Src); 7342 return; 7343 case InstArithmetic::Xor: 7344 Src = legalize(Src, Legal_Reg | Legal_Imm); 7345 _xor_rmw(Addr, Src); 7346 return; 7347 } 7348 } 7349 llvm::report_fatal_error("Couldn't lower RMW instruction"); 7350 } 7351 7352 template <typename TraitsType> 7353 void TargetX86Base<TraitsType>::lowerOther(const Inst *Instr) { 7354 if (const auto *RMW = llvm::dyn_cast<InstX86FakeRMW>(Instr)) { 7355 lowerRMW(RMW); 7356 } else { 7357 TargetLowering::lowerOther(Instr); 7358 } 7359 } 7360 7361 /// Turn an i64 Phi instruction into a pair of i32 Phi instructions, to preserve 7362 /// integrity of liveness analysis. Undef values are also turned into zeroes, 7363 /// since loOperand() and hiOperand() don't expect Undef input. Also, in 7364 /// Non-SFI mode, add a FakeUse(RebasePtr) for every pooled constant operand. 7365 template <typename TraitsType> void TargetX86Base<TraitsType>::prelowerPhis() { 7366 if (getFlags().getUseNonsfi()) { 7367 assert(RebasePtr); 7368 CfgNode *Node = Context.getNode(); 7369 uint32_t RebasePtrUseCount = 0; 7370 for (Inst &I : Node->getPhis()) { 7371 auto *Phi = llvm::dyn_cast<InstPhi>(&I); 7372 if (Phi->isDeleted()) 7373 continue; 7374 for (SizeT I = 0; I < Phi->getSrcSize(); ++I) { 7375 Operand *Src = Phi->getSrc(I); 7376 // TODO(stichnot): This over-counts for +0.0, and under-counts for other 7377 // kinds of pooling. 7378 if (llvm::isa<ConstantRelocatable>(Src) || 7379 llvm::isa<ConstantFloat>(Src) || llvm::isa<ConstantDouble>(Src)) { 7380 ++RebasePtrUseCount; 7381 } 7382 } 7383 } 7384 if (RebasePtrUseCount) { 7385 Node->getInsts().push_front(InstFakeUse::create(Func, RebasePtr)); 7386 } 7387 } 7388 if (Traits::Is64Bit) { 7389 // On x86-64 we don't need to prelower phis -- the architecture can handle 7390 // 64-bit integer natively. 
7391 return; 7392 } 7393 7394 PhiLowering::prelowerPhis32Bit<TargetX86Base<TraitsType>>( 7395 this, Context.getNode(), Func); 7396 } 7397 7398 template <typename TraitsType> 7399 void TargetX86Base<TraitsType>::genTargetHelperCallFor(Inst *Instr) { 7400 uint32_t StackArgumentsSize = 0; 7401 if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Instr)) { 7402 RuntimeHelper HelperID = RuntimeHelper::H_Num; 7403 Variable *Dest = Arith->getDest(); 7404 Type DestTy = Dest->getType(); 7405 if (!Traits::Is64Bit && DestTy == IceType_i64) { 7406 switch (Arith->getOp()) { 7407 default: 7408 return; 7409 case InstArithmetic::Udiv: 7410 HelperID = RuntimeHelper::H_udiv_i64; 7411 break; 7412 case InstArithmetic::Sdiv: 7413 HelperID = RuntimeHelper::H_sdiv_i64; 7414 break; 7415 case InstArithmetic::Urem: 7416 HelperID = RuntimeHelper::H_urem_i64; 7417 break; 7418 case InstArithmetic::Srem: 7419 HelperID = RuntimeHelper::H_srem_i64; 7420 break; 7421 } 7422 } else if (isVectorType(DestTy)) { 7423 Variable *Dest = Arith->getDest(); 7424 Operand *Src0 = Arith->getSrc(0); 7425 Operand *Src1 = Arith->getSrc(1); 7426 switch (Arith->getOp()) { 7427 default: 7428 return; 7429 case InstArithmetic::Mul: 7430 if (DestTy == IceType_v16i8) { 7431 scalarizeArithmetic(Arith->getOp(), Dest, Src0, Src1); 7432 Arith->setDeleted(); 7433 } 7434 return; 7435 case InstArithmetic::Shl: 7436 case InstArithmetic::Lshr: 7437 case InstArithmetic::Ashr: 7438 if (llvm::isa<Constant>(Src1)) { 7439 return; 7440 } 7441 case InstArithmetic::Udiv: 7442 case InstArithmetic::Urem: 7443 case InstArithmetic::Sdiv: 7444 case InstArithmetic::Srem: 7445 case InstArithmetic::Frem: 7446 scalarizeArithmetic(Arith->getOp(), Dest, Src0, Src1); 7447 Arith->setDeleted(); 7448 return; 7449 } 7450 } else { 7451 switch (Arith->getOp()) { 7452 default: 7453 return; 7454 case InstArithmetic::Frem: 7455 if (isFloat32Asserting32Or64(DestTy)) 7456 HelperID = RuntimeHelper::H_frem_f32; 7457 else 7458 HelperID = RuntimeHelper::H_frem_f64; 7459 } 7460 } 7461 constexpr SizeT MaxSrcs = 2; 7462 InstCall *Call = makeHelperCall(HelperID, Dest, MaxSrcs); 7463 Call->addArg(Arith->getSrc(0)); 7464 Call->addArg(Arith->getSrc(1)); 7465 StackArgumentsSize = getCallStackArgumentsSizeBytes(Call); 7466 Context.insert(Call); 7467 Arith->setDeleted(); 7468 } else if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) { 7469 InstCast::OpKind CastKind = Cast->getCastKind(); 7470 Operand *Src0 = Cast->getSrc(0); 7471 const Type SrcType = Src0->getType(); 7472 Variable *Dest = Cast->getDest(); 7473 const Type DestTy = Dest->getType(); 7474 RuntimeHelper HelperID = RuntimeHelper::H_Num; 7475 Variable *CallDest = Dest; 7476 switch (CastKind) { 7477 default: 7478 return; 7479 case InstCast::Fptosi: 7480 if (!Traits::Is64Bit && DestTy == IceType_i64) { 7481 HelperID = isFloat32Asserting32Or64(SrcType) 7482 ? RuntimeHelper::H_fptosi_f32_i64 7483 : RuntimeHelper::H_fptosi_f64_i64; 7484 } else { 7485 return; 7486 } 7487 break; 7488 case InstCast::Fptoui: 7489 if (isVectorType(DestTy)) { 7490 assert(DestTy == IceType_v4i32); 7491 assert(SrcType == IceType_v4f32); 7492 HelperID = RuntimeHelper::H_fptoui_4xi32_f32; 7493 } else if (DestTy == IceType_i64 || 7494 (!Traits::Is64Bit && DestTy == IceType_i32)) { 7495 if (Traits::Is64Bit) { 7496 HelperID = isFloat32Asserting32Or64(SrcType) 7497 ? RuntimeHelper::H_fptoui_f32_i64 7498 : RuntimeHelper::H_fptoui_f64_i64; 7499 } else if (isInt32Asserting32Or64(DestTy)) { 7500 HelperID = isFloat32Asserting32Or64(SrcType) 7501 ? 
RuntimeHelper::H_fptoui_f32_i32 7502 : RuntimeHelper::H_fptoui_f64_i32; 7503 } else { 7504 HelperID = isFloat32Asserting32Or64(SrcType) 7505 ? RuntimeHelper::H_fptoui_f32_i64 7506 : RuntimeHelper::H_fptoui_f64_i64; 7507 } 7508 } else { 7509 return; 7510 } 7511 break; 7512 case InstCast::Sitofp: 7513 if (!Traits::Is64Bit && SrcType == IceType_i64) { 7514 HelperID = isFloat32Asserting32Or64(DestTy) 7515 ? RuntimeHelper::H_sitofp_i64_f32 7516 : RuntimeHelper::H_sitofp_i64_f64; 7517 } else { 7518 return; 7519 } 7520 break; 7521 case InstCast::Uitofp: 7522 if (isVectorType(SrcType)) { 7523 assert(DestTy == IceType_v4f32); 7524 assert(SrcType == IceType_v4i32); 7525 HelperID = RuntimeHelper::H_uitofp_4xi32_4xf32; 7526 } else if (SrcType == IceType_i64 || 7527 (!Traits::Is64Bit && SrcType == IceType_i32)) { 7528 if (isInt32Asserting32Or64(SrcType)) { 7529 HelperID = isFloat32Asserting32Or64(DestTy) 7530 ? RuntimeHelper::H_uitofp_i32_f32 7531 : RuntimeHelper::H_uitofp_i32_f64; 7532 } else { 7533 HelperID = isFloat32Asserting32Or64(DestTy) 7534 ? RuntimeHelper::H_uitofp_i64_f32 7535 : RuntimeHelper::H_uitofp_i64_f64; 7536 } 7537 } else { 7538 return; 7539 } 7540 break; 7541 case InstCast::Bitcast: { 7542 if (DestTy == Src0->getType()) 7543 return; 7544 switch (DestTy) { 7545 default: 7546 return; 7547 case IceType_i8: 7548 assert(Src0->getType() == IceType_v8i1); 7549 HelperID = RuntimeHelper::H_bitcast_8xi1_i8; 7550 CallDest = Func->makeVariable(IceType_i32); 7551 break; 7552 case IceType_i16: 7553 assert(Src0->getType() == IceType_v16i1); 7554 HelperID = RuntimeHelper::H_bitcast_16xi1_i16; 7555 CallDest = Func->makeVariable(IceType_i32); 7556 break; 7557 case IceType_v8i1: { 7558 assert(Src0->getType() == IceType_i8); 7559 HelperID = RuntimeHelper::H_bitcast_i8_8xi1; 7560 Variable *Src0AsI32 = Func->makeVariable(stackSlotType()); 7561 // Arguments to functions are required to be at least 32 bits wide. 7562 Context.insert<InstCast>(InstCast::Zext, Src0AsI32, Src0); 7563 Src0 = Src0AsI32; 7564 } break; 7565 case IceType_v16i1: { 7566 assert(Src0->getType() == IceType_i16); 7567 HelperID = RuntimeHelper::H_bitcast_i16_16xi1; 7568 Variable *Src0AsI32 = Func->makeVariable(stackSlotType()); 7569 // Arguments to functions are required to be at least 32 bits wide. 7570 Context.insert<InstCast>(InstCast::Zext, Src0AsI32, Src0); 7571 Src0 = Src0AsI32; 7572 } break; 7573 } 7574 } break; 7575 } 7576 constexpr SizeT MaxSrcs = 1; 7577 InstCall *Call = makeHelperCall(HelperID, CallDest, MaxSrcs); 7578 Call->addArg(Src0); 7579 StackArgumentsSize = getCallStackArgumentsSizeBytes(Call); 7580 Context.insert(Call); 7581 // The PNaCl ABI disallows i8/i16 return types, so truncate the helper call 7582 // result to the appropriate type as necessary. 
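// For example, for "%d = bitcast <8 x i1> %s to i8" the helper call above was
// given an i32 CallDest (the PNaCl ABI has no i8 returns), so the code below
// inserts a "trunc i32 ... to i8" from CallDest into the original Dest.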
7583 if (CallDest->getType() != Dest->getType()) 7584 Context.insert<InstCast>(InstCast::Trunc, Dest, CallDest); 7585 Cast->setDeleted(); 7586 } else if (auto *Intrinsic = llvm::dyn_cast<InstIntrinsic>(Instr)) { 7587 CfgVector<Type> ArgTypes; 7588 Type ReturnType = IceType_void; 7589 switch (Intrinsics::IntrinsicID ID = Intrinsic->getIntrinsicID()) { 7590 default: 7591 return; 7592 case Intrinsics::Ctpop: { 7593 Operand *Val = Intrinsic->getArg(0); 7594 Type ValTy = Val->getType(); 7595 if (ValTy == IceType_i64) 7596 ArgTypes = {IceType_i64}; 7597 else 7598 ArgTypes = {IceType_i32}; 7599 ReturnType = IceType_i32; 7600 } break; 7601 case Intrinsics::Longjmp: 7602 ArgTypes = {IceType_i32, IceType_i32}; 7603 ReturnType = IceType_void; 7604 break; 7605 case Intrinsics::Memcpy: 7606 ArgTypes = {IceType_i32, IceType_i32, IceType_i32}; 7607 ReturnType = IceType_void; 7608 break; 7609 case Intrinsics::Memmove: 7610 ArgTypes = {IceType_i32, IceType_i32, IceType_i32}; 7611 ReturnType = IceType_void; 7612 break; 7613 case Intrinsics::Memset: 7614 ArgTypes = {IceType_i32, IceType_i32, IceType_i32}; 7615 ReturnType = IceType_void; 7616 break; 7617 case Intrinsics::NaClReadTP: 7618 ReturnType = IceType_i32; 7619 break; 7620 case Intrinsics::Setjmp: 7621 ArgTypes = {IceType_i32}; 7622 ReturnType = IceType_i32; 7623 break; 7624 } 7625 StackArgumentsSize = getCallStackArgumentsSizeBytes(ArgTypes, ReturnType); 7626 } else if (auto *Call = llvm::dyn_cast<InstCall>(Instr)) { 7627 StackArgumentsSize = getCallStackArgumentsSizeBytes(Call); 7628 } else if (auto *Ret = llvm::dyn_cast<InstRet>(Instr)) { 7629 if (!Ret->hasRetValue()) 7630 return; 7631 Operand *RetValue = Ret->getRetValue(); 7632 Type ReturnType = RetValue->getType(); 7633 if (!isScalarFloatingType(ReturnType)) 7634 return; 7635 StackArgumentsSize = typeWidthInBytes(ReturnType); 7636 } else { 7637 return; 7638 } 7639 StackArgumentsSize = Traits::applyStackAlignment(StackArgumentsSize); 7640 updateMaxOutArgsSizeBytes(StackArgumentsSize); 7641 } 7642 7643 template <typename TraitsType> 7644 uint32_t TargetX86Base<TraitsType>::getCallStackArgumentsSizeBytes( 7645 const CfgVector<Type> &ArgTypes, Type ReturnType) { 7646 uint32_t OutArgumentsSizeBytes = 0; 7647 uint32_t XmmArgCount = 0; 7648 uint32_t GprArgCount = 0; 7649 for (SizeT i = 0, NumArgTypes = ArgTypes.size(); i < NumArgTypes; ++i) { 7650 Type Ty = ArgTypes[i]; 7651 // The PNaCl ABI requires the width of arguments to be at least 32 bits. 7652 assert(typeWidthInBytes(Ty) >= 4); 7653 if (isVectorType(Ty) && 7654 Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, XmmArgCount)) 7655 .hasValue()) { 7656 ++XmmArgCount; 7657 } else if (isScalarFloatingType(Ty) && Traits::X86_PASS_SCALAR_FP_IN_XMM && 7658 Traits::getRegisterForXmmArgNum( 7659 Traits::getArgIndex(i, XmmArgCount)) 7660 .hasValue()) { 7661 ++XmmArgCount; 7662 } else if (isScalarIntegerType(Ty) && 7663 Traits::getRegisterForGprArgNum( 7664 Ty, Traits::getArgIndex(i, GprArgCount)) 7665 .hasValue()) { 7666 // The 64 bit ABI allows some integers to be passed in GPRs. 7667 ++GprArgCount; 7668 } else { 7669 if (isVectorType(Ty)) { 7670 OutArgumentsSizeBytes = 7671 Traits::applyStackAlignment(OutArgumentsSizeBytes); 7672 } 7673 OutArgumentsSizeBytes += typeWidthInBytesOnStack(Ty); 7674 } 7675 } 7676 if (Traits::Is64Bit) 7677 return OutArgumentsSizeBytes; 7678 // The 32 bit ABI requires floating point values to be returned on the x87 FP 7679 // stack. Ensure there is enough space for the fstp/movs for floating returns. 
7680 if (isScalarFloatingType(ReturnType)) { 7681 OutArgumentsSizeBytes = 7682 std::max(OutArgumentsSizeBytes, 7683 static_cast<uint32_t>(typeWidthInBytesOnStack(ReturnType))); 7684 } 7685 return OutArgumentsSizeBytes; 7686 } 7687 7688 template <typename TraitsType> 7689 uint32_t TargetX86Base<TraitsType>::getCallStackArgumentsSizeBytes( 7690 const InstCall *Instr) { 7691 // Build a vector of the arguments' types. 7692 const SizeT NumArgs = Instr->getNumArgs(); 7693 CfgVector<Type> ArgTypes; 7694 ArgTypes.reserve(NumArgs); 7695 for (SizeT i = 0; i < NumArgs; ++i) { 7696 Operand *Arg = Instr->getArg(i); 7697 ArgTypes.emplace_back(Arg->getType()); 7698 } 7699 // Compute the return type (if any); 7700 Type ReturnType = IceType_void; 7701 Variable *Dest = Instr->getDest(); 7702 if (Dest != nullptr) 7703 ReturnType = Dest->getType(); 7704 return getShadowStoreSize<Traits>() + 7705 getCallStackArgumentsSizeBytes(ArgTypes, ReturnType); 7706 } 7707 7708 template <typename TraitsType> 7709 Variable *TargetX86Base<TraitsType>::makeZeroedRegister(Type Ty, 7710 RegNumT RegNum) { 7711 Variable *Reg = makeReg(Ty, RegNum); 7712 switch (Ty) { 7713 case IceType_i1: 7714 case IceType_i8: 7715 case IceType_i16: 7716 case IceType_i32: 7717 case IceType_i64: 7718 // Conservatively do "mov reg, 0" to avoid modifying FLAGS. 7719 _mov(Reg, Ctx->getConstantZero(Ty)); 7720 break; 7721 case IceType_f32: 7722 case IceType_f64: 7723 Context.insert<InstFakeDef>(Reg); 7724 _xorps(Reg, Reg); 7725 break; 7726 default: 7727 // All vector types use the same pxor instruction. 7728 assert(isVectorType(Ty)); 7729 Context.insert<InstFakeDef>(Reg); 7730 _pxor(Reg, Reg); 7731 break; 7732 } 7733 return Reg; 7734 } 7735 7736 // There is no support for loading or emitting vector constants, so the vector 7737 // values returned from makeVectorOfZeros, makeVectorOfOnes, etc. are 7738 // initialized with register operations. 7739 // 7740 // TODO(wala): Add limited support for vector constants so that complex 7741 // initialization in registers is unnecessary. 7742 7743 template <typename TraitsType> 7744 Variable *TargetX86Base<TraitsType>::makeVectorOfZeros(Type Ty, 7745 RegNumT RegNum) { 7746 return makeZeroedRegister(Ty, RegNum); 7747 } 7748 7749 template <typename TraitsType> 7750 Variable *TargetX86Base<TraitsType>::makeVectorOfMinusOnes(Type Ty, 7751 RegNumT RegNum) { 7752 Variable *MinusOnes = makeReg(Ty, RegNum); 7753 // Insert a FakeDef so the live range of MinusOnes is not overestimated. 7754 Context.insert<InstFakeDef>(MinusOnes); 7755 if (Ty == IceType_f64) 7756 // Making a vector of minus ones of type f64 is currently only used for the 7757 // fabs intrinsic. To use the f64 type to create this mask with pcmpeqq 7758 // requires SSE 4.1. Since we're just creating a mask, pcmpeqd does the 7759 // same job and only requires SSE2. 
7760 _pcmpeq(MinusOnes, MinusOnes, IceType_f32); 7761 else 7762 _pcmpeq(MinusOnes, MinusOnes); 7763 return MinusOnes; 7764 } 7765 7766 template <typename TraitsType> 7767 Variable *TargetX86Base<TraitsType>::makeVectorOfOnes(Type Ty, RegNumT RegNum) { 7768 Variable *Dest = makeVectorOfZeros(Ty, RegNum); 7769 Variable *MinusOne = makeVectorOfMinusOnes(Ty); 7770 _psub(Dest, MinusOne); 7771 return Dest; 7772 } 7773 7774 template <typename TraitsType> 7775 Variable *TargetX86Base<TraitsType>::makeVectorOfHighOrderBits(Type Ty, 7776 RegNumT RegNum) { 7777 assert(Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v8i16 || 7778 Ty == IceType_v16i8); 7779 if (Ty == IceType_v4f32 || Ty == IceType_v4i32 || Ty == IceType_v8i16) { 7780 Variable *Reg = makeVectorOfOnes(Ty, RegNum); 7781 SizeT Shift = 7782 typeWidthInBytes(typeElementType(Ty)) * Traits::X86_CHAR_BIT - 1; 7783 _psll(Reg, Ctx->getConstantInt8(Shift)); 7784 return Reg; 7785 } else { 7786 // SSE has no left shift operation for vectors of 8 bit integers. 7787 constexpr uint32_t HIGH_ORDER_BITS_MASK = 0x80808080; 7788 Constant *ConstantMask = Ctx->getConstantInt32(HIGH_ORDER_BITS_MASK); 7789 Variable *Reg = makeReg(Ty, RegNum); 7790 _movd(Reg, legalize(ConstantMask, Legal_Reg | Legal_Mem)); 7791 _pshufd(Reg, Reg, Ctx->getConstantZero(IceType_i8)); 7792 return Reg; 7793 } 7794 } 7795 7796 /// Construct a mask in a register that can be and'ed with a floating-point 7797 /// value to mask off its sign bit. The value will be <4 x 0x7fffffff> for f32 7798 /// and v4f32, and <2 x 0x7fffffffffffffff> for f64. Construct it as vector of 7799 /// ones logically right shifted one bit. 7800 // TODO(stichnot): Fix the wala 7801 // TODO: above, to represent vector constants in memory. 7802 template <typename TraitsType> 7803 Variable *TargetX86Base<TraitsType>::makeVectorOfFabsMask(Type Ty, 7804 RegNumT RegNum) { 7805 Variable *Reg = makeVectorOfMinusOnes(Ty, RegNum); 7806 _psrl(Reg, Ctx->getConstantInt8(1)); 7807 return Reg; 7808 } 7809 7810 template <typename TraitsType> 7811 typename TargetX86Base<TraitsType>::X86OperandMem * 7812 TargetX86Base<TraitsType>::getMemoryOperandForStackSlot(Type Ty, Variable *Slot, 7813 uint32_t Offset) { 7814 // Ensure that Loc is a stack slot. 7815 assert(Slot->mustNotHaveReg()); 7816 assert(Slot->getRegNum().hasNoValue()); 7817 // Compute the location of Loc in memory. 7818 // TODO(wala,stichnot): lea should not 7819 // be required. The address of the stack slot is known at compile time 7820 // (although not until after addProlog()). 7821 const Type PointerType = getPointerType(); 7822 Variable *Loc = makeReg(PointerType); 7823 _lea(Loc, Slot); 7824 Constant *ConstantOffset = Ctx->getConstantInt32(Offset); 7825 return X86OperandMem::create(Func, Ty, Loc, ConstantOffset); 7826 } 7827 7828 /// Lowering helper to copy a scalar integer source operand into some 8-bit GPR. 7829 /// Src is assumed to already be legalized. If the source operand is known to 7830 /// be a memory or immediate operand, a simple mov will suffice. But if the 7831 /// source operand can be a physical register, then it must first be copied into 7832 /// a physical register that is truncable to 8-bit, then truncated into a 7833 /// physical register that can receive a truncation, and finally copied into the 7834 /// result 8-bit register (which in general can be any 8-bit register). 
For 7835 /// example, moving %ebp into %ah may be accomplished as: 7836 /// movl %ebp, %edx 7837 /// mov_trunc %edx, %dl // this redundant assignment is ultimately elided 7838 /// movb %dl, %ah 7839 /// On the other hand, moving a memory or immediate operand into ah: 7840 /// movb 4(%ebp), %ah 7841 /// movb $my_imm, %ah 7842 /// 7843 /// Note #1. On a 64-bit target, the "movb 4(%ebp), %ah" is likely not 7844 /// encodable, so RegNum=Reg_ah should NOT be given as an argument. Instead, 7845 /// use RegNum=RegNumT() and then let the caller do a separate copy into 7846 /// Reg_ah. 7847 /// 7848 /// Note #2. ConstantRelocatable operands are also put through this process 7849 /// (not truncated directly) because our ELF emitter does R_386_32 relocations 7850 /// but not R_386_8 relocations. 7851 /// 7852 /// Note #3. If Src is a Variable, the result will be an infinite-weight i8 7853 /// Variable with the RCX86_IsTrunc8Rcvr register class. As such, this helper 7854 /// is a convenient way to prevent ah/bh/ch/dh from being an (invalid) argument 7855 /// to the pinsrb instruction. 7856 template <typename TraitsType> 7857 Variable *TargetX86Base<TraitsType>::copyToReg8(Operand *Src, RegNumT RegNum) { 7858 Type Ty = Src->getType(); 7859 assert(isScalarIntegerType(Ty)); 7860 assert(Ty != IceType_i1); 7861 Variable *Reg = makeReg(IceType_i8, RegNum); 7862 Reg->setRegClass(RCX86_IsTrunc8Rcvr); 7863 if (llvm::isa<Variable>(Src) || llvm::isa<ConstantRelocatable>(Src)) { 7864 Variable *SrcTruncable = makeReg(Ty); 7865 switch (Ty) { 7866 case IceType_i64: 7867 SrcTruncable->setRegClass(RCX86_Is64To8); 7868 break; 7869 case IceType_i32: 7870 SrcTruncable->setRegClass(RCX86_Is32To8); 7871 break; 7872 case IceType_i16: 7873 SrcTruncable->setRegClass(RCX86_Is16To8); 7874 break; 7875 default: 7876 // i8 - just use default register class 7877 break; 7878 } 7879 Variable *SrcRcvr = makeReg(IceType_i8); 7880 SrcRcvr->setRegClass(RCX86_IsTrunc8Rcvr); 7881 _mov(SrcTruncable, Src); 7882 _mov(SrcRcvr, SrcTruncable); 7883 Src = SrcRcvr; 7884 } 7885 _mov(Reg, Src); 7886 return Reg; 7887 } 7888 7889 /// Helper for legalize() to emit the right code to lower an operand to a 7890 /// register of the appropriate type. 7891 template <typename TraitsType> 7892 Variable *TargetX86Base<TraitsType>::copyToReg(Operand *Src, RegNumT RegNum) { 7893 Type Ty = Src->getType(); 7894 Variable *Reg = makeReg(Ty, RegNum); 7895 if (isVectorType(Ty)) { 7896 _movp(Reg, Src); 7897 } else { 7898 _mov(Reg, Src); 7899 } 7900 return Reg; 7901 } 7902 7903 template <typename TraitsType> 7904 Operand *TargetX86Base<TraitsType>::legalize(Operand *From, LegalMask Allowed, 7905 RegNumT RegNum) { 7906 const bool UseNonsfi = getFlags().getUseNonsfi(); 7907 const Type Ty = From->getType(); 7908 // Assert that a physical register is allowed. To date, all calls to 7909 // legalize() allow a physical register. If a physical register needs to be 7910 // explicitly disallowed, then new code will need to be written to force a 7911 // spill. 7912 assert(Allowed & Legal_Reg); 7913 // If we're asking for a specific physical register, make sure we're not 7914 // allowing any other operand kinds. (This could be future work, e.g. allow 7915 // the shl shift amount to be either an immediate or in ecx.) 7916 assert(RegNum.hasNoValue() || Allowed == Legal_Reg); 7917 7918 // Substitute with an available infinite-weight variable if possible. 
Only do 7919 // this when we are not asking for a specific register, and when the 7920 // substitution is not locked to a specific register, and when the types 7921 // match, in order to capture the vast majority of opportunities and avoid 7922 // corner cases in the lowering. 7923 if (RegNum.hasNoValue()) { 7924 if (Variable *Subst = getContext().availabilityGet(From)) { 7925 // At this point we know there is a potential substitution available. 7926 if (Subst->mustHaveReg() && !Subst->hasReg()) { 7927 // At this point we know the substitution will have a register. 7928 if (From->getType() == Subst->getType()) { 7929 // At this point we know the substitution's register is compatible. 7930 return Subst; 7931 } 7932 } 7933 } 7934 } 7935 7936 if (auto *Mem = llvm::dyn_cast<X86OperandMem>(From)) { 7937 // Before doing anything with a Mem operand, we need to ensure that the 7938 // Base and Index components are in physical registers. 7939 Variable *Base = Mem->getBase(); 7940 Variable *Index = Mem->getIndex(); 7941 Constant *Offset = Mem->getOffset(); 7942 Variable *RegBase = nullptr; 7943 Variable *RegIndex = nullptr; 7944 uint16_t Shift = Mem->getShift(); 7945 if (Base) { 7946 RegBase = llvm::cast<Variable>( 7947 legalize(Base, Legal_Reg | Legal_Rematerializable)); 7948 } 7949 if (Index) { 7950 // TODO(jpp): perhaps we should only allow Legal_Reg if 7951 // Base->isRematerializable. 7952 RegIndex = llvm::cast<Variable>( 7953 legalize(Index, Legal_Reg | Legal_Rematerializable)); 7954 } 7955 7956 if (Base != RegBase || Index != RegIndex) { 7957 Mem = X86OperandMem::create(Func, Ty, RegBase, Offset, RegIndex, Shift, 7958 Mem->getSegmentRegister()); 7959 } 7960 7961 From = Mem; 7962 7963 if (!(Allowed & Legal_Mem)) { 7964 From = copyToReg(From, RegNum); 7965 } 7966 return From; 7967 } 7968 7969 if (auto *Const = llvm::dyn_cast<Constant>(From)) { 7970 if (llvm::isa<ConstantUndef>(Const)) { 7971 From = legalizeUndef(Const, RegNum); 7972 if (isVectorType(Ty)) 7973 return From; 7974 Const = llvm::cast<Constant>(From); 7975 } 7976 // There should be no constants of vector type (other than undef). 7977 assert(!isVectorType(Ty)); 7978 7979 // If the operand is a 64 bit constant integer we need to legalize it to a 7980 // register in x86-64. 7981 if (Traits::Is64Bit) { 7982 if (auto *C64 = llvm::dyn_cast<ConstantInteger64>(Const)) { 7983 if (!Utils::IsInt(32, C64->getValue())) { 7984 if (RegNum.hasValue()) { 7985 assert(Traits::getGprForType(IceType_i64, RegNum) == RegNum); 7986 } 7987 return copyToReg(Const, RegNum); 7988 } 7989 } 7990 } 7991 7992 if (auto *CR = llvm::dyn_cast<ConstantRelocatable>(Const)) { 7993 // If the operand is a ConstantRelocatable, and Legal_AddrAbs is not 7994 // specified, and UseNonsfi is indicated, we need to add RebasePtr. 7995 if (UseNonsfi && !(Allowed & Legal_AddrAbs)) { 7996 assert(Ty == IceType_i32); 7997 Variable *NewVar = makeReg(Ty, RegNum); 7998 auto *Mem = Traits::X86OperandMem::create(Func, Ty, nullptr, CR); 7999 // LEAs are not automatically sandboxed, thus we explicitly invoke 8000 // _sandbox_mem_reference. 8001 _lea(NewVar, _sandbox_mem_reference(Mem)); 8002 From = NewVar; 8003 } 8004 } else if (isScalarFloatingType(Ty)) { 8005 // Convert a scalar floating point constant into an explicit memory 8006 // operand. 
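// For example, an f32 constant such as 1.5 cannot be encoded as an immediate
// operand of an SSE instruction; it is pooled and re-read through an
// X86OperandMem whose offset symbol comes from getLabelName() below. Positive
// zero is special-cased: it is cheaper to materialize with xorps via
// makeZeroedRegister().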
8007 if (auto *ConstFloat = llvm::dyn_cast<ConstantFloat>(Const)) { 8008 if (Utils::isPositiveZero(ConstFloat->getValue())) 8009 return makeZeroedRegister(Ty, RegNum); 8010 } else if (auto *ConstDouble = llvm::dyn_cast<ConstantDouble>(Const)) { 8011 if (Utils::isPositiveZero(ConstDouble->getValue())) 8012 return makeZeroedRegister(Ty, RegNum); 8013 } 8014 8015 auto *CFrom = llvm::cast<Constant>(From); 8016 assert(CFrom->getShouldBePooled()); 8017 Constant *Offset = Ctx->getConstantSym(0, CFrom->getLabelName()); 8018 auto *Mem = X86OperandMem::create(Func, Ty, nullptr, Offset); 8019 From = Mem; 8020 } 8021 8022 bool NeedsReg = false; 8023 if (!(Allowed & Legal_Imm) && !isScalarFloatingType(Ty)) 8024 // Immediate specifically not allowed. 8025 NeedsReg = true; 8026 if (!(Allowed & Legal_Mem) && isScalarFloatingType(Ty)) 8027 // On x86, FP constants are lowered to mem operands. 8028 NeedsReg = true; 8029 if (NeedsReg) { 8030 From = copyToReg(From, RegNum); 8031 } 8032 return From; 8033 } 8034 8035 if (auto *Var = llvm::dyn_cast<Variable>(From)) { 8036 // Check if the variable is guaranteed a physical register. This can happen 8037 // either when the variable is pre-colored or when it is assigned infinite 8038 // weight. 8039 bool MustHaveRegister = (Var->hasReg() || Var->mustHaveReg()); 8040 bool MustRematerialize = 8041 (Var->isRematerializable() && !(Allowed & Legal_Rematerializable)); 8042 // We need a new physical register for the operand if: 8043 // - Mem is not allowed and Var isn't guaranteed a physical register, or 8044 // - RegNum is required and Var->getRegNum() doesn't match, or 8045 // - Var is a rematerializable variable and rematerializable pass-through is 8046 // not allowed (in which case we need a lea instruction). 8047 if (MustRematerialize) { 8048 Variable *NewVar = makeReg(Ty, RegNum); 8049 // Since Var is rematerializable, the offset will be added when the lea is 8050 // emitted. 8051 constexpr Constant *NoOffset = nullptr; 8052 auto *Mem = X86OperandMem::create(Func, Ty, Var, NoOffset); 8053 _lea(NewVar, Mem); 8054 From = NewVar; 8055 } else if ((!(Allowed & Legal_Mem) && !MustHaveRegister) || 8056 (RegNum.hasValue() && RegNum != Var->getRegNum())) { 8057 From = copyToReg(From, RegNum); 8058 } 8059 return From; 8060 } 8061 8062 llvm::report_fatal_error("Unhandled operand kind in legalize()"); 8063 return From; 8064 } 8065 8066 /// Provide a trivial wrapper to legalize() for this common usage. 8067 template <typename TraitsType> 8068 Variable *TargetX86Base<TraitsType>::legalizeToReg(Operand *From, 8069 RegNumT RegNum) { 8070 return llvm::cast<Variable>(legalize(From, Legal_Reg, RegNum)); 8071 } 8072 8073 /// Legalize undef values to concrete values. 8074 template <typename TraitsType> 8075 Operand *TargetX86Base<TraitsType>::legalizeUndef(Operand *From, 8076 RegNumT RegNum) { 8077 Type Ty = From->getType(); 8078 if (llvm::isa<ConstantUndef>(From)) { 8079 // Lower undefs to zero. Another option is to lower undefs to an 8080 // uninitialized register; however, using an uninitialized register results 8081 // in less predictable code. 8082 // 8083 // If in the future the implementation is changed to lower undef values to 8084 // uninitialized registers, a FakeDef will be needed: 8085 // Context.insert<InstFakeDef>(Reg); 8086 // This is in order to ensure that the live range of Reg is not 8087 // overestimated. 
    if (isVectorType(Ty))
      return makeVectorOfZeros(Ty, RegNum);
    return Ctx->getConstantZero(Ty);
  }
  return From;
}

/// For the cmp instruction, if Src1 is an immediate, or known to be a
/// physical register, we can allow Src0 to be a memory operand. Otherwise,
/// Src0 must be copied into a physical register. (Actually, either Src0 or
/// Src1 can be chosen for the physical register, but unfortunately we have to
/// commit to one or the other before register allocation.)
template <typename TraitsType>
Operand *TargetX86Base<TraitsType>::legalizeSrc0ForCmp(Operand *Src0,
                                                       Operand *Src1) {
  bool IsSrc1ImmOrReg = false;
  if (llvm::isa<Constant>(Src1)) {
    IsSrc1ImmOrReg = true;
  } else if (auto *Var = llvm::dyn_cast<Variable>(Src1)) {
    if (Var->hasReg())
      IsSrc1ImmOrReg = true;
  }
  return legalize(Src0, IsSrc1ImmOrReg ? (Legal_Reg | Legal_Mem) : Legal_Reg);
}

template <typename TraitsType>
typename TargetX86Base<TraitsType>::X86OperandMem *
TargetX86Base<TraitsType>::formMemoryOperand(Operand *Opnd, Type Ty,
                                             bool DoLegalize) {
  auto *Mem = llvm::dyn_cast<X86OperandMem>(Opnd);
  // It may be the case that address mode optimization already creates an
  // X86OperandMem, so in that case it wouldn't need another level of
  // transformation.
  if (!Mem) {
    auto *Base = llvm::dyn_cast<Variable>(Opnd);
    auto *Offset = llvm::dyn_cast<Constant>(Opnd);
    assert(Base || Offset);
    if (Offset) {
      if (!llvm::isa<ConstantRelocatable>(Offset)) {
        if (llvm::isa<ConstantInteger64>(Offset)) {
          // Memory operands cannot have 64-bit immediates, so they must be
          // legalized into a register only.
          Base = llvm::cast<Variable>(legalize(Offset, Legal_Reg));
          Offset = nullptr;
        } else {
          Offset = llvm::cast<Constant>(legalize(Offset));

          assert(llvm::isa<ConstantInteger32>(Offset) ||
                 llvm::isa<ConstantRelocatable>(Offset));
        }
      }
    }
    Mem = X86OperandMem::create(Func, Ty, Base, Offset);
  }
  return llvm::cast<X86OperandMem>(DoLegalize ? legalize(Mem) : Mem);
}

template <typename TraitsType>
Variable *TargetX86Base<TraitsType>::makeReg(Type Type, RegNumT RegNum) {
  // There aren't any 64-bit integer registers for x86-32.
  assert(Traits::Is64Bit || Type != IceType_i64);
  Variable *Reg = Func->makeVariable(Type);
  if (RegNum.hasValue())
    Reg->setRegNum(RegNum);
  else
    Reg->setMustHaveReg();
  return Reg;
}

const Type TypeForSize[] = {IceType_i8, IceType_i16, IceType_i32, IceType_f64,
                            IceType_v16i8};
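// Worked examples, following directly from the definitions below
// (llvm::findLastSet returns the index of the most significant set bit):
//   largestTypeInSize(11)     -> index 3 -> IceType_f64 (8 bytes)
//   firstTypeThatFitsSize(11) -> index 3, bumped to 4 because 11 is not a
//                                power of two -> IceType_v16i8 (16 bytes)
// MaxSize, when not NoSizeLimit, simply caps the index the same way.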
template <typename TraitsType>
Type TargetX86Base<TraitsType>::largestTypeInSize(uint32_t Size,
                                                  uint32_t MaxSize) {
  assert(Size != 0);
  uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
  uint32_t MaxIndex = MaxSize == NoSizeLimit
                          ? llvm::array_lengthof(TypeForSize) - 1
                          : llvm::findLastSet(MaxSize, llvm::ZB_Undefined);
  return TypeForSize[std::min(TyIndex, MaxIndex)];
}

template <typename TraitsType>
Type TargetX86Base<TraitsType>::firstTypeThatFitsSize(uint32_t Size,
                                                      uint32_t MaxSize) {
  assert(Size != 0);
  uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
  if (!llvm::isPowerOf2_32(Size))
    ++TyIndex;
  uint32_t MaxIndex = MaxSize == NoSizeLimit
                          ? llvm::array_lengthof(TypeForSize) - 1
                          : llvm::findLastSet(MaxSize, llvm::ZB_Undefined);
  return TypeForSize[std::min(TyIndex, MaxIndex)];
}

template <typename TraitsType> void TargetX86Base<TraitsType>::postLower() {
  if (Func->getOptLevel() == Opt_m1)
    return;
  markRedefinitions();
  Context.availabilityUpdate();
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::emit(const ConstantInteger32 *C) const {
  if (!BuildDefs::dump())
    return;
  Ostream &Str = Ctx->getStrEmit();
  Str << "$" << C->getValue();
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::emit(const ConstantInteger64 *C) const {
  if (!Traits::Is64Bit) {
    llvm::report_fatal_error("Not expecting to emit 64-bit integers");
  } else {
    if (!BuildDefs::dump())
      return;
    Ostream &Str = Ctx->getStrEmit();
    Str << "$" << C->getValue();
  }
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::emit(const ConstantFloat *C) const {
  if (!BuildDefs::dump())
    return;
  Ostream &Str = Ctx->getStrEmit();
  Str << C->getLabelName();
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::emit(const ConstantDouble *C) const {
  if (!BuildDefs::dump())
    return;
  Ostream &Str = Ctx->getStrEmit();
  Str << C->getLabelName();
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::emit(const ConstantUndef *) const {
  llvm::report_fatal_error("undef value encountered by emitter.");
}

template <class Machine>
void TargetX86Base<Machine>::emit(const ConstantRelocatable *C) const {
  if (!BuildDefs::dump())
    return;
  assert(!getFlags().getUseNonsfi() ||
         C->getName().toString() == GlobalOffsetTable);
  Ostream &Str = Ctx->getStrEmit();
  Str << "$";
  emitWithoutPrefix(C);
}
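// For reference, emitJumpTable() below produces text of roughly this shape
// (the section and label names are illustrative; the real strings come from
// getSectionName(), getName() and getAsmName()):
//   .section  .rodata.<jump table section>,"a",@progbits
//   .align    <pointer width in bytes>
//   <jump table label>:
//   .long     <target block 0 asm name>
//   .long     <target block 1 asm name>
// Under non-SFI PIC the ".data.rel.ro." prefix replaces ".rodata.", since the
// entries are absolute addresses that need load-time relocation.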
template <typename TraitsType>
void TargetX86Base<TraitsType>::emitJumpTable(
    const Cfg *, const InstJumpTable *JumpTable) const {
  if (!BuildDefs::dump())
    return;
  Ostream &Str = Ctx->getStrEmit();
  const bool UseNonsfi = getFlags().getUseNonsfi();
  const char *Prefix = UseNonsfi ? ".data.rel.ro." : ".rodata.";
  Str << "\t.section\t" << Prefix << JumpTable->getSectionName()
      << ",\"a\",@progbits\n"
         "\t.align\t"
      << typeWidthInBytes(getPointerType()) << "\n"
      << JumpTable->getName() << ":";

  // On x86 ILP32, pointers are 32-bit, hence the use of .long.
  for (SizeT I = 0; I < JumpTable->getNumTargets(); ++I)
    Str << "\n\t.long\t" << JumpTable->getTarget(I)->getAsmName();
  Str << "\n";
}

template <typename TraitsType>
template <typename T>
void TargetDataX86<TraitsType>::emitConstantPool(GlobalContext *Ctx) {
  if (!BuildDefs::dump())
    return;
  Ostream &Str = Ctx->getStrEmit();
  Type Ty = T::Ty;
  SizeT Align = typeAlignInBytes(Ty);
  ConstantList Pool = Ctx->getConstantPool(Ty);

  Str << "\t.section\t.rodata.cst" << Align << ",\"aM\",@progbits," << Align
      << "\n";
  Str << "\t.align\t" << Align << "\n";

  for (Constant *C : Pool) {
    if (!C->getShouldBePooled())
      continue;
    auto *Const = llvm::cast<typename T::IceType>(C);
    typename T::IceType::PrimType Value = Const->getValue();
    // Use memcpy() to copy bits from Value into RawValue in a way that avoids
    // breaking strict-aliasing rules.
    typename T::PrimitiveIntType RawValue;
    memcpy(&RawValue, &Value, sizeof(Value));
    char buf[30];
    int CharsPrinted =
        snprintf(buf, llvm::array_lengthof(buf), T::PrintfString, RawValue);
    assert(CharsPrinted >= 0);
    assert((size_t)CharsPrinted < llvm::array_lengthof(buf));
    (void)CharsPrinted; // avoid warnings if asserts are disabled
    Str << Const->getLabelName();
    Str << ":\n\t" << T::AsmTag << "\t" << buf << "\t/* " << T::TypeName << " "
        << Value << " */\n";
  }
}
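// As an example, with the f32 pool converter the loop above emits entries of
// roughly the following shape. The directive, hex formatting, and label are
// assumptions here; they actually come from T::AsmTag, T::PrintfString, and
// getLabelName(), which are defined elsewhere.
//   .section .rodata.cst4,"aM",@progbits,4
//   .align 4
//   .L$float$0:
//       .long 0x3f800000  /* f32 1 */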
template <typename TraitsType>
void TargetDataX86<TraitsType>::lowerConstants() {
  if (getFlags().getDisableTranslation())
    return;
  switch (getFlags().getOutFileType()) {
  case FT_Elf: {
    ELFObjectWriter *Writer = Ctx->getObjectWriter();

    Writer->writeConstantPool<ConstantInteger32>(IceType_i8);
    Writer->writeConstantPool<ConstantInteger32>(IceType_i16);
    Writer->writeConstantPool<ConstantInteger32>(IceType_i32);

    Writer->writeConstantPool<ConstantFloat>(IceType_f32);
    Writer->writeConstantPool<ConstantDouble>(IceType_f64);
  } break;
  case FT_Asm:
  case FT_Iasm: {
    OstreamLocker L(Ctx);

    emitConstantPool<PoolTypeConverter<uint8_t>>(Ctx);
    emitConstantPool<PoolTypeConverter<uint16_t>>(Ctx);
    emitConstantPool<PoolTypeConverter<uint32_t>>(Ctx);

    emitConstantPool<PoolTypeConverter<float>>(Ctx);
    emitConstantPool<PoolTypeConverter<double>>(Ctx);
  } break;
  }
}

template <typename TraitsType>
void TargetDataX86<TraitsType>::lowerJumpTables() {
  const bool IsPIC = getFlags().getUseNonsfi();
  switch (getFlags().getOutFileType()) {
  case FT_Elf: {
    ELFObjectWriter *Writer = Ctx->getObjectWriter();
    constexpr FixupKind FK_Abs64 = llvm::ELF::R_X86_64_64;
    const FixupKind RelocationKind =
        (getPointerType() == IceType_i32) ? Traits::FK_Abs : FK_Abs64;
    for (const JumpTableData &JT : Ctx->getJumpTables())
      Writer->writeJumpTable(JT, RelocationKind, IsPIC);
  } break;
  case FT_Asm:
    // Already emitted from the Cfg.
    break;
  case FT_Iasm: {
    if (!BuildDefs::dump())
      return;
    Ostream &Str = Ctx->getStrEmit();
    const char *Prefix = IsPIC ? ".data.rel.ro." : ".rodata.";
    for (const JumpTableData &JT : Ctx->getJumpTables()) {
      Str << "\t.section\t" << Prefix << JT.getSectionName()
          << ",\"a\",@progbits\n"
             "\t.align\t"
          << typeWidthInBytes(getPointerType()) << "\n"
          << JT.getName().toString() << ":";

      // On x86-64 ILP32, pointers are 32-bit, hence the use of .long.
      for (intptr_t TargetOffset : JT.getTargetOffsets())
        Str << "\n\t.long\t" << JT.getFunctionName() << "+" << TargetOffset;
      Str << "\n";
    }
  } break;
  }
}

template <typename TraitsType>
void TargetDataX86<TraitsType>::lowerGlobals(
    const VariableDeclarationList &Vars, const std::string &SectionSuffix) {
  const bool IsPIC = getFlags().getUseNonsfi();
  switch (getFlags().getOutFileType()) {
  case FT_Elf: {
    ELFObjectWriter *Writer = Ctx->getObjectWriter();
    Writer->writeDataSection(Vars, Traits::FK_Abs, SectionSuffix, IsPIC);
  } break;
  case FT_Asm:
  case FT_Iasm: {
    OstreamLocker L(Ctx);
    for (const VariableDeclaration *Var : Vars) {
      if (getFlags().matchTranslateOnly(Var->getName(), 0)) {
        emitGlobal(*Var, SectionSuffix);
      }
    }
  } break;
  }
}

} // end of namespace X86NAMESPACE
} // end of namespace Ice

#endif // SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H