1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "Debug.hpp"
16 #include "EmulatedIntrinsics.hpp"
17 #include "OptimalIntrinsics.hpp"
18 #include "Print.hpp"
19 #include "Reactor.hpp"
20 #include "ReactorDebugInfo.hpp"
21
22 #include "ExecutableMemory.hpp"
23 #include "Optimizer.hpp"
24
25 #include "src/IceCfg.h"
26 #include "src/IceCfgNode.h"
27 #include "src/IceELFObjectWriter.h"
28 #include "src/IceELFStreamer.h"
29 #include "src/IceGlobalContext.h"
30 #include "src/IceGlobalInits.h"
31 #include "src/IceTypes.h"
32
33 #include "llvm/Support/Compiler.h"
34 #include "llvm/Support/FileSystem.h"
35 #include "llvm/Support/ManagedStatic.h"
36 #include "llvm/Support/raw_os_ostream.h"
37
38 #include "marl/event.h"
39
40 #if __has_feature(memory_sanitizer)
41 # include <sanitizer/msan_interface.h>
42 #endif
43
44 #if defined(_WIN32)
45 # ifndef WIN32_LEAN_AND_MEAN
46 # define WIN32_LEAN_AND_MEAN
47 # endif // !WIN32_LEAN_AND_MEAN
48 # ifndef NOMINMAX
49 # define NOMINMAX
50 # endif // !NOMINMAX
51 # include <Windows.h>
52 #endif
53
54 #include <array>
55 #include <iostream>
56 #include <limits>
57 #include <mutex>
58
59 // Subzero utility functions
60 // These functions only accept and return Subzero (Ice) types, and do not access any globals.
61 namespace {
62 namespace sz {
63
createFunction(Ice::GlobalContext * context,Ice::Type returnType,const std::vector<Ice::Type> & paramTypes)64 Ice::Cfg *createFunction(Ice::GlobalContext *context, Ice::Type returnType, const std::vector<Ice::Type> ¶mTypes)
65 {
66 uint32_t sequenceNumber = 0;
67 auto *function = Ice::Cfg::create(context, sequenceNumber).release();
68
69 function->setStackSizeLimit(512 * 1024); // 512 KiB
70
71 Ice::CfgLocalAllocatorScope allocScope{ function };
72
73 for(auto type : paramTypes)
74 {
75 Ice::Variable *arg = function->makeVariable(type);
76 function->addArg(arg);
77 }
78
79 Ice::CfgNode *node = function->makeNode();
80 function->setEntryNode(node);
81
82 return function;
83 }
84
getPointerType(Ice::Type elementType)85 Ice::Type getPointerType(Ice::Type elementType)
86 {
87 if(sizeof(void *) == 8)
88 {
89 return Ice::IceType_i64;
90 }
91 else
92 {
93 return Ice::IceType_i32;
94 }
95 }
96
allocateStackVariable(Ice::Cfg * function,Ice::Type type,int arraySize=0)97 Ice::Variable *allocateStackVariable(Ice::Cfg *function, Ice::Type type, int arraySize = 0)
98 {
99 int typeSize = Ice::typeWidthInBytes(type);
100 int totalSize = typeSize * (arraySize ? arraySize : 1);
101
102 auto bytes = Ice::ConstantInteger32::create(function->getContext(), Ice::IceType_i32, totalSize);
103 auto address = function->makeVariable(getPointerType(type));
104 auto alloca = Ice::InstAlloca::create(function, address, bytes, typeSize); // SRoA depends on the alignment to match the type size.
105 function->getEntryNode()->getInsts().push_front(alloca);
106
107 return address;
108 }
109
getConstantPointer(Ice::GlobalContext * context,void const * ptr)110 Ice::Constant *getConstantPointer(Ice::GlobalContext *context, void const *ptr)
111 {
112 if(sizeof(void *) == 8)
113 {
114 return context->getConstantInt64(reinterpret_cast<intptr_t>(ptr));
115 }
116 else
117 {
118 return context->getConstantInt32(reinterpret_cast<intptr_t>(ptr));
119 }
120 }
121
122 // TODO(amaiorano): remove this prototype once these are moved to separate header/cpp
123 Ice::Variable *createTruncate(Ice::Cfg *function, Ice::CfgNode *basicBlock, Ice::Operand *from, Ice::Type toType);
124
125 // Wrapper for calls on C functions with Ice types
Call(Ice::Cfg * function,Ice::CfgNode * basicBlock,Ice::Type retTy,Ice::Operand * callTarget,const std::vector<Ice::Operand * > & iceArgs,bool isVariadic)126 Ice::Variable *Call(Ice::Cfg *function, Ice::CfgNode *basicBlock, Ice::Type retTy, Ice::Operand *callTarget, const std::vector<Ice::Operand *> &iceArgs, bool isVariadic)
127 {
128 Ice::Variable *ret = nullptr;
129
130 // Subzero doesn't support boolean return values. Replace with an i32 temporarily,
131 // then truncate result to bool.
132 // TODO(b/151158858): Add support to Subzero's InstCall for bool-returning functions
133 const bool returningBool = (retTy == Ice::IceType_i1);
134 if(returningBool)
135 {
136 ret = function->makeVariable(Ice::IceType_i32);
137 }
138 else if(retTy != Ice::IceType_void)
139 {
140 ret = function->makeVariable(retTy);
141 }
142
143 auto call = Ice::InstCall::create(function, iceArgs.size(), ret, callTarget, false, false, isVariadic);
144 for(auto arg : iceArgs)
145 {
146 call->addArg(arg);
147 }
148
149 basicBlock->appendInst(call);
150
151 if(returningBool)
152 {
153 // Truncate result to bool so that if any (lsb) bits were set, result will be true
154 ret = createTruncate(function, basicBlock, ret, Ice::IceType_i1);
155 }
156
157 return ret;
158 }
159
Call(Ice::Cfg * function,Ice::CfgNode * basicBlock,Ice::Type retTy,void const * fptr,const std::vector<Ice::Operand * > & iceArgs,bool isVariadic)160 Ice::Variable *Call(Ice::Cfg *function, Ice::CfgNode *basicBlock, Ice::Type retTy, void const *fptr, const std::vector<Ice::Operand *> &iceArgs, bool isVariadic)
161 {
162 Ice::Operand *callTarget = getConstantPointer(function->getContext(), fptr);
163 return Call(function, basicBlock, retTy, callTarget, iceArgs, isVariadic);
164 }
165
166 // Wrapper for calls on C functions with Ice types
167 template<typename Return, typename... CArgs, typename... RArgs>
Call(Ice::Cfg * function,Ice::CfgNode * basicBlock,Return (fptr)(CArgs...),RArgs &&...args)168 Ice::Variable *Call(Ice::Cfg *function, Ice::CfgNode *basicBlock, Return(fptr)(CArgs...), RArgs &&... args)
169 {
170 static_assert(sizeof...(CArgs) == sizeof...(RArgs), "Expected number of args don't match");
171
172 Ice::Type retTy = T(rr::CToReactorT<Return>::type());
173 std::vector<Ice::Operand *> iceArgs{ std::forward<RArgs>(args)... };
174 return Call(function, basicBlock, retTy, reinterpret_cast<void const *>(fptr), iceArgs, false);
175 }
176
createTruncate(Ice::Cfg * function,Ice::CfgNode * basicBlock,Ice::Operand * from,Ice::Type toType)177 Ice::Variable *createTruncate(Ice::Cfg *function, Ice::CfgNode *basicBlock, Ice::Operand *from, Ice::Type toType)
178 {
179 Ice::Variable *to = function->makeVariable(toType);
180 Ice::InstCast *cast = Ice::InstCast::create(function, Ice::InstCast::Trunc, to, from);
181 basicBlock->appendInst(cast);
182 return to;
183 }
184
createLoad(Ice::Cfg * function,Ice::CfgNode * basicBlock,Ice::Operand * ptr,Ice::Type type,unsigned int align)185 Ice::Variable *createLoad(Ice::Cfg *function, Ice::CfgNode *basicBlock, Ice::Operand *ptr, Ice::Type type, unsigned int align)
186 {
187 Ice::Variable *result = function->makeVariable(type);
188 auto load = Ice::InstLoad::create(function, result, ptr, align);
189 basicBlock->appendInst(load);
190
191 return result;
192 }
193
194 } // namespace sz
195 } // namespace
196
197 namespace rr {
198 class ELFMemoryStreamer;
199 class CoroutineGenerator;
200 } // namespace rr
201
202 namespace {
203
204 // Used to automatically invoke llvm_shutdown() when driver is unloaded
205 llvm::llvm_shutdown_obj llvmShutdownObj;
206
207 // Default configuration settings. Must be accessed under mutex lock.
208 std::mutex defaultConfigLock;
defaultConfig()209 rr::Config &defaultConfig()
210 {
211 // This uses a static in a function to avoid the cost of a global static
212 // initializer. See http://neugierig.org/software/chromium/notes/2011/08/static-initializers.html
213 static rr::Config config = rr::Config::Edit()
214 .apply({});
215 return config;
216 }
217
218 Ice::GlobalContext *context = nullptr;
219 Ice::Cfg *function = nullptr;
220 Ice::CfgNode *entryBlock = nullptr;
221 Ice::CfgNode *basicBlockTop = nullptr;
222 Ice::CfgNode *basicBlock = nullptr;
223 Ice::CfgLocalAllocatorScope *allocator = nullptr;
224 rr::ELFMemoryStreamer *routine = nullptr;
225
226 std::mutex codegenMutex;
227
228 Ice::ELFFileStreamer *elfFile = nullptr;
229 Ice::Fdstream *out = nullptr;
230
231 // Coroutine globals
232 rr::Type *coroYieldType = nullptr;
233 std::shared_ptr<rr::CoroutineGenerator> coroGen;
getOrCreateScheduler()234 marl::Scheduler &getOrCreateScheduler()
235 {
236 static auto scheduler = [] {
237 marl::Scheduler::Config cfg;
238 cfg.setWorkerThreadCount(8);
239 return std::make_unique<marl::Scheduler>(cfg);
240 }();
241
242 return *scheduler;
243 }
244
245 rr::Nucleus::OptimizerCallback *optimizerCallback = nullptr;
246
247 } // Anonymous namespace
248
249 namespace {
250
251 #if !defined(__i386__) && defined(_M_IX86)
252 # define __i386__ 1
253 #endif
254
255 #if !defined(__x86_64__) && (defined(_M_AMD64) || defined(_M_X64))
256 # define __x86_64__ 1
257 #endif
258
toIce(rr::Optimization::Level level)259 Ice::OptLevel toIce(rr::Optimization::Level level)
260 {
261 switch(level)
262 {
263 // Note that Opt_0 and Opt_1 are not implemented by Subzero
264 case rr::Optimization::Level::None: return Ice::Opt_m1;
265 case rr::Optimization::Level::Less: return Ice::Opt_m1;
266 case rr::Optimization::Level::Default: return Ice::Opt_2;
267 case rr::Optimization::Level::Aggressive: return Ice::Opt_2;
268 default: UNREACHABLE("Unknown Optimization Level %d", int(level));
269 }
270 return Ice::Opt_2;
271 }
272
stdToIceMemoryOrder(std::memory_order memoryOrder)273 Ice::Intrinsics::MemoryOrder stdToIceMemoryOrder(std::memory_order memoryOrder)
274 {
275 switch(memoryOrder)
276 {
277 case std::memory_order_relaxed: return Ice::Intrinsics::MemoryOrderRelaxed;
278 case std::memory_order_consume: return Ice::Intrinsics::MemoryOrderConsume;
279 case std::memory_order_acquire: return Ice::Intrinsics::MemoryOrderAcquire;
280 case std::memory_order_release: return Ice::Intrinsics::MemoryOrderRelease;
281 case std::memory_order_acq_rel: return Ice::Intrinsics::MemoryOrderAcquireRelease;
282 case std::memory_order_seq_cst: return Ice::Intrinsics::MemoryOrderSequentiallyConsistent;
283 }
284 return Ice::Intrinsics::MemoryOrderInvalid;
285 }
286
// Host CPU feature detection. The public flags are computed once, when the
// static members below are initialized.
class CPUID
{
public:
	const static bool ARM;
	const static bool SSE4_1;

private:
	// Fills |registers| with EAX, EBX, ECX, EDX (in that order) of the CPUID
	// leaf |info|. On non-x86 targets all four entries are set to zero.
	static void cpuid(int registers[4], int info)
	{
#if defined(__i386__) || defined(__x86_64__)
#	if defined(_WIN32)
		__cpuid(registers, info);
#	else
		__asm volatile("cpuid"
		               : "=a"(registers[0]), "=b"(registers[1]), "=c"(registers[2]), "=d"(registers[3])
		               : "a"(info));
#	endif
#else
		for(int i = 0; i < 4; i++)
		{
			registers[i] = 0;
		}
#endif
	}

	// Compile-time architecture check; unknown architectures fail the build.
	static bool detectARM()
	{
#if defined(__arm__) || defined(__aarch64__)
		return true;
#elif defined(__i386__) || defined(__x86_64__) || defined(__mips__)
		return false;
#else
#	error "Unknown architecture"
#endif
	}

	// Tests bit 19 of ECX from CPUID leaf 1. Always false on non-x86 targets.
	static bool detectSSE4_1()
	{
#if defined(__i386__) || defined(__x86_64__)
		int regs[4];
		cpuid(regs, 1);

		const int ecx = regs[2];
		return (ecx & 0x00080000) != 0;
#else
		return false;
#endif
	}
};

const bool CPUID::ARM = CPUID::detectARM();
const bool CPUID::SSE4_1 = CPUID::detectSSE4_1();
const bool emulateIntrinsics = false;
const bool emulateMismatchedBitCast = CPUID::ARM;

// Debugging switches; both require Subzero to be built with ALLOW_DUMP=1.
constexpr bool subzeroDumpEnabled = false;
constexpr bool subzeroEmitTextAsm = false;

#if !ALLOW_DUMP
static_assert(!subzeroDumpEnabled, "Compile Subzero with ALLOW_DUMP=1 for subzeroDumpEnabled");
static_assert(!subzeroEmitTextAsm, "Compile Subzero with ALLOW_DUMP=1 for subzeroEmitTextAsm");
#endif
349
350 } // anonymous namespace
351
352 namespace rr {
353
// Returns the human-readable name of this Reactor backend.
std::string BackendName()
{
	return std::string("Subzero");
}
358
359 const Capabilities Caps = {
360 true, // CoroutinesSupported
361 };
362
363 enum EmulatedType
364 {
365 EmulatedShift = 16,
366 EmulatedV2 = 2 << EmulatedShift,
367 EmulatedV4 = 4 << EmulatedShift,
368 EmulatedV8 = 8 << EmulatedShift,
369 EmulatedBits = EmulatedV2 | EmulatedV4 | EmulatedV8,
370
371 Type_v2i32 = Ice::IceType_v4i32 | EmulatedV2,
372 Type_v4i16 = Ice::IceType_v8i16 | EmulatedV4,
373 Type_v2i16 = Ice::IceType_v8i16 | EmulatedV2,
374 Type_v8i8 = Ice::IceType_v16i8 | EmulatedV8,
375 Type_v4i8 = Ice::IceType_v16i8 | EmulatedV4,
376 Type_v2f32 = Ice::IceType_v4f32 | EmulatedV2,
377 };
378
379 class Value : public Ice::Operand
380 {};
381 class SwitchCases : public Ice::InstSwitch
382 {};
383 class BasicBlock : public Ice::CfgNode
384 {};
385
T(Type * t)386 Ice::Type T(Type *t)
387 {
388 static_assert(static_cast<unsigned int>(Ice::IceType_NUM) < static_cast<unsigned int>(EmulatedBits), "Ice::Type overlaps with our emulated types!");
389 return (Ice::Type)(reinterpret_cast<std::intptr_t>(t) & ~EmulatedBits);
390 }
391
T(Ice::Type t)392 Type *T(Ice::Type t)
393 {
394 return reinterpret_cast<Type *>(t);
395 }
396
T(EmulatedType t)397 Type *T(EmulatedType t)
398 {
399 return reinterpret_cast<Type *>(t);
400 }
401
T(const std::vector<Type * > & types)402 std::vector<Ice::Type> T(const std::vector<Type *> &types)
403 {
404 std::vector<Ice::Type> result;
405 result.reserve(types.size());
406 for(auto &t : types)
407 {
408 result.push_back(T(t));
409 }
410 return result;
411 }
412
V(Ice::Operand * v)413 Value *V(Ice::Operand *v)
414 {
415 return reinterpret_cast<Value *>(v);
416 }
417
V(Value * v)418 Ice::Operand *V(Value *v)
419 {
420 return reinterpret_cast<Ice::Operand *>(v);
421 }
422
V(const std::vector<Value * > & values)423 std::vector<Ice::Operand *> V(const std::vector<Value *> &values)
424 {
425 std::vector<Ice::Operand *> result;
426 result.reserve(values.size());
427 for(auto &v : values)
428 {
429 result.push_back(V(v));
430 }
431 return result;
432 }
433
B(Ice::CfgNode * b)434 BasicBlock *B(Ice::CfgNode *b)
435 {
436 return reinterpret_cast<BasicBlock *>(b);
437 }
438
typeSize(Type * type)439 static size_t typeSize(Type *type)
440 {
441 if(reinterpret_cast<std::intptr_t>(type) & EmulatedBits)
442 {
443 switch(reinterpret_cast<std::intptr_t>(type))
444 {
445 case Type_v2i32: return 8;
446 case Type_v4i16: return 8;
447 case Type_v2i16: return 4;
448 case Type_v8i8: return 8;
449 case Type_v4i8: return 4;
450 case Type_v2f32: return 8;
451 default: ASSERT(false);
452 }
453 }
454
455 return Ice::typeWidthInBytes(T(type));
456 }
457
finalizeFunction()458 static void finalizeFunction()
459 {
460 // Create a return if none was added
461 if(::basicBlock->getInsts().empty() || ::basicBlock->getInsts().back().getKind() != Ice::Inst::Ret)
462 {
463 Nucleus::createRetVoid();
464 }
465
466 // Connect the entry block to the top of the initial basic block
467 auto br = Ice::InstBr::create(::function, ::basicBlockTop);
468 ::entryBlock->appendInst(br);
469 }
470
471 using ElfHeader = std::conditional<sizeof(void *) == 8, Elf64_Ehdr, Elf32_Ehdr>::type;
472 using SectionHeader = std::conditional<sizeof(void *) == 8, Elf64_Shdr, Elf32_Shdr>::type;
473
sectionHeader(const ElfHeader * elfHeader)474 inline const SectionHeader *sectionHeader(const ElfHeader *elfHeader)
475 {
476 return reinterpret_cast<const SectionHeader *>((intptr_t)elfHeader + elfHeader->e_shoff);
477 }
478
elfSection(const ElfHeader * elfHeader,int index)479 inline const SectionHeader *elfSection(const ElfHeader *elfHeader, int index)
480 {
481 return §ionHeader(elfHeader)[index];
482 }
483
relocateSymbol(const ElfHeader * elfHeader,const Elf32_Rel & relocation,const SectionHeader & relocationTable)484 static void *relocateSymbol(const ElfHeader *elfHeader, const Elf32_Rel &relocation, const SectionHeader &relocationTable)
485 {
486 const SectionHeader *target = elfSection(elfHeader, relocationTable.sh_info);
487
488 uint32_t index = relocation.getSymbol();
489 int table = relocationTable.sh_link;
490 void *symbolValue = nullptr;
491
492 if(index != SHN_UNDEF)
493 {
494 if(table == SHN_UNDEF) return nullptr;
495 const SectionHeader *symbolTable = elfSection(elfHeader, table);
496
497 uint32_t symtab_entries = symbolTable->sh_size / symbolTable->sh_entsize;
498 if(index >= symtab_entries)
499 {
500 ASSERT(index < symtab_entries && "Symbol Index out of range");
501 return nullptr;
502 }
503
504 intptr_t symbolAddress = (intptr_t)elfHeader + symbolTable->sh_offset;
505 Elf32_Sym &symbol = ((Elf32_Sym *)symbolAddress)[index];
506 uint16_t section = symbol.st_shndx;
507
508 if(section != SHN_UNDEF && section < SHN_LORESERVE)
509 {
510 const SectionHeader *target = elfSection(elfHeader, symbol.st_shndx);
511 symbolValue = reinterpret_cast<void *>((intptr_t)elfHeader + symbol.st_value + target->sh_offset);
512 }
513 else
514 {
515 return nullptr;
516 }
517 }
518
519 intptr_t address = (intptr_t)elfHeader + target->sh_offset;
520 unaligned_ptr<int32_t> patchSite = (int32_t *)(address + relocation.r_offset);
521
522 if(CPUID::ARM)
523 {
524 switch(relocation.getType())
525 {
526 case R_ARM_NONE:
527 // No relocation
528 break;
529 case R_ARM_MOVW_ABS_NC:
530 {
531 uint32_t thumb = 0; // Calls to Thumb code not supported.
532 uint32_t lo = (uint32_t)(intptr_t)symbolValue | thumb;
533 *patchSite = (*patchSite & 0xFFF0F000) | ((lo & 0xF000) << 4) | (lo & 0x0FFF);
534 }
535 break;
536 case R_ARM_MOVT_ABS:
537 {
538 uint32_t hi = (uint32_t)(intptr_t)(symbolValue) >> 16;
539 *patchSite = (*patchSite & 0xFFF0F000) | ((hi & 0xF000) << 4) | (hi & 0x0FFF);
540 }
541 break;
542 default:
543 ASSERT(false && "Unsupported relocation type");
544 return nullptr;
545 }
546 }
547 else
548 {
549 switch(relocation.getType())
550 {
551 case R_386_NONE:
552 // No relocation
553 break;
554 case R_386_32:
555 *patchSite = (int32_t)((intptr_t)symbolValue + *patchSite);
556 break;
557 case R_386_PC32:
558 *patchSite = (int32_t)((intptr_t)symbolValue + *patchSite - (intptr_t)patchSite);
559 break;
560 default:
561 ASSERT(false && "Unsupported relocation type");
562 return nullptr;
563 }
564 }
565
566 return symbolValue;
567 }
568
relocateSymbol(const ElfHeader * elfHeader,const Elf64_Rela & relocation,const SectionHeader & relocationTable)569 static void *relocateSymbol(const ElfHeader *elfHeader, const Elf64_Rela &relocation, const SectionHeader &relocationTable)
570 {
571 const SectionHeader *target = elfSection(elfHeader, relocationTable.sh_info);
572
573 uint32_t index = relocation.getSymbol();
574 int table = relocationTable.sh_link;
575 void *symbolValue = nullptr;
576
577 if(index != SHN_UNDEF)
578 {
579 if(table == SHN_UNDEF) return nullptr;
580 const SectionHeader *symbolTable = elfSection(elfHeader, table);
581
582 uint32_t symtab_entries = symbolTable->sh_size / symbolTable->sh_entsize;
583 if(index >= symtab_entries)
584 {
585 ASSERT(index < symtab_entries && "Symbol Index out of range");
586 return nullptr;
587 }
588
589 intptr_t symbolAddress = (intptr_t)elfHeader + symbolTable->sh_offset;
590 Elf64_Sym &symbol = ((Elf64_Sym *)symbolAddress)[index];
591 uint16_t section = symbol.st_shndx;
592
593 if(section != SHN_UNDEF && section < SHN_LORESERVE)
594 {
595 const SectionHeader *target = elfSection(elfHeader, symbol.st_shndx);
596 symbolValue = reinterpret_cast<void *>((intptr_t)elfHeader + symbol.st_value + target->sh_offset);
597 }
598 else
599 {
600 return nullptr;
601 }
602 }
603
604 intptr_t address = (intptr_t)elfHeader + target->sh_offset;
605 unaligned_ptr<int32_t> patchSite32 = (int32_t *)(address + relocation.r_offset);
606 unaligned_ptr<int64_t> patchSite64 = (int64_t *)(address + relocation.r_offset);
607
608 switch(relocation.getType())
609 {
610 case R_X86_64_NONE:
611 // No relocation
612 break;
613 case R_X86_64_64:
614 *patchSite64 = (int64_t)((intptr_t)symbolValue + *patchSite64 + relocation.r_addend);
615 break;
616 case R_X86_64_PC32:
617 *patchSite32 = (int32_t)((intptr_t)symbolValue + *patchSite32 - (intptr_t)patchSite32 + relocation.r_addend);
618 break;
619 case R_X86_64_32S:
620 *patchSite32 = (int32_t)((intptr_t)symbolValue + *patchSite32 + relocation.r_addend);
621 break;
622 default:
623 ASSERT(false && "Unsupported relocation type");
624 return nullptr;
625 }
626
627 return symbolValue;
628 }
629
// Location of one generated function inside a loaded ELF image.
struct EntryPoint
{
	// Address of the first byte of the function's code; nullptr until assigned.
	const void *entry = nullptr;
	// Size of the function's code section in bytes.
	size_t codeSize = 0;
};
635
loadImage(uint8_t * const elfImage,const std::vector<const char * > & functionNames)636 std::vector<EntryPoint> loadImage(uint8_t *const elfImage, const std::vector<const char *> &functionNames)
637 {
638 ASSERT(functionNames.size() > 0);
639 std::vector<EntryPoint> entryPoints(functionNames.size());
640
641 ElfHeader *elfHeader = (ElfHeader *)elfImage;
642
643 // TODO: assert?
644 if(!elfHeader->checkMagic())
645 {
646 return {};
647 }
648
649 // Expect ELF bitness to match platform
650 ASSERT(sizeof(void *) == 8 ? elfHeader->getFileClass() == ELFCLASS64 : elfHeader->getFileClass() == ELFCLASS32);
651 #if defined(__i386__)
652 ASSERT(sizeof(void *) == 4 && elfHeader->e_machine == EM_386);
653 #elif defined(__x86_64__)
654 ASSERT(sizeof(void *) == 8 && elfHeader->e_machine == EM_X86_64);
655 #elif defined(__arm__)
656 ASSERT(sizeof(void *) == 4 && elfHeader->e_machine == EM_ARM);
657 #elif defined(__aarch64__)
658 ASSERT(sizeof(void *) == 8 && elfHeader->e_machine == EM_AARCH64);
659 #elif defined(__mips__)
660 ASSERT(sizeof(void *) == 4 && elfHeader->e_machine == EM_MIPS);
661 #else
662 # error "Unsupported platform"
663 #endif
664
665 SectionHeader *sectionHeader = (SectionHeader *)(elfImage + elfHeader->e_shoff);
666
667 for(int i = 0; i < elfHeader->e_shnum; i++)
668 {
669 if(sectionHeader[i].sh_type == SHT_PROGBITS)
670 {
671 if(sectionHeader[i].sh_flags & SHF_EXECINSTR)
672 {
673 auto findSectionNameEntryIndex = [&]() -> size_t {
674 auto sectionNameOffset = sectionHeader[elfHeader->e_shstrndx].sh_offset + sectionHeader[i].sh_name;
675 const char *sectionName = reinterpret_cast<const char *>(elfImage + sectionNameOffset);
676
677 for(size_t j = 0; j < functionNames.size(); ++j)
678 {
679 if(strstr(sectionName, functionNames[j]) != nullptr)
680 {
681 return j;
682 }
683 }
684
685 UNREACHABLE("Failed to find executable section that matches input function names");
686 return static_cast<size_t>(-1);
687 };
688
689 size_t index = findSectionNameEntryIndex();
690 entryPoints[index].entry = elfImage + sectionHeader[i].sh_offset;
691 entryPoints[index].codeSize = sectionHeader[i].sh_size;
692 }
693 }
694 else if(sectionHeader[i].sh_type == SHT_REL)
695 {
696 ASSERT(sizeof(void *) == 4 && "UNIMPLEMENTED"); // Only expected/implemented for 32-bit code
697
698 for(Elf32_Word index = 0; index < sectionHeader[i].sh_size / sectionHeader[i].sh_entsize; index++)
699 {
700 const Elf32_Rel &relocation = ((const Elf32_Rel *)(elfImage + sectionHeader[i].sh_offset))[index];
701 relocateSymbol(elfHeader, relocation, sectionHeader[i]);
702 }
703 }
704 else if(sectionHeader[i].sh_type == SHT_RELA)
705 {
706 ASSERT(sizeof(void *) == 8 && "UNIMPLEMENTED"); // Only expected/implemented for 64-bit code
707
708 for(Elf32_Word index = 0; index < sectionHeader[i].sh_size / sectionHeader[i].sh_entsize; index++)
709 {
710 const Elf64_Rela &relocation = ((const Elf64_Rela *)(elfImage + sectionHeader[i].sh_offset))[index];
711 relocateSymbol(elfHeader, relocation, sectionHeader[i]);
712 }
713 }
714 }
715
716 return entryPoints;
717 }
718
719 template<typename T>
720 struct ExecutableAllocator
721 {
ExecutableAllocatorrr::ExecutableAllocator722 ExecutableAllocator() {}
723 template<class U>
ExecutableAllocatorrr::ExecutableAllocator724 ExecutableAllocator(const ExecutableAllocator<U> &other)
725 {}
726
727 using value_type = T;
728 using size_type = std::size_t;
729
allocaterr::ExecutableAllocator730 T *allocate(size_type n)
731 {
732 return (T *)allocateMemoryPages(
733 sizeof(T) * n, PERMISSION_READ | PERMISSION_WRITE, true);
734 }
735
deallocaterr::ExecutableAllocator736 void deallocate(T *p, size_type n)
737 {
738 deallocateMemoryPages(p, sizeof(T) * n);
739 }
740 };
741
742 class ELFMemoryStreamer : public Ice::ELFStreamer, public Routine
743 {
744 ELFMemoryStreamer(const ELFMemoryStreamer &) = delete;
745 ELFMemoryStreamer &operator=(const ELFMemoryStreamer &) = delete;
746
747 public:
ELFMemoryStreamer()748 ELFMemoryStreamer()
749 : Routine()
750 {
751 position = 0;
752 buffer.reserve(0x1000);
753 }
754
~ELFMemoryStreamer()755 ~ELFMemoryStreamer() override
756 {
757 }
758
write8(uint8_t Value)759 void write8(uint8_t Value) override
760 {
761 if(position == (uint64_t)buffer.size())
762 {
763 buffer.push_back(Value);
764 position++;
765 }
766 else if(position < (uint64_t)buffer.size())
767 {
768 buffer[position] = Value;
769 position++;
770 }
771 else
772 ASSERT(false && "UNIMPLEMENTED");
773 }
774
writeBytes(llvm::StringRef Bytes)775 void writeBytes(llvm::StringRef Bytes) override
776 {
777 std::size_t oldSize = buffer.size();
778 buffer.resize(oldSize + Bytes.size());
779 memcpy(&buffer[oldSize], Bytes.begin(), Bytes.size());
780 position += Bytes.size();
781 }
782
tell() const783 uint64_t tell() const override { return position; }
784
seek(uint64_t Off)785 void seek(uint64_t Off) override { position = Off; }
786
loadImageAndGetEntryPoints(const std::vector<const char * > & functionNames)787 std::vector<EntryPoint> loadImageAndGetEntryPoints(const std::vector<const char *> &functionNames)
788 {
789 auto entryPoints = loadImage(&buffer[0], functionNames);
790
791 #if defined(_WIN32)
792 FlushInstructionCache(GetCurrentProcess(), NULL, 0);
793 #else
794 for(auto &entryPoint : entryPoints)
795 {
796 __builtin___clear_cache((char *)entryPoint.entry, (char *)entryPoint.entry + entryPoint.codeSize);
797 }
798 #endif
799
800 return entryPoints;
801 }
802
finalize()803 void finalize()
804 {
805 position = std::numeric_limits<std::size_t>::max(); // Can't stream more data after this
806
807 protectMemoryPages(&buffer[0], buffer.size(), PERMISSION_READ | PERMISSION_EXECUTE);
808 }
809
setEntry(int index,const void * func)810 void setEntry(int index, const void *func)
811 {
812 ASSERT(func);
813 funcs[index] = func;
814 }
815
getEntry(int index) const816 const void *getEntry(int index) const override
817 {
818 ASSERT(funcs[index]);
819 return funcs[index];
820 }
821
addConstantData(const void * data,size_t size,size_t alignment=1)822 const void *addConstantData(const void *data, size_t size, size_t alignment = 1)
823 {
824 // Check if we already have a suitable constant.
825 for(const auto &c : constantsPool)
826 {
827 void *ptr = c.data.get();
828 size_t space = c.space;
829
830 void *alignedPtr = std::align(alignment, size, ptr, space);
831
832 if(space < size)
833 {
834 continue;
835 }
836
837 if(memcmp(data, alignedPtr, size) == 0)
838 {
839 return alignedPtr;
840 }
841 }
842
843 // TODO(b/148086935): Replace with a buffer allocator.
844 size_t space = size + alignment;
845 auto buf = std::unique_ptr<uint8_t[]>(new uint8_t[space]);
846 void *ptr = buf.get();
847 void *alignedPtr = std::align(alignment, size, ptr, space);
848 ASSERT(alignedPtr);
849 memcpy(alignedPtr, data, size);
850 constantsPool.emplace_back(std::move(buf), space);
851
852 return alignedPtr;
853 }
854
855 private:
856 struct Constant
857 {
Constantrr::ELFMemoryStreamer::Constant858 Constant(std::unique_ptr<uint8_t[]> data, size_t space)
859 : data(std::move(data))
860 , space(space)
861 {}
862
863 std::unique_ptr<uint8_t[]> data;
864 size_t space;
865 };
866
867 std::array<const void *, Nucleus::CoroutineEntryCount> funcs = {};
868 std::vector<uint8_t, ExecutableAllocator<uint8_t>> buffer;
869 std::size_t position;
870 std::vector<Constant> constantsPool;
871 };
872
#ifdef ENABLE_RR_PRINT
// Emits a variadic call to rr::DebugPrintf with |vals| as its arguments.
void VPrintf(const std::vector<Value *> &vals)
{
	const void *printfTarget = reinterpret_cast<const void *>(rr::DebugPrintf);
	const bool isVariadic = true;

	sz::Call(::function, ::basicBlock, Ice::IceType_i32, printfTarget, V(vals), isVariadic);
}
#endif  // ENABLE_RR_PRINT
879
Nucleus()880 Nucleus::Nucleus()
881 {
882 ::codegenMutex.lock(); // SubzeroReactor is currently not thread safe
883
884 Ice::ClFlags &Flags = Ice::ClFlags::Flags;
885 Ice::ClFlags::getParsedClFlags(Flags);
886
887 #if defined(__arm__)
888 Flags.setTargetArch(Ice::Target_ARM32);
889 Flags.setTargetInstructionSet(Ice::ARM32InstructionSet_HWDivArm);
890 #elif defined(__mips__)
891 Flags.setTargetArch(Ice::Target_MIPS32);
892 Flags.setTargetInstructionSet(Ice::BaseInstructionSet);
893 #else // x86
894 Flags.setTargetArch(sizeof(void *) == 8 ? Ice::Target_X8664 : Ice::Target_X8632);
895 Flags.setTargetInstructionSet(CPUID::SSE4_1 ? Ice::X86InstructionSet_SSE4_1 : Ice::X86InstructionSet_SSE2);
896 #endif
897 Flags.setOutFileType(Ice::FT_Elf);
898 Flags.setOptLevel(toIce(getDefaultConfig().getOptimization().getLevel()));
899 Flags.setApplicationBinaryInterface(Ice::ABI_Platform);
900 Flags.setVerbose(subzeroDumpEnabled ? Ice::IceV_Most : Ice::IceV_None);
901 Flags.setDisableHybridAssembly(true);
902
903 // Emit functions into separate sections in the ELF so we can find them by name
904 Flags.setFunctionSections(true);
905
906 static llvm::raw_os_ostream cout(std::cout);
907 static llvm::raw_os_ostream cerr(std::cerr);
908
909 if(subzeroEmitTextAsm)
910 {
911 // Decorate text asm with liveness info
912 Flags.setDecorateAsm(true);
913 }
914
915 if(false) // Write out to a file
916 {
917 std::error_code errorCode;
918 ::out = new Ice::Fdstream("out.o", errorCode, llvm::sys::fs::F_None);
919 ::elfFile = new Ice::ELFFileStreamer(*out);
920 ::context = new Ice::GlobalContext(&cout, &cout, &cerr, elfFile);
921 }
922 else
923 {
924 ELFMemoryStreamer *elfMemory = new ELFMemoryStreamer();
925 ::context = new Ice::GlobalContext(&cout, &cout, &cerr, elfMemory);
926 ::routine = elfMemory;
927 }
928
929 #if !__has_feature(memory_sanitizer)
930 // thread_local variables in shared libraries are initialized at load-time,
931 // but this is not observed by MemorySanitizer if the loader itself was not
932 // instrumented, leading to false-positive unitialized variable errors.
933 ASSERT(Variable::unmaterializedVariables == nullptr);
934 #endif
935 Variable::unmaterializedVariables = new Variable::UnmaterializedVariables{};
936 }
937
~Nucleus()938 Nucleus::~Nucleus()
939 {
940 delete Variable::unmaterializedVariables;
941 Variable::unmaterializedVariables = nullptr;
942
943 delete ::routine;
944 ::routine = nullptr;
945
946 delete ::allocator;
947 ::allocator = nullptr;
948
949 delete ::function;
950 ::function = nullptr;
951
952 delete ::context;
953 ::context = nullptr;
954
955 delete ::elfFile;
956 ::elfFile = nullptr;
957
958 delete ::out;
959 ::out = nullptr;
960
961 ::entryBlock = nullptr;
962 ::basicBlock = nullptr;
963 ::basicBlockTop = nullptr;
964
965 ::codegenMutex.unlock();
966 }
967
setDefaultConfig(const Config & cfg)968 void Nucleus::setDefaultConfig(const Config &cfg)
969 {
970 std::unique_lock<std::mutex> lock(::defaultConfigLock);
971 ::defaultConfig() = cfg;
972 }
973
adjustDefaultConfig(const Config::Edit & cfgEdit)974 void Nucleus::adjustDefaultConfig(const Config::Edit &cfgEdit)
975 {
976 std::unique_lock<std::mutex> lock(::defaultConfigLock);
977 auto &config = ::defaultConfig();
978 config = cfgEdit.apply(config);
979 }
980
getDefaultConfig()981 Config Nucleus::getDefaultConfig()
982 {
983 std::unique_lock<std::mutex> lock(::defaultConfigLock);
984 return ::defaultConfig();
985 }
986
987 // This function lowers and produces executable binary code in memory for the input functions,
988 // and returns a Routine with the entry points to these functions.
989 template<size_t Count>
acquireRoutine(Ice::Cfg * const (& functions)[Count],const char * const (& names)[Count],const Config::Edit & cfgEdit)990 static std::shared_ptr<Routine> acquireRoutine(Ice::Cfg *const (&functions)[Count], const char *const (&names)[Count], const Config::Edit &cfgEdit)
991 {
992 // This logic is modeled after the IceCompiler, as well as GlobalContext::translateFunctions
993 // and GlobalContext::emitItems.
994
995 if(subzeroDumpEnabled)
996 {
997 // Output dump strings immediately, rather than once buffer is full. Useful for debugging.
998 ::context->getStrDump().SetUnbuffered();
999 }
1000
1001 ::context->emitFileHeader();
1002
1003 // Translate
1004
1005 for(size_t i = 0; i < Count; ++i)
1006 {
1007 Ice::Cfg *currFunc = functions[i];
1008
1009 // Install function allocator in TLS for Cfg-specific container allocators
1010 Ice::CfgLocalAllocatorScope allocScope(currFunc);
1011
1012 currFunc->setFunctionName(Ice::GlobalString::createWithString(::context, names[i]));
1013
1014 if(::optimizerCallback)
1015 {
1016 Nucleus::OptimizerReport report;
1017 rr::optimize(currFunc, &report);
1018 ::optimizerCallback(&report);
1019 ::optimizerCallback = nullptr;
1020 }
1021 else
1022 {
1023 rr::optimize(currFunc);
1024 }
1025
1026 currFunc->computeInOutEdges();
1027 ASSERT_MSG(!currFunc->hasError(), "%s", currFunc->getError().c_str());
1028
1029 currFunc->translate();
1030 ASSERT_MSG(!currFunc->hasError(), "%s", currFunc->getError().c_str());
1031
1032 currFunc->getAssembler<>()->setInternal(currFunc->getInternal());
1033
1034 if(subzeroEmitTextAsm)
1035 {
1036 currFunc->emit();
1037 }
1038
1039 currFunc->emitIAS();
1040
1041 if(currFunc->hasError())
1042 {
1043 return nullptr;
1044 }
1045 }
1046
1047 // Emit items
1048
1049 ::context->lowerGlobals("");
1050
1051 auto objectWriter = ::context->getObjectWriter();
1052
1053 for(size_t i = 0; i < Count; ++i)
1054 {
1055 Ice::Cfg *currFunc = functions[i];
1056
1057 // Accumulate globals from functions to emit into the "last" section at the end
1058 auto globals = currFunc->getGlobalInits();
1059 if(globals && !globals->empty())
1060 {
1061 ::context->getGlobals()->merge(globals.get());
1062 }
1063
1064 auto assembler = currFunc->releaseAssembler();
1065 assembler->alignFunction();
1066 objectWriter->writeFunctionCode(currFunc->getFunctionName(), currFunc->getInternal(), assembler.get());
1067 }
1068
1069 ::context->lowerGlobals("last");
1070 ::context->lowerConstants();
1071 ::context->lowerJumpTables();
1072
1073 objectWriter->setUndefinedSyms(::context->getConstantExternSyms());
1074 ::context->emitTargetRODataSections();
1075 objectWriter->writeNonUserSections();
1076
1077 // Done compiling functions, get entry pointers to each of them
1078 auto entryPoints = ::routine->loadImageAndGetEntryPoints({ names, names + Count });
1079 ASSERT(entryPoints.size() == Count);
1080 for(size_t i = 0; i < entryPoints.size(); ++i)
1081 {
1082 ::routine->setEntry(i, entryPoints[i].entry);
1083 }
1084
1085 ::routine->finalize();
1086
1087 Routine *handoffRoutine = ::routine;
1088 ::routine = nullptr;
1089
1090 return std::shared_ptr<Routine>(handoffRoutine);
1091 }
1092
acquireRoutine(const char * name,const Config::Edit & cfgEdit)1093 std::shared_ptr<Routine> Nucleus::acquireRoutine(const char *name, const Config::Edit &cfgEdit /* = Config::Edit::None */)
1094 {
1095 finalizeFunction();
1096 return rr::acquireRoutine({ ::function }, { name }, cfgEdit);
1097 }
1098
allocateStackVariable(Type * t,int arraySize)1099 Value *Nucleus::allocateStackVariable(Type *t, int arraySize)
1100 {
1101 Ice::Type type = T(t);
1102 int typeSize = Ice::typeWidthInBytes(type);
1103 int totalSize = typeSize * (arraySize ? arraySize : 1);
1104
1105 auto bytes = Ice::ConstantInteger32::create(::context, Ice::IceType_i32, totalSize);
1106 auto address = ::function->makeVariable(T(getPointerType(t)));
1107 auto alloca = Ice::InstAlloca::create(::function, address, bytes, typeSize); // SRoA depends on the alignment to match the type size.
1108 ::function->getEntryNode()->getInsts().push_front(alloca);
1109
1110 return V(address);
1111 }
1112
createBasicBlock()1113 BasicBlock *Nucleus::createBasicBlock()
1114 {
1115 return B(::function->makeNode());
1116 }
1117
getInsertBlock()1118 BasicBlock *Nucleus::getInsertBlock()
1119 {
1120 return B(::basicBlock);
1121 }
1122
setInsertBlock(BasicBlock * basicBlock)1123 void Nucleus::setInsertBlock(BasicBlock *basicBlock)
1124 {
1125 // ASSERT(::basicBlock->getInsts().back().getTerminatorEdges().size() >= 0 && "Previous basic block must have a terminator");
1126
1127 ::basicBlock = basicBlock;
1128 }
1129
createFunction(Type * returnType,const std::vector<Type * > & paramTypes)1130 void Nucleus::createFunction(Type *returnType, const std::vector<Type *> ¶mTypes)
1131 {
1132 ASSERT(::function == nullptr);
1133 ASSERT(::allocator == nullptr);
1134 ASSERT(::entryBlock == nullptr);
1135 ASSERT(::basicBlock == nullptr);
1136 ASSERT(::basicBlockTop == nullptr);
1137
1138 ::function = sz::createFunction(::context, T(returnType), T(paramTypes));
1139
1140 // NOTE: The scoped allocator sets the TLS allocator to the one in the function. This global one
1141 // becomes invalid if another one is created; for example, when creating await and destroy functions
1142 // for coroutines, in which case, we must make sure to create a new scoped allocator for ::function again.
1143 // TODO: Get rid of this as a global, and create scoped allocs in every Nucleus function instead.
1144 ::allocator = new Ice::CfgLocalAllocatorScope(::function);
1145
1146 ::entryBlock = ::function->getEntryNode();
1147 ::basicBlock = ::function->makeNode();
1148 ::basicBlockTop = ::basicBlock;
1149 }
1150
getArgument(unsigned int index)1151 Value *Nucleus::getArgument(unsigned int index)
1152 {
1153 return V(::function->getArgs()[index]);
1154 }
1155
createRetVoid()1156 void Nucleus::createRetVoid()
1157 {
1158 RR_DEBUG_INFO_UPDATE_LOC();
1159
1160 // Code generated after this point is unreachable, so any variables
1161 // being read can safely return an undefined value. We have to avoid
1162 // materializing variables after the terminator ret instruction.
1163 Variable::killUnmaterialized();
1164
1165 Ice::InstRet *ret = Ice::InstRet::create(::function);
1166 ::basicBlock->appendInst(ret);
1167 }
1168
createRet(Value * v)1169 void Nucleus::createRet(Value *v)
1170 {
1171 RR_DEBUG_INFO_UPDATE_LOC();
1172
1173 // Code generated after this point is unreachable, so any variables
1174 // being read can safely return an undefined value. We have to avoid
1175 // materializing variables after the terminator ret instruction.
1176 Variable::killUnmaterialized();
1177
1178 Ice::InstRet *ret = Ice::InstRet::create(::function, v);
1179 ::basicBlock->appendInst(ret);
1180 }
1181
createBr(BasicBlock * dest)1182 void Nucleus::createBr(BasicBlock *dest)
1183 {
1184 RR_DEBUG_INFO_UPDATE_LOC();
1185 Variable::materializeAll();
1186
1187 auto br = Ice::InstBr::create(::function, dest);
1188 ::basicBlock->appendInst(br);
1189 }
1190
createCondBr(Value * cond,BasicBlock * ifTrue,BasicBlock * ifFalse)1191 void Nucleus::createCondBr(Value *cond, BasicBlock *ifTrue, BasicBlock *ifFalse)
1192 {
1193 RR_DEBUG_INFO_UPDATE_LOC();
1194 Variable::materializeAll();
1195
1196 auto br = Ice::InstBr::create(::function, cond, ifTrue, ifFalse);
1197 ::basicBlock->appendInst(br);
1198 }
1199
isCommutative(Ice::InstArithmetic::OpKind op)1200 static bool isCommutative(Ice::InstArithmetic::OpKind op)
1201 {
1202 switch(op)
1203 {
1204 case Ice::InstArithmetic::Add:
1205 case Ice::InstArithmetic::Fadd:
1206 case Ice::InstArithmetic::Mul:
1207 case Ice::InstArithmetic::Fmul:
1208 case Ice::InstArithmetic::And:
1209 case Ice::InstArithmetic::Or:
1210 case Ice::InstArithmetic::Xor:
1211 return true;
1212 default:
1213 return false;
1214 }
1215 }
1216
createArithmetic(Ice::InstArithmetic::OpKind op,Value * lhs,Value * rhs)1217 static Value *createArithmetic(Ice::InstArithmetic::OpKind op, Value *lhs, Value *rhs)
1218 {
1219 ASSERT(lhs->getType() == rhs->getType() || llvm::isa<Ice::Constant>(rhs));
1220
1221 bool swapOperands = llvm::isa<Ice::Constant>(lhs) && isCommutative(op);
1222
1223 Ice::Variable *result = ::function->makeVariable(lhs->getType());
1224 Ice::InstArithmetic *arithmetic = Ice::InstArithmetic::create(::function, op, result, swapOperands ? rhs : lhs, swapOperands ? lhs : rhs);
1225 ::basicBlock->appendInst(arithmetic);
1226
1227 return V(result);
1228 }
1229
createAdd(Value * lhs,Value * rhs)1230 Value *Nucleus::createAdd(Value *lhs, Value *rhs)
1231 {
1232 RR_DEBUG_INFO_UPDATE_LOC();
1233 return createArithmetic(Ice::InstArithmetic::Add, lhs, rhs);
1234 }
1235
createSub(Value * lhs,Value * rhs)1236 Value *Nucleus::createSub(Value *lhs, Value *rhs)
1237 {
1238 RR_DEBUG_INFO_UPDATE_LOC();
1239 return createArithmetic(Ice::InstArithmetic::Sub, lhs, rhs);
1240 }
1241
createMul(Value * lhs,Value * rhs)1242 Value *Nucleus::createMul(Value *lhs, Value *rhs)
1243 {
1244 RR_DEBUG_INFO_UPDATE_LOC();
1245 return createArithmetic(Ice::InstArithmetic::Mul, lhs, rhs);
1246 }
1247
createUDiv(Value * lhs,Value * rhs)1248 Value *Nucleus::createUDiv(Value *lhs, Value *rhs)
1249 {
1250 RR_DEBUG_INFO_UPDATE_LOC();
1251 return createArithmetic(Ice::InstArithmetic::Udiv, lhs, rhs);
1252 }
1253
createSDiv(Value * lhs,Value * rhs)1254 Value *Nucleus::createSDiv(Value *lhs, Value *rhs)
1255 {
1256 RR_DEBUG_INFO_UPDATE_LOC();
1257 return createArithmetic(Ice::InstArithmetic::Sdiv, lhs, rhs);
1258 }
1259
createFAdd(Value * lhs,Value * rhs)1260 Value *Nucleus::createFAdd(Value *lhs, Value *rhs)
1261 {
1262 RR_DEBUG_INFO_UPDATE_LOC();
1263 return createArithmetic(Ice::InstArithmetic::Fadd, lhs, rhs);
1264 }
1265
createFSub(Value * lhs,Value * rhs)1266 Value *Nucleus::createFSub(Value *lhs, Value *rhs)
1267 {
1268 RR_DEBUG_INFO_UPDATE_LOC();
1269 return createArithmetic(Ice::InstArithmetic::Fsub, lhs, rhs);
1270 }
1271
createFMul(Value * lhs,Value * rhs)1272 Value *Nucleus::createFMul(Value *lhs, Value *rhs)
1273 {
1274 RR_DEBUG_INFO_UPDATE_LOC();
1275 return createArithmetic(Ice::InstArithmetic::Fmul, lhs, rhs);
1276 }
1277
createFDiv(Value * lhs,Value * rhs)1278 Value *Nucleus::createFDiv(Value *lhs, Value *rhs)
1279 {
1280 RR_DEBUG_INFO_UPDATE_LOC();
1281 return createArithmetic(Ice::InstArithmetic::Fdiv, lhs, rhs);
1282 }
1283
createURem(Value * lhs,Value * rhs)1284 Value *Nucleus::createURem(Value *lhs, Value *rhs)
1285 {
1286 RR_DEBUG_INFO_UPDATE_LOC();
1287 return createArithmetic(Ice::InstArithmetic::Urem, lhs, rhs);
1288 }
1289
createSRem(Value * lhs,Value * rhs)1290 Value *Nucleus::createSRem(Value *lhs, Value *rhs)
1291 {
1292 RR_DEBUG_INFO_UPDATE_LOC();
1293 return createArithmetic(Ice::InstArithmetic::Srem, lhs, rhs);
1294 }
1295
createFRem(Value * lhs,Value * rhs)1296 Value *Nucleus::createFRem(Value *lhs, Value *rhs)
1297 {
1298 RR_DEBUG_INFO_UPDATE_LOC();
1299 // TODO(b/148139679) Fix Subzero generating invalid code for FRem on vector types
1300 // createArithmetic(Ice::InstArithmetic::Frem, lhs, rhs);
1301 UNIMPLEMENTED("b/148139679 Nucleus::createFRem");
1302 return nullptr;
1303 }
1304
operator %(RValue<Float4> lhs,RValue<Float4> rhs)1305 RValue<Float4> operator%(RValue<Float4> lhs, RValue<Float4> rhs)
1306 {
1307 return emulated::FRem(lhs, rhs);
1308 }
1309
createShl(Value * lhs,Value * rhs)1310 Value *Nucleus::createShl(Value *lhs, Value *rhs)
1311 {
1312 RR_DEBUG_INFO_UPDATE_LOC();
1313 return createArithmetic(Ice::InstArithmetic::Shl, lhs, rhs);
1314 }
1315
createLShr(Value * lhs,Value * rhs)1316 Value *Nucleus::createLShr(Value *lhs, Value *rhs)
1317 {
1318 RR_DEBUG_INFO_UPDATE_LOC();
1319 return createArithmetic(Ice::InstArithmetic::Lshr, lhs, rhs);
1320 }
1321
createAShr(Value * lhs,Value * rhs)1322 Value *Nucleus::createAShr(Value *lhs, Value *rhs)
1323 {
1324 RR_DEBUG_INFO_UPDATE_LOC();
1325 return createArithmetic(Ice::InstArithmetic::Ashr, lhs, rhs);
1326 }
1327
createAnd(Value * lhs,Value * rhs)1328 Value *Nucleus::createAnd(Value *lhs, Value *rhs)
1329 {
1330 RR_DEBUG_INFO_UPDATE_LOC();
1331 return createArithmetic(Ice::InstArithmetic::And, lhs, rhs);
1332 }
1333
createOr(Value * lhs,Value * rhs)1334 Value *Nucleus::createOr(Value *lhs, Value *rhs)
1335 {
1336 RR_DEBUG_INFO_UPDATE_LOC();
1337 return createArithmetic(Ice::InstArithmetic::Or, lhs, rhs);
1338 }
1339
createXor(Value * lhs,Value * rhs)1340 Value *Nucleus::createXor(Value *lhs, Value *rhs)
1341 {
1342 RR_DEBUG_INFO_UPDATE_LOC();
1343 return createArithmetic(Ice::InstArithmetic::Xor, lhs, rhs);
1344 }
1345
createNeg(Value * v)1346 Value *Nucleus::createNeg(Value *v)
1347 {
1348 RR_DEBUG_INFO_UPDATE_LOC();
1349 return createSub(createNullValue(T(v->getType())), v);
1350 }
1351
createFNeg(Value * v)1352 Value *Nucleus::createFNeg(Value *v)
1353 {
1354 RR_DEBUG_INFO_UPDATE_LOC();
1355 double c[4] = { -0.0, -0.0, -0.0, -0.0 };
1356 Value *negativeZero = Ice::isVectorType(v->getType()) ? createConstantVector(c, T(v->getType())) : V(::context->getConstantFloat(-0.0f));
1357
1358 return createFSub(negativeZero, v);
1359 }
1360
createNot(Value * v)1361 Value *Nucleus::createNot(Value *v)
1362 {
1363 RR_DEBUG_INFO_UPDATE_LOC();
1364 if(Ice::isScalarIntegerType(v->getType()))
1365 {
1366 return createXor(v, V(::context->getConstantInt(v->getType(), -1)));
1367 }
1368 else // Vector
1369 {
1370 int64_t c[16] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
1371 return createXor(v, createConstantVector(c, T(v->getType())));
1372 }
1373 }
1374
validateAtomicAndMemoryOrderArgs(bool atomic,std::memory_order memoryOrder)1375 static void validateAtomicAndMemoryOrderArgs(bool atomic, std::memory_order memoryOrder)
1376 {
1377 #if defined(__i386__) || defined(__x86_64__)
1378 // We're good, atomics and strictest memory order (except seq_cst) are guaranteed.
1379 // Note that sequential memory ordering could be guaranteed by using x86's LOCK prefix.
1380 // Note also that relaxed memory order could be implemented using MOVNTPS and friends.
1381 #else
1382 if(atomic)
1383 {
1384 UNIMPLEMENTED("b/150475088 Atomic load/store not implemented for current platform");
1385 }
1386 if(memoryOrder != std::memory_order_relaxed)
1387 {
1388 UNIMPLEMENTED("b/150475088 Memory order other than memory_order_relaxed not implemented for current platform");
1389 }
1390 #endif
1391
1392 // Vulkan doesn't allow sequential memory order
1393 ASSERT(memoryOrder != std::memory_order_seq_cst);
1394 }
1395
createLoad(Value * ptr,Type * type,bool isVolatile,unsigned int align,bool atomic,std::memory_order memoryOrder)1396 Value *Nucleus::createLoad(Value *ptr, Type *type, bool isVolatile, unsigned int align, bool atomic, std::memory_order memoryOrder)
1397 {
1398 RR_DEBUG_INFO_UPDATE_LOC();
1399 validateAtomicAndMemoryOrderArgs(atomic, memoryOrder);
1400
1401 int valueType = (int)reinterpret_cast<intptr_t>(type);
1402 Ice::Variable *result = nullptr;
1403
1404 if((valueType & EmulatedBits) && (align != 0)) // Narrow vector not stored on stack.
1405 {
1406 if(emulateIntrinsics)
1407 {
1408 if(typeSize(type) == 4)
1409 {
1410 auto pointer = RValue<Pointer<Byte>>(ptr);
1411 Int x = *Pointer<Int>(pointer);
1412
1413 Int4 vector;
1414 vector = Insert(vector, x, 0);
1415
1416 result = ::function->makeVariable(T(type));
1417 auto bitcast = Ice::InstCast::create(::function, Ice::InstCast::Bitcast, result, vector.loadValue());
1418 ::basicBlock->appendInst(bitcast);
1419 }
1420 else if(typeSize(type) == 8)
1421 {
1422 ASSERT_MSG(!atomic, "Emulated 64-bit loads are not atomic");
1423 auto pointer = RValue<Pointer<Byte>>(ptr);
1424 Int x = *Pointer<Int>(pointer);
1425 Int y = *Pointer<Int>(pointer + 4);
1426
1427 Int4 vector;
1428 vector = Insert(vector, x, 0);
1429 vector = Insert(vector, y, 1);
1430
1431 result = ::function->makeVariable(T(type));
1432 auto bitcast = Ice::InstCast::create(::function, Ice::InstCast::Bitcast, result, vector.loadValue());
1433 ::basicBlock->appendInst(bitcast);
1434 }
1435 else
1436 UNREACHABLE("typeSize(type): %d", int(typeSize(type)));
1437 }
1438 else
1439 {
1440 const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::LoadSubVector, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
1441 result = ::function->makeVariable(T(type));
1442 auto load = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
1443 load->addArg(ptr);
1444 load->addArg(::context->getConstantInt32(typeSize(type)));
1445 ::basicBlock->appendInst(load);
1446 }
1447 }
1448 else
1449 {
1450 result = sz::createLoad(::function, ::basicBlock, V(ptr), T(type), align);
1451 }
1452
1453 ASSERT(result);
1454 return V(result);
1455 }
1456
createStore(Value * value,Value * ptr,Type * type,bool isVolatile,unsigned int align,bool atomic,std::memory_order memoryOrder)1457 Value *Nucleus::createStore(Value *value, Value *ptr, Type *type, bool isVolatile, unsigned int align, bool atomic, std::memory_order memoryOrder)
1458 {
1459 RR_DEBUG_INFO_UPDATE_LOC();
1460 validateAtomicAndMemoryOrderArgs(atomic, memoryOrder);
1461
1462 #if __has_feature(memory_sanitizer)
1463 // Mark all (non-stack) memory writes as initialized by calling __msan_unpoison
1464 if(align != 0)
1465 {
1466 auto call = Ice::InstCall::create(::function, 2, nullptr, ::context->getConstantInt64(reinterpret_cast<intptr_t>(__msan_unpoison)), false);
1467 call->addArg(ptr);
1468 call->addArg(::context->getConstantInt64(typeSize(type)));
1469 ::basicBlock->appendInst(call);
1470 }
1471 #endif
1472
1473 int valueType = (int)reinterpret_cast<intptr_t>(type);
1474
1475 if((valueType & EmulatedBits) && (align != 0)) // Narrow vector not stored on stack.
1476 {
1477 if(emulateIntrinsics)
1478 {
1479 if(typeSize(type) == 4)
1480 {
1481 Ice::Variable *vector = ::function->makeVariable(Ice::IceType_v4i32);
1482 auto bitcast = Ice::InstCast::create(::function, Ice::InstCast::Bitcast, vector, value);
1483 ::basicBlock->appendInst(bitcast);
1484
1485 RValue<Int4> v(V(vector));
1486
1487 auto pointer = RValue<Pointer<Byte>>(ptr);
1488 Int x = Extract(v, 0);
1489 *Pointer<Int>(pointer) = x;
1490 }
1491 else if(typeSize(type) == 8)
1492 {
1493 ASSERT_MSG(!atomic, "Emulated 64-bit stores are not atomic");
1494 Ice::Variable *vector = ::function->makeVariable(Ice::IceType_v4i32);
1495 auto bitcast = Ice::InstCast::create(::function, Ice::InstCast::Bitcast, vector, value);
1496 ::basicBlock->appendInst(bitcast);
1497
1498 RValue<Int4> v(V(vector));
1499
1500 auto pointer = RValue<Pointer<Byte>>(ptr);
1501 Int x = Extract(v, 0);
1502 *Pointer<Int>(pointer) = x;
1503 Int y = Extract(v, 1);
1504 *Pointer<Int>(pointer + 4) = y;
1505 }
1506 else
1507 UNREACHABLE("typeSize(type): %d", int(typeSize(type)));
1508 }
1509 else
1510 {
1511 const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::StoreSubVector, Ice::Intrinsics::SideEffects_T, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_T };
1512 auto store = Ice::InstIntrinsic::create(::function, 3, nullptr, intrinsic);
1513 store->addArg(value);
1514 store->addArg(ptr);
1515 store->addArg(::context->getConstantInt32(typeSize(type)));
1516 ::basicBlock->appendInst(store);
1517 }
1518 }
1519 else
1520 {
1521 ASSERT(value->getType() == T(type));
1522
1523 auto store = Ice::InstStore::create(::function, V(value), V(ptr), align);
1524 ::basicBlock->appendInst(store);
1525 }
1526
1527 return value;
1528 }
1529
createGEP(Value * ptr,Type * type,Value * index,bool unsignedIndex)1530 Value *Nucleus::createGEP(Value *ptr, Type *type, Value *index, bool unsignedIndex)
1531 {
1532 RR_DEBUG_INFO_UPDATE_LOC();
1533 ASSERT(index->getType() == Ice::IceType_i32);
1534
1535 if(auto *constant = llvm::dyn_cast<Ice::ConstantInteger32>(index))
1536 {
1537 int32_t offset = constant->getValue() * (int)typeSize(type);
1538
1539 if(offset == 0)
1540 {
1541 return ptr;
1542 }
1543
1544 return createAdd(ptr, createConstantInt(offset));
1545 }
1546
1547 if(!Ice::isByteSizedType(T(type)))
1548 {
1549 index = createMul(index, createConstantInt((int)typeSize(type)));
1550 }
1551
1552 if(sizeof(void *) == 8)
1553 {
1554 if(unsignedIndex)
1555 {
1556 index = createZExt(index, T(Ice::IceType_i64));
1557 }
1558 else
1559 {
1560 index = createSExt(index, T(Ice::IceType_i64));
1561 }
1562 }
1563
1564 return createAdd(ptr, index);
1565 }
1566
createAtomicRMW(Ice::Intrinsics::AtomicRMWOperation rmwOp,Value * ptr,Value * value,std::memory_order memoryOrder)1567 static Value *createAtomicRMW(Ice::Intrinsics::AtomicRMWOperation rmwOp, Value *ptr, Value *value, std::memory_order memoryOrder)
1568 {
1569 Ice::Variable *result = ::function->makeVariable(value->getType());
1570
1571 const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::AtomicRMW, Ice::Intrinsics::SideEffects_T, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_T };
1572 auto inst = Ice::InstIntrinsic::create(::function, 0, result, intrinsic);
1573 auto op = ::context->getConstantInt32(rmwOp);
1574 auto order = ::context->getConstantInt32(stdToIceMemoryOrder(memoryOrder));
1575 inst->addArg(op);
1576 inst->addArg(ptr);
1577 inst->addArg(value);
1578 inst->addArg(order);
1579 ::basicBlock->appendInst(inst);
1580
1581 return V(result);
1582 }
1583
createAtomicAdd(Value * ptr,Value * value,std::memory_order memoryOrder)1584 Value *Nucleus::createAtomicAdd(Value *ptr, Value *value, std::memory_order memoryOrder)
1585 {
1586 RR_DEBUG_INFO_UPDATE_LOC();
1587 return createAtomicRMW(Ice::Intrinsics::AtomicAdd, ptr, value, memoryOrder);
1588 }
1589
createAtomicSub(Value * ptr,Value * value,std::memory_order memoryOrder)1590 Value *Nucleus::createAtomicSub(Value *ptr, Value *value, std::memory_order memoryOrder)
1591 {
1592 RR_DEBUG_INFO_UPDATE_LOC();
1593 return createAtomicRMW(Ice::Intrinsics::AtomicSub, ptr, value, memoryOrder);
1594 }
1595
createAtomicAnd(Value * ptr,Value * value,std::memory_order memoryOrder)1596 Value *Nucleus::createAtomicAnd(Value *ptr, Value *value, std::memory_order memoryOrder)
1597 {
1598 RR_DEBUG_INFO_UPDATE_LOC();
1599 return createAtomicRMW(Ice::Intrinsics::AtomicAnd, ptr, value, memoryOrder);
1600 }
1601
createAtomicOr(Value * ptr,Value * value,std::memory_order memoryOrder)1602 Value *Nucleus::createAtomicOr(Value *ptr, Value *value, std::memory_order memoryOrder)
1603 {
1604 RR_DEBUG_INFO_UPDATE_LOC();
1605 return createAtomicRMW(Ice::Intrinsics::AtomicOr, ptr, value, memoryOrder);
1606 }
1607
createAtomicXor(Value * ptr,Value * value,std::memory_order memoryOrder)1608 Value *Nucleus::createAtomicXor(Value *ptr, Value *value, std::memory_order memoryOrder)
1609 {
1610 RR_DEBUG_INFO_UPDATE_LOC();
1611 return createAtomicRMW(Ice::Intrinsics::AtomicXor, ptr, value, memoryOrder);
1612 }
1613
createAtomicExchange(Value * ptr,Value * value,std::memory_order memoryOrder)1614 Value *Nucleus::createAtomicExchange(Value *ptr, Value *value, std::memory_order memoryOrder)
1615 {
1616 RR_DEBUG_INFO_UPDATE_LOC();
1617 return createAtomicRMW(Ice::Intrinsics::AtomicExchange, ptr, value, memoryOrder);
1618 }
1619
createAtomicCompareExchange(Value * ptr,Value * value,Value * compare,std::memory_order memoryOrderEqual,std::memory_order memoryOrderUnequal)1620 Value *Nucleus::createAtomicCompareExchange(Value *ptr, Value *value, Value *compare, std::memory_order memoryOrderEqual, std::memory_order memoryOrderUnequal)
1621 {
1622 RR_DEBUG_INFO_UPDATE_LOC();
1623 Ice::Variable *result = ::function->makeVariable(value->getType());
1624
1625 const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::AtomicCmpxchg, Ice::Intrinsics::SideEffects_T, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_T };
1626 auto inst = Ice::InstIntrinsic::create(::function, 0, result, intrinsic);
1627 auto orderEq = ::context->getConstantInt32(stdToIceMemoryOrder(memoryOrderEqual));
1628 auto orderNeq = ::context->getConstantInt32(stdToIceMemoryOrder(memoryOrderUnequal));
1629 inst->addArg(ptr);
1630 inst->addArg(compare);
1631 inst->addArg(value);
1632 inst->addArg(orderEq);
1633 inst->addArg(orderNeq);
1634 ::basicBlock->appendInst(inst);
1635
1636 return V(result);
1637 }
1638
createCast(Ice::InstCast::OpKind op,Value * v,Type * destType)1639 static Value *createCast(Ice::InstCast::OpKind op, Value *v, Type *destType)
1640 {
1641 if(v->getType() == T(destType))
1642 {
1643 return v;
1644 }
1645
1646 Ice::Variable *result = ::function->makeVariable(T(destType));
1647 Ice::InstCast *cast = Ice::InstCast::create(::function, op, result, v);
1648 ::basicBlock->appendInst(cast);
1649
1650 return V(result);
1651 }
1652
createTrunc(Value * v,Type * destType)1653 Value *Nucleus::createTrunc(Value *v, Type *destType)
1654 {
1655 RR_DEBUG_INFO_UPDATE_LOC();
1656 return createCast(Ice::InstCast::Trunc, v, destType);
1657 }
1658
createZExt(Value * v,Type * destType)1659 Value *Nucleus::createZExt(Value *v, Type *destType)
1660 {
1661 RR_DEBUG_INFO_UPDATE_LOC();
1662 return createCast(Ice::InstCast::Zext, v, destType);
1663 }
1664
createSExt(Value * v,Type * destType)1665 Value *Nucleus::createSExt(Value *v, Type *destType)
1666 {
1667 RR_DEBUG_INFO_UPDATE_LOC();
1668 return createCast(Ice::InstCast::Sext, v, destType);
1669 }
1670
createFPToUI(Value * v,Type * destType)1671 Value *Nucleus::createFPToUI(Value *v, Type *destType)
1672 {
1673 RR_DEBUG_INFO_UPDATE_LOC();
1674 return createCast(Ice::InstCast::Fptoui, v, destType);
1675 }
1676
createFPToSI(Value * v,Type * destType)1677 Value *Nucleus::createFPToSI(Value *v, Type *destType)
1678 {
1679 RR_DEBUG_INFO_UPDATE_LOC();
1680 return createCast(Ice::InstCast::Fptosi, v, destType);
1681 }
1682
createSIToFP(Value * v,Type * destType)1683 Value *Nucleus::createSIToFP(Value *v, Type *destType)
1684 {
1685 RR_DEBUG_INFO_UPDATE_LOC();
1686 return createCast(Ice::InstCast::Sitofp, v, destType);
1687 }
1688
createFPTrunc(Value * v,Type * destType)1689 Value *Nucleus::createFPTrunc(Value *v, Type *destType)
1690 {
1691 RR_DEBUG_INFO_UPDATE_LOC();
1692 return createCast(Ice::InstCast::Fptrunc, v, destType);
1693 }
1694
createFPExt(Value * v,Type * destType)1695 Value *Nucleus::createFPExt(Value *v, Type *destType)
1696 {
1697 RR_DEBUG_INFO_UPDATE_LOC();
1698 return createCast(Ice::InstCast::Fpext, v, destType);
1699 }
1700
createBitCast(Value * v,Type * destType)1701 Value *Nucleus::createBitCast(Value *v, Type *destType)
1702 {
1703 RR_DEBUG_INFO_UPDATE_LOC();
1704 // Bitcasts must be between types of the same logical size. But with emulated narrow vectors we need
1705 // support for casting between scalars and wide vectors. For platforms where this is not supported,
1706 // emulate them by writing to the stack and reading back as the destination type.
1707 if(emulateMismatchedBitCast)
1708 {
1709 if(!Ice::isVectorType(v->getType()) && Ice::isVectorType(T(destType)))
1710 {
1711 Value *address = allocateStackVariable(destType);
1712 createStore(v, address, T(v->getType()));
1713 return createLoad(address, destType);
1714 }
1715 else if(Ice::isVectorType(v->getType()) && !Ice::isVectorType(T(destType)))
1716 {
1717 Value *address = allocateStackVariable(T(v->getType()));
1718 createStore(v, address, T(v->getType()));
1719 return createLoad(address, destType);
1720 }
1721 }
1722
1723 return createCast(Ice::InstCast::Bitcast, v, destType);
1724 }
1725
createIntCompare(Ice::InstIcmp::ICond condition,Value * lhs,Value * rhs)1726 static Value *createIntCompare(Ice::InstIcmp::ICond condition, Value *lhs, Value *rhs)
1727 {
1728 ASSERT(lhs->getType() == rhs->getType());
1729
1730 auto result = ::function->makeVariable(Ice::isScalarIntegerType(lhs->getType()) ? Ice::IceType_i1 : lhs->getType());
1731 auto cmp = Ice::InstIcmp::create(::function, condition, result, lhs, rhs);
1732 ::basicBlock->appendInst(cmp);
1733
1734 return V(result);
1735 }
1736
createICmpEQ(Value * lhs,Value * rhs)1737 Value *Nucleus::createICmpEQ(Value *lhs, Value *rhs)
1738 {
1739 RR_DEBUG_INFO_UPDATE_LOC();
1740 return createIntCompare(Ice::InstIcmp::Eq, lhs, rhs);
1741 }
1742
createICmpNE(Value * lhs,Value * rhs)1743 Value *Nucleus::createICmpNE(Value *lhs, Value *rhs)
1744 {
1745 RR_DEBUG_INFO_UPDATE_LOC();
1746 return createIntCompare(Ice::InstIcmp::Ne, lhs, rhs);
1747 }
1748
createICmpUGT(Value * lhs,Value * rhs)1749 Value *Nucleus::createICmpUGT(Value *lhs, Value *rhs)
1750 {
1751 RR_DEBUG_INFO_UPDATE_LOC();
1752 return createIntCompare(Ice::InstIcmp::Ugt, lhs, rhs);
1753 }
1754
createICmpUGE(Value * lhs,Value * rhs)1755 Value *Nucleus::createICmpUGE(Value *lhs, Value *rhs)
1756 {
1757 RR_DEBUG_INFO_UPDATE_LOC();
1758 return createIntCompare(Ice::InstIcmp::Uge, lhs, rhs);
1759 }
1760
createICmpULT(Value * lhs,Value * rhs)1761 Value *Nucleus::createICmpULT(Value *lhs, Value *rhs)
1762 {
1763 RR_DEBUG_INFO_UPDATE_LOC();
1764 return createIntCompare(Ice::InstIcmp::Ult, lhs, rhs);
1765 }
1766
createICmpULE(Value * lhs,Value * rhs)1767 Value *Nucleus::createICmpULE(Value *lhs, Value *rhs)
1768 {
1769 RR_DEBUG_INFO_UPDATE_LOC();
1770 return createIntCompare(Ice::InstIcmp::Ule, lhs, rhs);
1771 }
1772
createICmpSGT(Value * lhs,Value * rhs)1773 Value *Nucleus::createICmpSGT(Value *lhs, Value *rhs)
1774 {
1775 RR_DEBUG_INFO_UPDATE_LOC();
1776 return createIntCompare(Ice::InstIcmp::Sgt, lhs, rhs);
1777 }
1778
createICmpSGE(Value * lhs,Value * rhs)1779 Value *Nucleus::createICmpSGE(Value *lhs, Value *rhs)
1780 {
1781 RR_DEBUG_INFO_UPDATE_LOC();
1782 return createIntCompare(Ice::InstIcmp::Sge, lhs, rhs);
1783 }
1784
createICmpSLT(Value * lhs,Value * rhs)1785 Value *Nucleus::createICmpSLT(Value *lhs, Value *rhs)
1786 {
1787 RR_DEBUG_INFO_UPDATE_LOC();
1788 return createIntCompare(Ice::InstIcmp::Slt, lhs, rhs);
1789 }
1790
createICmpSLE(Value * lhs,Value * rhs)1791 Value *Nucleus::createICmpSLE(Value *lhs, Value *rhs)
1792 {
1793 RR_DEBUG_INFO_UPDATE_LOC();
1794 return createIntCompare(Ice::InstIcmp::Sle, lhs, rhs);
1795 }
1796
createFloatCompare(Ice::InstFcmp::FCond condition,Value * lhs,Value * rhs)1797 static Value *createFloatCompare(Ice::InstFcmp::FCond condition, Value *lhs, Value *rhs)
1798 {
1799 ASSERT(lhs->getType() == rhs->getType());
1800 ASSERT(Ice::isScalarFloatingType(lhs->getType()) || lhs->getType() == Ice::IceType_v4f32);
1801
1802 auto result = ::function->makeVariable(Ice::isScalarFloatingType(lhs->getType()) ? Ice::IceType_i1 : Ice::IceType_v4i32);
1803 auto cmp = Ice::InstFcmp::create(::function, condition, result, lhs, rhs);
1804 ::basicBlock->appendInst(cmp);
1805
1806 return V(result);
1807 }
1808
createFCmpOEQ(Value * lhs,Value * rhs)1809 Value *Nucleus::createFCmpOEQ(Value *lhs, Value *rhs)
1810 {
1811 RR_DEBUG_INFO_UPDATE_LOC();
1812 return createFloatCompare(Ice::InstFcmp::Oeq, lhs, rhs);
1813 }
1814
createFCmpOGT(Value * lhs,Value * rhs)1815 Value *Nucleus::createFCmpOGT(Value *lhs, Value *rhs)
1816 {
1817 RR_DEBUG_INFO_UPDATE_LOC();
1818 return createFloatCompare(Ice::InstFcmp::Ogt, lhs, rhs);
1819 }
1820
createFCmpOGE(Value * lhs,Value * rhs)1821 Value *Nucleus::createFCmpOGE(Value *lhs, Value *rhs)
1822 {
1823 RR_DEBUG_INFO_UPDATE_LOC();
1824 return createFloatCompare(Ice::InstFcmp::Oge, lhs, rhs);
1825 }
1826
createFCmpOLT(Value * lhs,Value * rhs)1827 Value *Nucleus::createFCmpOLT(Value *lhs, Value *rhs)
1828 {
1829 RR_DEBUG_INFO_UPDATE_LOC();
1830 return createFloatCompare(Ice::InstFcmp::Olt, lhs, rhs);
1831 }
1832
createFCmpOLE(Value * lhs,Value * rhs)1833 Value *Nucleus::createFCmpOLE(Value *lhs, Value *rhs)
1834 {
1835 RR_DEBUG_INFO_UPDATE_LOC();
1836 return createFloatCompare(Ice::InstFcmp::Ole, lhs, rhs);
1837 }
1838
createFCmpONE(Value * lhs,Value * rhs)1839 Value *Nucleus::createFCmpONE(Value *lhs, Value *rhs)
1840 {
1841 RR_DEBUG_INFO_UPDATE_LOC();
1842 return createFloatCompare(Ice::InstFcmp::One, lhs, rhs);
1843 }
1844
createFCmpORD(Value * lhs,Value * rhs)1845 Value *Nucleus::createFCmpORD(Value *lhs, Value *rhs)
1846 {
1847 RR_DEBUG_INFO_UPDATE_LOC();
1848 return createFloatCompare(Ice::InstFcmp::Ord, lhs, rhs);
1849 }
1850
createFCmpUNO(Value * lhs,Value * rhs)1851 Value *Nucleus::createFCmpUNO(Value *lhs, Value *rhs)
1852 {
1853 RR_DEBUG_INFO_UPDATE_LOC();
1854 return createFloatCompare(Ice::InstFcmp::Uno, lhs, rhs);
1855 }
1856
createFCmpUEQ(Value * lhs,Value * rhs)1857 Value *Nucleus::createFCmpUEQ(Value *lhs, Value *rhs)
1858 {
1859 RR_DEBUG_INFO_UPDATE_LOC();
1860 return createFloatCompare(Ice::InstFcmp::Ueq, lhs, rhs);
1861 }
1862
createFCmpUGT(Value * lhs,Value * rhs)1863 Value *Nucleus::createFCmpUGT(Value *lhs, Value *rhs)
1864 {
1865 RR_DEBUG_INFO_UPDATE_LOC();
1866 return createFloatCompare(Ice::InstFcmp::Ugt, lhs, rhs);
1867 }
1868
createFCmpUGE(Value * lhs,Value * rhs)1869 Value *Nucleus::createFCmpUGE(Value *lhs, Value *rhs)
1870 {
1871 RR_DEBUG_INFO_UPDATE_LOC();
1872 return createFloatCompare(Ice::InstFcmp::Uge, lhs, rhs);
1873 }
1874
createFCmpULT(Value * lhs,Value * rhs)1875 Value *Nucleus::createFCmpULT(Value *lhs, Value *rhs)
1876 {
1877 RR_DEBUG_INFO_UPDATE_LOC();
1878 return createFloatCompare(Ice::InstFcmp::Ult, lhs, rhs);
1879 }
1880
createFCmpULE(Value * lhs,Value * rhs)1881 Value *Nucleus::createFCmpULE(Value *lhs, Value *rhs)
1882 {
1883 RR_DEBUG_INFO_UPDATE_LOC();
1884 return createFloatCompare(Ice::InstFcmp::Ule, lhs, rhs);
1885 }
1886
createFCmpUNE(Value * lhs,Value * rhs)1887 Value *Nucleus::createFCmpUNE(Value *lhs, Value *rhs)
1888 {
1889 RR_DEBUG_INFO_UPDATE_LOC();
1890 return createFloatCompare(Ice::InstFcmp::Une, lhs, rhs);
1891 }
1892
createExtractElement(Value * vector,Type * type,int index)1893 Value *Nucleus::createExtractElement(Value *vector, Type *type, int index)
1894 {
1895 RR_DEBUG_INFO_UPDATE_LOC();
1896 auto result = ::function->makeVariable(T(type));
1897 auto extract = Ice::InstExtractElement::create(::function, result, V(vector), ::context->getConstantInt32(index));
1898 ::basicBlock->appendInst(extract);
1899
1900 return V(result);
1901 }
1902
createInsertElement(Value * vector,Value * element,int index)1903 Value *Nucleus::createInsertElement(Value *vector, Value *element, int index)
1904 {
1905 RR_DEBUG_INFO_UPDATE_LOC();
1906 auto result = ::function->makeVariable(vector->getType());
1907 auto insert = Ice::InstInsertElement::create(::function, result, vector, element, ::context->getConstantInt32(index));
1908 ::basicBlock->appendInst(insert);
1909
1910 return V(result);
1911 }
1912
createShuffleVector(Value * V1,Value * V2,const int * select)1913 Value *Nucleus::createShuffleVector(Value *V1, Value *V2, const int *select)
1914 {
1915 RR_DEBUG_INFO_UPDATE_LOC();
1916 ASSERT(V1->getType() == V2->getType());
1917
1918 int size = Ice::typeNumElements(V1->getType());
1919 auto result = ::function->makeVariable(V1->getType());
1920 auto shuffle = Ice::InstShuffleVector::create(::function, result, V1, V2);
1921
1922 for(int i = 0; i < size; i++)
1923 {
1924 shuffle->addIndex(llvm::cast<Ice::ConstantInteger32>(::context->getConstantInt32(select[i])));
1925 }
1926
1927 ::basicBlock->appendInst(shuffle);
1928
1929 return V(result);
1930 }
1931
createSelect(Value * C,Value * ifTrue,Value * ifFalse)1932 Value *Nucleus::createSelect(Value *C, Value *ifTrue, Value *ifFalse)
1933 {
1934 RR_DEBUG_INFO_UPDATE_LOC();
1935 ASSERT(ifTrue->getType() == ifFalse->getType());
1936
1937 auto result = ::function->makeVariable(ifTrue->getType());
1938 auto *select = Ice::InstSelect::create(::function, result, C, ifTrue, ifFalse);
1939 ::basicBlock->appendInst(select);
1940
1941 return V(result);
1942 }
1943
createSwitch(Value * control,BasicBlock * defaultBranch,unsigned numCases)1944 SwitchCases *Nucleus::createSwitch(Value *control, BasicBlock *defaultBranch, unsigned numCases)
1945 {
1946 RR_DEBUG_INFO_UPDATE_LOC();
1947 auto switchInst = Ice::InstSwitch::create(::function, numCases, control, defaultBranch);
1948 ::basicBlock->appendInst(switchInst);
1949
1950 return reinterpret_cast<SwitchCases *>(switchInst);
1951 }
1952
addSwitchCase(SwitchCases * switchCases,int label,BasicBlock * branch)1953 void Nucleus::addSwitchCase(SwitchCases *switchCases, int label, BasicBlock *branch)
1954 {
1955 RR_DEBUG_INFO_UPDATE_LOC();
1956 switchCases->addBranch(label, label, branch);
1957 }
1958
createUnreachable()1959 void Nucleus::createUnreachable()
1960 {
1961 RR_DEBUG_INFO_UPDATE_LOC();
1962 Ice::InstUnreachable *unreachable = Ice::InstUnreachable::create(::function);
1963 ::basicBlock->appendInst(unreachable);
1964 }
1965
getType(Value * value)1966 Type *Nucleus::getType(Value *value)
1967 {
1968 return T(V(value)->getType());
1969 }
1970
getContainedType(Type * vectorType)1971 Type *Nucleus::getContainedType(Type *vectorType)
1972 {
1973 Ice::Type vecTy = T(vectorType);
1974 switch(vecTy)
1975 {
1976 case Ice::IceType_v4i1: return T(Ice::IceType_i1);
1977 case Ice::IceType_v8i1: return T(Ice::IceType_i1);
1978 case Ice::IceType_v16i1: return T(Ice::IceType_i1);
1979 case Ice::IceType_v16i8: return T(Ice::IceType_i8);
1980 case Ice::IceType_v8i16: return T(Ice::IceType_i16);
1981 case Ice::IceType_v4i32: return T(Ice::IceType_i32);
1982 case Ice::IceType_v4f32: return T(Ice::IceType_f32);
1983 default:
1984 ASSERT_MSG(false, "getContainedType: input type is not a vector type");
1985 return {};
1986 }
1987 }
1988
getPointerType(Type * ElementType)1989 Type *Nucleus::getPointerType(Type *ElementType)
1990 {
1991 return T(sz::getPointerType(T(ElementType)));
1992 }
1993
getNaturalIntType()1994 static constexpr Ice::Type getNaturalIntType()
1995 {
1996 constexpr size_t intSize = sizeof(int);
1997 static_assert(intSize == 4 || intSize == 8, "");
1998 return intSize == 4 ? Ice::IceType_i32 : Ice::IceType_i64;
1999 }
2000
getPrintfStorageType(Type * valueType)2001 Type *Nucleus::getPrintfStorageType(Type *valueType)
2002 {
2003 Ice::Type valueTy = T(valueType);
2004 switch(valueTy)
2005 {
2006 case Ice::IceType_i32:
2007 return T(getNaturalIntType());
2008
2009 case Ice::IceType_f32:
2010 return T(Ice::IceType_f64);
2011
2012 default:
2013 UNIMPLEMENTED_NO_BUG("getPrintfStorageType: add more cases as needed");
2014 return {};
2015 }
2016 }
2017
createNullValue(Type * Ty)2018 Value *Nucleus::createNullValue(Type *Ty)
2019 {
2020 RR_DEBUG_INFO_UPDATE_LOC();
2021 if(Ice::isVectorType(T(Ty)))
2022 {
2023 ASSERT(Ice::typeNumElements(T(Ty)) <= 16);
2024 int64_t c[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
2025 return createConstantVector(c, Ty);
2026 }
2027 else
2028 {
2029 return V(::context->getConstantZero(T(Ty)));
2030 }
2031 }
2032
createConstantLong(int64_t i)2033 Value *Nucleus::createConstantLong(int64_t i)
2034 {
2035 RR_DEBUG_INFO_UPDATE_LOC();
2036 return V(::context->getConstantInt64(i));
2037 }
2038
createConstantInt(int i)2039 Value *Nucleus::createConstantInt(int i)
2040 {
2041 RR_DEBUG_INFO_UPDATE_LOC();
2042 return V(::context->getConstantInt32(i));
2043 }
2044
createConstantInt(unsigned int i)2045 Value *Nucleus::createConstantInt(unsigned int i)
2046 {
2047 RR_DEBUG_INFO_UPDATE_LOC();
2048 return V(::context->getConstantInt32(i));
2049 }
2050
createConstantBool(bool b)2051 Value *Nucleus::createConstantBool(bool b)
2052 {
2053 RR_DEBUG_INFO_UPDATE_LOC();
2054 return V(::context->getConstantInt1(b));
2055 }
2056
createConstantByte(signed char i)2057 Value *Nucleus::createConstantByte(signed char i)
2058 {
2059 RR_DEBUG_INFO_UPDATE_LOC();
2060 return V(::context->getConstantInt8(i));
2061 }
2062
createConstantByte(unsigned char i)2063 Value *Nucleus::createConstantByte(unsigned char i)
2064 {
2065 RR_DEBUG_INFO_UPDATE_LOC();
2066 return V(::context->getConstantInt8(i));
2067 }
2068
createConstantShort(short i)2069 Value *Nucleus::createConstantShort(short i)
2070 {
2071 RR_DEBUG_INFO_UPDATE_LOC();
2072 return V(::context->getConstantInt16(i));
2073 }
2074
createConstantShort(unsigned short i)2075 Value *Nucleus::createConstantShort(unsigned short i)
2076 {
2077 RR_DEBUG_INFO_UPDATE_LOC();
2078 return V(::context->getConstantInt16(i));
2079 }
2080
createConstantFloat(float x)2081 Value *Nucleus::createConstantFloat(float x)
2082 {
2083 RR_DEBUG_INFO_UPDATE_LOC();
2084 return V(::context->getConstantFloat(x));
2085 }
2086
createNullPointer(Type * Ty)2087 Value *Nucleus::createNullPointer(Type *Ty)
2088 {
2089 RR_DEBUG_INFO_UPDATE_LOC();
2090 return createNullValue(T(sizeof(void *) == 8 ? Ice::IceType_i64 : Ice::IceType_i32));
2091 }
2092
IceConstantData(void const * data,size_t size,size_t alignment=1)2093 static Ice::Constant *IceConstantData(void const *data, size_t size, size_t alignment = 1)
2094 {
2095 return sz::getConstantPointer(::context, ::routine->addConstantData(data, size, alignment));
2096 }
2097
// Builds a 16-byte constant vector of the given type and returns a value
// loaded from it.
//   constants: source elements. They are read as int64_t for integer/boolean
//              vector types and reinterpreted as double for floating-point
//              vector types; each element is narrowed to the lane type.
//   type:      the vector type. Its Subzero width must be 16 bytes. Besides
//              the native Subzero vector types, the narrower Type_v* values
//              are accepted; for those the provided elements are repeated
//              until all 16 bytes of the initializer are filled.
// The initializer bytes are registered through IceConstantData() with 16-byte
// alignment, and the result is produced by a load from that constant pointer.
Value *Nucleus::createConstantVector(const int64_t *constants, Type *type)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	const int vectorSize = 16;
	ASSERT(Ice::typeWidthInBytes(T(type)) == vectorSize);
	const int alignment = vectorSize;

	// Two views of the same input: integer lanes and floating-point lanes.
	const int64_t *i = constants;
	const double *f = reinterpret_cast<const double *>(constants);

	// TODO(b/148082873): Fix global variable constants when generating multiple functions
	Ice::Constant *ptr = nullptr;

	// Switch on the raw type value (not T(type)) so that the Type_v* cases
	// below can be told apart from the Ice::IceType_* cases.
	switch((int)reinterpret_cast<intptr_t>(type))
	{
	case Ice::IceType_v4i32:
	case Ice::IceType_v4i1:
		{
			const int initializer[4] = { (int)i[0], (int)i[1], (int)i[2], (int)i[3] };
			static_assert(sizeof(initializer) == vectorSize, "!");
			ptr = IceConstantData(initializer, vectorSize, alignment);
		}
		break;
	case Ice::IceType_v4f32:
		{
			const float initializer[4] = { (float)f[0], (float)f[1], (float)f[2], (float)f[3] };
			static_assert(sizeof(initializer) == vectorSize, "!");
			ptr = IceConstantData(initializer, vectorSize, alignment);
		}
		break;
	case Ice::IceType_v8i16:
	case Ice::IceType_v8i1:
		{
			const short initializer[8] = { (short)i[0], (short)i[1], (short)i[2], (short)i[3], (short)i[4], (short)i[5], (short)i[6], (short)i[7] };
			static_assert(sizeof(initializer) == vectorSize, "!");
			ptr = IceConstantData(initializer, vectorSize, alignment);
		}
		break;
	case Ice::IceType_v16i8:
	case Ice::IceType_v16i1:
		{
			const char initializer[16] = { (char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[4], (char)i[5], (char)i[6], (char)i[7], (char)i[8], (char)i[9], (char)i[10], (char)i[11], (char)i[12], (char)i[13], (char)i[14], (char)i[15] };
			static_assert(sizeof(initializer) == vectorSize, "!");
			ptr = IceConstantData(initializer, vectorSize, alignment);
		}
		break;
	case Type_v2i32:
		{
			// Two provided elements, repeated twice.
			const int initializer[4] = { (int)i[0], (int)i[1], (int)i[0], (int)i[1] };
			static_assert(sizeof(initializer) == vectorSize, "!");
			ptr = IceConstantData(initializer, vectorSize, alignment);
		}
		break;
	case Type_v2f32:
		{
			// Two provided elements, repeated twice.
			const float initializer[4] = { (float)f[0], (float)f[1], (float)f[0], (float)f[1] };
			static_assert(sizeof(initializer) == vectorSize, "!");
			ptr = IceConstantData(initializer, vectorSize, alignment);
		}
		break;
	case Type_v4i16:
		{
			// Four provided elements, repeated twice.
			const short initializer[8] = { (short)i[0], (short)i[1], (short)i[2], (short)i[3], (short)i[0], (short)i[1], (short)i[2], (short)i[3] };
			static_assert(sizeof(initializer) == vectorSize, "!");
			ptr = IceConstantData(initializer, vectorSize, alignment);
		}
		break;
	case Type_v8i8:
		{
			// Eight provided elements, repeated twice.
			const char initializer[16] = { (char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[4], (char)i[5], (char)i[6], (char)i[7], (char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[4], (char)i[5], (char)i[6], (char)i[7] };
			static_assert(sizeof(initializer) == vectorSize, "!");
			ptr = IceConstantData(initializer, vectorSize, alignment);
		}
		break;
	case Type_v4i8:
		{
			// Four provided elements, repeated four times.
			const char initializer[16] = { (char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[0], (char)i[1], (char)i[2], (char)i[3] };
			static_assert(sizeof(initializer) == vectorSize, "!");
			ptr = IceConstantData(initializer, vectorSize, alignment);
		}
		break;
	default:
		UNREACHABLE("Unknown constant vector type: %d", (int)reinterpret_cast<intptr_t>(type));
	}

	// ptr is still null only if the default case above was taken.
	ASSERT(ptr);

	Ice::Variable *result = sz::createLoad(::function, ::basicBlock, ptr, T(type), alignment);
	return V(result);
}
2188
createConstantVector(const double * constants,Type * type)2189 Value *Nucleus::createConstantVector(const double *constants, Type *type)
2190 {
2191 return createConstantVector((const int64_t *)constants, type);
2192 }
2193
createConstantString(const char * v)2194 Value *Nucleus::createConstantString(const char *v)
2195 {
2196 // NOTE: Do not call RR_DEBUG_INFO_UPDATE_LOC() here to avoid recursion when called from rr::Printv
2197 return V(IceConstantData(v, strlen(v) + 1));
2198 }
2199
// Stores `callback` in the file-scope ::optimizerCallback pointer, replacing
// whatever was stored before. The callback is not invoked here.
void Nucleus::setOptimizerCallback(OptimizerCallback *callback)
{
	::optimizerCallback = callback;
}
2204
type()2205 Type *Void::type()
2206 {
2207 return T(Ice::IceType_void);
2208 }
2209
type()2210 Type *Bool::type()
2211 {
2212 return T(Ice::IceType_i1);
2213 }
2214
type()2215 Type *Byte::type()
2216 {
2217 return T(Ice::IceType_i8);
2218 }
2219
type()2220 Type *SByte::type()
2221 {
2222 return T(Ice::IceType_i8);
2223 }
2224
type()2225 Type *Short::type()
2226 {
2227 return T(Ice::IceType_i16);
2228 }
2229
type()2230 Type *UShort::type()
2231 {
2232 return T(Ice::IceType_i16);
2233 }
2234
type()2235 Type *Byte4::type()
2236 {
2237 return T(Type_v4i8);
2238 }
2239
type()2240 Type *SByte4::type()
2241 {
2242 return T(Type_v4i8);
2243 }
2244
2245 namespace {
SaturateUnsigned(RValue<Short> x)2246 RValue<Byte> SaturateUnsigned(RValue<Short> x)
2247 {
2248 return Byte(IfThenElse(Int(x) > 0xFF, Int(0xFF), IfThenElse(Int(x) < 0, Int(0), Int(x))));
2249 }
2250
Extract(RValue<Byte8> val,int i)2251 RValue<Byte> Extract(RValue<Byte8> val, int i)
2252 {
2253 return RValue<Byte>(Nucleus::createExtractElement(val.value(), Byte::type(), i));
2254 }
2255
Insert(RValue<Byte8> val,RValue<Byte> element,int i)2256 RValue<Byte8> Insert(RValue<Byte8> val, RValue<Byte> element, int i)
2257 {
2258 return RValue<Byte8>(Nucleus::createInsertElement(val.value(), element.value(), i));
2259 }
2260 } // namespace
2261
AddSat(RValue<Byte8> x,RValue<Byte8> y)2262 RValue<Byte8> AddSat(RValue<Byte8> x, RValue<Byte8> y)
2263 {
2264 RR_DEBUG_INFO_UPDATE_LOC();
2265 if(emulateIntrinsics)
2266 {
2267 Byte8 result;
2268 result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 0)) + Int(Extract(y, 0)))), 0);
2269 result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 1)) + Int(Extract(y, 1)))), 1);
2270 result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 2)) + Int(Extract(y, 2)))), 2);
2271 result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 3)) + Int(Extract(y, 3)))), 3);
2272 result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 4)) + Int(Extract(y, 4)))), 4);
2273 result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 5)) + Int(Extract(y, 5)))), 5);
2274 result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 6)) + Int(Extract(y, 6)))), 6);
2275 result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 7)) + Int(Extract(y, 7)))), 7);
2276
2277 return result;
2278 }
2279 else
2280 {
2281 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
2282 const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::AddSaturateUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
2283 auto paddusb = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
2284 paddusb->addArg(x.value());
2285 paddusb->addArg(y.value());
2286 ::basicBlock->appendInst(paddusb);
2287
2288 return RValue<Byte8>(V(result));
2289 }
2290 }
2291
SubSat(RValue<Byte8> x,RValue<Byte8> y)2292 RValue<Byte8> SubSat(RValue<Byte8> x, RValue<Byte8> y)
2293 {
2294 RR_DEBUG_INFO_UPDATE_LOC();
2295 if(emulateIntrinsics)
2296 {
2297 Byte8 result;
2298 result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 0)) - Int(Extract(y, 0)))), 0);
2299 result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 1)) - Int(Extract(y, 1)))), 1);
2300 result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 2)) - Int(Extract(y, 2)))), 2);
2301 result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 3)) - Int(Extract(y, 3)))), 3);
2302 result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 4)) - Int(Extract(y, 4)))), 4);
2303 result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 5)) - Int(Extract(y, 5)))), 5);
2304 result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 6)) - Int(Extract(y, 6)))), 6);
2305 result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 7)) - Int(Extract(y, 7)))), 7);
2306
2307 return result;
2308 }
2309 else
2310 {
2311 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
2312 const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::SubtractSaturateUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
2313 auto psubusw = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
2314 psubusw->addArg(x.value());
2315 psubusw->addArg(y.value());
2316 ::basicBlock->appendInst(psubusw);
2317
2318 return RValue<Byte8>(V(result));
2319 }
2320 }
2321
Extract(RValue<SByte8> val,int i)2322 RValue<SByte> Extract(RValue<SByte8> val, int i)
2323 {
2324 RR_DEBUG_INFO_UPDATE_LOC();
2325 return RValue<SByte>(Nucleus::createExtractElement(val.value(), SByte::type(), i));
2326 }
2327
Insert(RValue<SByte8> val,RValue<SByte> element,int i)2328 RValue<SByte8> Insert(RValue<SByte8> val, RValue<SByte> element, int i)
2329 {
2330 RR_DEBUG_INFO_UPDATE_LOC();
2331 return RValue<SByte8>(Nucleus::createInsertElement(val.value(), element.value(), i));
2332 }
2333
// Shifts every element of an SByte8 vector right by the constant rhs.
// Three code paths exist:
//  - emulateIntrinsics: each of the 8 elements is extracted, shifted as a
//    scalar SByte and re-inserted.
//  - x86 builds (__i386__ / __x86_64__): the vector is processed as 16-bit
//    lanes and the two bytes of each lane are recombined (see below).
//  - otherwise: a single vector arithmetic shift is emitted.
RValue<SByte8> operator>>(RValue<SByte8> lhs, unsigned char rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	if(emulateIntrinsics)
	{
		SByte8 result;
		result = Insert(result, Extract(lhs, 0) >> SByte(rhs), 0);
		result = Insert(result, Extract(lhs, 1) >> SByte(rhs), 1);
		result = Insert(result, Extract(lhs, 2) >> SByte(rhs), 2);
		result = Insert(result, Extract(lhs, 3) >> SByte(rhs), 3);
		result = Insert(result, Extract(lhs, 4) >> SByte(rhs), 4);
		result = Insert(result, Extract(lhs, 5) >> SByte(rhs), 5);
		result = Insert(result, Extract(lhs, 6) >> SByte(rhs), 6);
		result = Insert(result, Extract(lhs, 7) >> SByte(rhs), 7);

		return result;
	}
	else
	{
#if defined(__i386__) || defined(__x86_64__)
		// SSE2 doesn't support byte vector shifts, so shift as shorts and recombine.
		// hi: shift each 16-bit lane and keep only its upper byte.
		RValue<Short4> hi = (As<Short4>(lhs) >> rhs) & Short4(0xFF00u);
		// lo: move the lower byte into the upper half of its lane, shift it
		// there, then bring it back down with an unsigned (UShort4) shift.
		RValue<Short4> lo = As<Short4>(As<UShort4>((As<Short4>(lhs) << 8) >> rhs) >> 8);

		return As<SByte8>(hi | lo);
#else
		return RValue<SByte8>(Nucleus::createAShr(lhs.value(), V(::context->getConstantInt32(rhs))));
#endif
	}
}
2364
SignMask(RValue<Byte8> x)2365 RValue<Int> SignMask(RValue<Byte8> x)
2366 {
2367 RR_DEBUG_INFO_UPDATE_LOC();
2368 if(emulateIntrinsics || CPUID::ARM)
2369 {
2370 Byte8 xx = As<Byte8>(As<SByte8>(x) >> 7) & Byte8(0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80);
2371 return Int(Extract(xx, 0)) | Int(Extract(xx, 1)) | Int(Extract(xx, 2)) | Int(Extract(xx, 3)) | Int(Extract(xx, 4)) | Int(Extract(xx, 5)) | Int(Extract(xx, 6)) | Int(Extract(xx, 7));
2372 }
2373 else
2374 {
2375 Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
2376 const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::SignMask, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
2377 auto movmsk = Ice::InstIntrinsic::create(::function, 1, result, intrinsic);
2378 movmsk->addArg(x.value());
2379 ::basicBlock->appendInst(movmsk);
2380
2381 return RValue<Int>(V(result)) & 0xFF;
2382 }
2383 }
2384
2385 // RValue<Byte8> CmpGT(RValue<Byte8> x, RValue<Byte8> y)
2386 // {
2387 // return RValue<Byte8>(createIntCompare(Ice::InstIcmp::Ugt, x.value(), y.value()));
2388 // }
2389
CmpEQ(RValue<Byte8> x,RValue<Byte8> y)2390 RValue<Byte8> CmpEQ(RValue<Byte8> x, RValue<Byte8> y)
2391 {
2392 RR_DEBUG_INFO_UPDATE_LOC();
2393 return RValue<Byte8>(Nucleus::createICmpEQ(x.value(), y.value()));
2394 }
2395
type()2396 Type *Byte8::type()
2397 {
2398 return T(Type_v8i8);
2399 }
2400
2401 // RValue<SByte8> operator<<(RValue<SByte8> lhs, unsigned char rhs)
2402 // {
2403 // return RValue<SByte8>(Nucleus::createShl(lhs.value(), V(::context->getConstantInt32(rhs))));
2404 // }
2405
2406 // RValue<SByte8> operator>>(RValue<SByte8> lhs, unsigned char rhs)
2407 // {
2408 // return RValue<SByte8>(Nucleus::createAShr(lhs.value(), V(::context->getConstantInt32(rhs))));
2409 // }
2410
SaturateSigned(RValue<Short> x)2411 RValue<SByte> SaturateSigned(RValue<Short> x)
2412 {
2413 RR_DEBUG_INFO_UPDATE_LOC();
2414 return SByte(IfThenElse(Int(x) > 0x7F, Int(0x7F), IfThenElse(Int(x) < -0x80, Int(0x80), Int(x))));
2415 }
2416
AddSat(RValue<SByte8> x,RValue<SByte8> y)2417 RValue<SByte8> AddSat(RValue<SByte8> x, RValue<SByte8> y)
2418 {
2419 RR_DEBUG_INFO_UPDATE_LOC();
2420 if(emulateIntrinsics)
2421 {
2422 SByte8 result;
2423 result = Insert(result, SaturateSigned(Short(Int(Extract(x, 0)) + Int(Extract(y, 0)))), 0);
2424 result = Insert(result, SaturateSigned(Short(Int(Extract(x, 1)) + Int(Extract(y, 1)))), 1);
2425 result = Insert(result, SaturateSigned(Short(Int(Extract(x, 2)) + Int(Extract(y, 2)))), 2);
2426 result = Insert(result, SaturateSigned(Short(Int(Extract(x, 3)) + Int(Extract(y, 3)))), 3);
2427 result = Insert(result, SaturateSigned(Short(Int(Extract(x, 4)) + Int(Extract(y, 4)))), 4);
2428 result = Insert(result, SaturateSigned(Short(Int(Extract(x, 5)) + Int(Extract(y, 5)))), 5);
2429 result = Insert(result, SaturateSigned(Short(Int(Extract(x, 6)) + Int(Extract(y, 6)))), 6);
2430 result = Insert(result, SaturateSigned(Short(Int(Extract(x, 7)) + Int(Extract(y, 7)))), 7);
2431
2432 return result;
2433 }
2434 else
2435 {
2436 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
2437 const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::AddSaturateSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
2438 auto paddsb = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
2439 paddsb->addArg(x.value());
2440 paddsb->addArg(y.value());
2441 ::basicBlock->appendInst(paddsb);
2442
2443 return RValue<SByte8>(V(result));
2444 }
2445 }
2446
SubSat(RValue<SByte8> x,RValue<SByte8> y)2447 RValue<SByte8> SubSat(RValue<SByte8> x, RValue<SByte8> y)
2448 {
2449 RR_DEBUG_INFO_UPDATE_LOC();
2450 if(emulateIntrinsics)
2451 {
2452 SByte8 result;
2453 result = Insert(result, SaturateSigned(Short(Int(Extract(x, 0)) - Int(Extract(y, 0)))), 0);
2454 result = Insert(result, SaturateSigned(Short(Int(Extract(x, 1)) - Int(Extract(y, 1)))), 1);
2455 result = Insert(result, SaturateSigned(Short(Int(Extract(x, 2)) - Int(Extract(y, 2)))), 2);
2456 result = Insert(result, SaturateSigned(Short(Int(Extract(x, 3)) - Int(Extract(y, 3)))), 3);
2457 result = Insert(result, SaturateSigned(Short(Int(Extract(x, 4)) - Int(Extract(y, 4)))), 4);
2458 result = Insert(result, SaturateSigned(Short(Int(Extract(x, 5)) - Int(Extract(y, 5)))), 5);
2459 result = Insert(result, SaturateSigned(Short(Int(Extract(x, 6)) - Int(Extract(y, 6)))), 6);
2460 result = Insert(result, SaturateSigned(Short(Int(Extract(x, 7)) - Int(Extract(y, 7)))), 7);
2461
2462 return result;
2463 }
2464 else
2465 {
2466 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
2467 const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::SubtractSaturateSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
2468 auto psubsb = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
2469 psubsb->addArg(x.value());
2470 psubsb->addArg(y.value());
2471 ::basicBlock->appendInst(psubsb);
2472
2473 return RValue<SByte8>(V(result));
2474 }
2475 }
2476
SignMask(RValue<SByte8> x)2477 RValue<Int> SignMask(RValue<SByte8> x)
2478 {
2479 RR_DEBUG_INFO_UPDATE_LOC();
2480 if(emulateIntrinsics || CPUID::ARM)
2481 {
2482 SByte8 xx = (x >> 7) & SByte8(0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80);
2483 return Int(Extract(xx, 0)) | Int(Extract(xx, 1)) | Int(Extract(xx, 2)) | Int(Extract(xx, 3)) | Int(Extract(xx, 4)) | Int(Extract(xx, 5)) | Int(Extract(xx, 6)) | Int(Extract(xx, 7));
2484 }
2485 else
2486 {
2487 Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
2488 const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::SignMask, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
2489 auto movmsk = Ice::InstIntrinsic::create(::function, 1, result, intrinsic);
2490 movmsk->addArg(x.value());
2491 ::basicBlock->appendInst(movmsk);
2492
2493 return RValue<Int>(V(result)) & 0xFF;
2494 }
2495 }
2496
CmpGT(RValue<SByte8> x,RValue<SByte8> y)2497 RValue<Byte8> CmpGT(RValue<SByte8> x, RValue<SByte8> y)
2498 {
2499 RR_DEBUG_INFO_UPDATE_LOC();
2500 return RValue<Byte8>(createIntCompare(Ice::InstIcmp::Sgt, x.value(), y.value()));
2501 }
2502
CmpEQ(RValue<SByte8> x,RValue<SByte8> y)2503 RValue<Byte8> CmpEQ(RValue<SByte8> x, RValue<SByte8> y)
2504 {
2505 RR_DEBUG_INFO_UPDATE_LOC();
2506 return RValue<Byte8>(Nucleus::createICmpEQ(x.value(), y.value()));
2507 }
2508
type()2509 Type *SByte8::type()
2510 {
2511 return T(Type_v8i8);
2512 }
2513
type()2514 Type *Byte16::type()
2515 {
2516 return T(Ice::IceType_v16i8);
2517 }
2518
type()2519 Type *SByte16::type()
2520 {
2521 return T(Ice::IceType_v16i8);
2522 }
2523
type()2524 Type *Short2::type()
2525 {
2526 return T(Type_v2i16);
2527 }
2528
type()2529 Type *UShort2::type()
2530 {
2531 return T(Type_v2i16);
2532 }
2533
Short4(RValue<Int4> cast)2534 Short4::Short4(RValue<Int4> cast)
2535 {
2536 int select[8] = { 0, 2, 4, 6, 0, 2, 4, 6 };
2537 Value *short8 = Nucleus::createBitCast(cast.value(), Short8::type());
2538 Value *packed = Nucleus::createShuffleVector(short8, short8, select);
2539
2540 Value *int2 = RValue<Int2>(Int2(As<Int4>(packed))).value();
2541 Value *short4 = Nucleus::createBitCast(int2, Short4::type());
2542
2543 storeValue(short4);
2544 }
2545
2546 // Short4::Short4(RValue<Float> cast)
2547 // {
2548 // }
2549
Short4(RValue<Float4> cast)2550 Short4::Short4(RValue<Float4> cast)
2551 {
2552 // TODO(b/150791192): Generalize and optimize
2553 auto smin = std::numeric_limits<short>::min();
2554 auto smax = std::numeric_limits<short>::max();
2555 *this = Short4(Int4(Max(Min(cast, Float4(smax)), Float4(smin))));
2556 }
2557
operator <<(RValue<Short4> lhs,unsigned char rhs)2558 RValue<Short4> operator<<(RValue<Short4> lhs, unsigned char rhs)
2559 {
2560 RR_DEBUG_INFO_UPDATE_LOC();
2561 if(emulateIntrinsics)
2562 {
2563 Short4 result;
2564 result = Insert(result, Extract(lhs, 0) << Short(rhs), 0);
2565 result = Insert(result, Extract(lhs, 1) << Short(rhs), 1);
2566 result = Insert(result, Extract(lhs, 2) << Short(rhs), 2);
2567 result = Insert(result, Extract(lhs, 3) << Short(rhs), 3);
2568
2569 return result;
2570 }
2571 else
2572 {
2573 return RValue<Short4>(Nucleus::createShl(lhs.value(), V(::context->getConstantInt32(rhs))));
2574 }
2575 }
2576
operator >>(RValue<Short4> lhs,unsigned char rhs)2577 RValue<Short4> operator>>(RValue<Short4> lhs, unsigned char rhs)
2578 {
2579 RR_DEBUG_INFO_UPDATE_LOC();
2580 if(emulateIntrinsics)
2581 {
2582 Short4 result;
2583 result = Insert(result, Extract(lhs, 0) >> Short(rhs), 0);
2584 result = Insert(result, Extract(lhs, 1) >> Short(rhs), 1);
2585 result = Insert(result, Extract(lhs, 2) >> Short(rhs), 2);
2586 result = Insert(result, Extract(lhs, 3) >> Short(rhs), 3);
2587
2588 return result;
2589 }
2590 else
2591 {
2592 return RValue<Short4>(Nucleus::createAShr(lhs.value(), V(::context->getConstantInt32(rhs))));
2593 }
2594 }
2595
Max(RValue<Short4> x,RValue<Short4> y)2596 RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y)
2597 {
2598 RR_DEBUG_INFO_UPDATE_LOC();
2599 Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v8i1);
2600 auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Sle, condition, x.value(), y.value());
2601 ::basicBlock->appendInst(cmp);
2602
2603 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
2604 auto select = Ice::InstSelect::create(::function, result, condition, y.value(), x.value());
2605 ::basicBlock->appendInst(select);
2606
2607 return RValue<Short4>(V(result));
2608 }
2609
Min(RValue<Short4> x,RValue<Short4> y)2610 RValue<Short4> Min(RValue<Short4> x, RValue<Short4> y)
2611 {
2612 RR_DEBUG_INFO_UPDATE_LOC();
2613 Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v8i1);
2614 auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Sgt, condition, x.value(), y.value());
2615 ::basicBlock->appendInst(cmp);
2616
2617 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
2618 auto select = Ice::InstSelect::create(::function, result, condition, y.value(), x.value());
2619 ::basicBlock->appendInst(select);
2620
2621 return RValue<Short4>(V(result));
2622 }
2623
SaturateSigned(RValue<Int> x)2624 RValue<Short> SaturateSigned(RValue<Int> x)
2625 {
2626 RR_DEBUG_INFO_UPDATE_LOC();
2627 return Short(IfThenElse(x > 0x7FFF, Int(0x7FFF), IfThenElse(x < -0x8000, Int(0x8000), x)));
2628 }
2629
AddSat(RValue<Short4> x,RValue<Short4> y)2630 RValue<Short4> AddSat(RValue<Short4> x, RValue<Short4> y)
2631 {
2632 RR_DEBUG_INFO_UPDATE_LOC();
2633 if(emulateIntrinsics)
2634 {
2635 Short4 result;
2636 result = Insert(result, SaturateSigned(Int(Extract(x, 0)) + Int(Extract(y, 0))), 0);
2637 result = Insert(result, SaturateSigned(Int(Extract(x, 1)) + Int(Extract(y, 1))), 1);
2638 result = Insert(result, SaturateSigned(Int(Extract(x, 2)) + Int(Extract(y, 2))), 2);
2639 result = Insert(result, SaturateSigned(Int(Extract(x, 3)) + Int(Extract(y, 3))), 3);
2640
2641 return result;
2642 }
2643 else
2644 {
2645 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
2646 const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::AddSaturateSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
2647 auto paddsw = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
2648 paddsw->addArg(x.value());
2649 paddsw->addArg(y.value());
2650 ::basicBlock->appendInst(paddsw);
2651
2652 return RValue<Short4>(V(result));
2653 }
2654 }
2655
SubSat(RValue<Short4> x,RValue<Short4> y)2656 RValue<Short4> SubSat(RValue<Short4> x, RValue<Short4> y)
2657 {
2658 RR_DEBUG_INFO_UPDATE_LOC();
2659 if(emulateIntrinsics)
2660 {
2661 Short4 result;
2662 result = Insert(result, SaturateSigned(Int(Extract(x, 0)) - Int(Extract(y, 0))), 0);
2663 result = Insert(result, SaturateSigned(Int(Extract(x, 1)) - Int(Extract(y, 1))), 1);
2664 result = Insert(result, SaturateSigned(Int(Extract(x, 2)) - Int(Extract(y, 2))), 2);
2665 result = Insert(result, SaturateSigned(Int(Extract(x, 3)) - Int(Extract(y, 3))), 3);
2666
2667 return result;
2668 }
2669 else
2670 {
2671 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
2672 const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::SubtractSaturateSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
2673 auto psubsw = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
2674 psubsw->addArg(x.value());
2675 psubsw->addArg(y.value());
2676 ::basicBlock->appendInst(psubsw);
2677
2678 return RValue<Short4>(V(result));
2679 }
2680 }
2681
MulHigh(RValue<Short4> x,RValue<Short4> y)2682 RValue<Short4> MulHigh(RValue<Short4> x, RValue<Short4> y)
2683 {
2684 RR_DEBUG_INFO_UPDATE_LOC();
2685 if(emulateIntrinsics)
2686 {
2687 Short4 result;
2688 result = Insert(result, Short((Int(Extract(x, 0)) * Int(Extract(y, 0))) >> 16), 0);
2689 result = Insert(result, Short((Int(Extract(x, 1)) * Int(Extract(y, 1))) >> 16), 1);
2690 result = Insert(result, Short((Int(Extract(x, 2)) * Int(Extract(y, 2))) >> 16), 2);
2691 result = Insert(result, Short((Int(Extract(x, 3)) * Int(Extract(y, 3))) >> 16), 3);
2692
2693 return result;
2694 }
2695 else
2696 {
2697 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
2698 const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::MultiplyHighSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
2699 auto pmulhw = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
2700 pmulhw->addArg(x.value());
2701 pmulhw->addArg(y.value());
2702 ::basicBlock->appendInst(pmulhw);
2703
2704 return RValue<Short4>(V(result));
2705 }
2706 }
2707
MulAdd(RValue<Short4> x,RValue<Short4> y)2708 RValue<Int2> MulAdd(RValue<Short4> x, RValue<Short4> y)
2709 {
2710 RR_DEBUG_INFO_UPDATE_LOC();
2711 if(emulateIntrinsics)
2712 {
2713 Int2 result;
2714 result = Insert(result, Int(Extract(x, 0)) * Int(Extract(y, 0)) + Int(Extract(x, 1)) * Int(Extract(y, 1)), 0);
2715 result = Insert(result, Int(Extract(x, 2)) * Int(Extract(y, 2)) + Int(Extract(x, 3)) * Int(Extract(y, 3)), 1);
2716
2717 return result;
2718 }
2719 else
2720 {
2721 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
2722 const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::MultiplyAddPairs, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
2723 auto pmaddwd = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
2724 pmaddwd->addArg(x.value());
2725 pmaddwd->addArg(y.value());
2726 ::basicBlock->appendInst(pmaddwd);
2727
2728 return As<Int2>(V(result));
2729 }
2730 }
2731
PackSigned(RValue<Short4> x,RValue<Short4> y)2732 RValue<SByte8> PackSigned(RValue<Short4> x, RValue<Short4> y)
2733 {
2734 RR_DEBUG_INFO_UPDATE_LOC();
2735 if(emulateIntrinsics)
2736 {
2737 SByte8 result;
2738 result = Insert(result, SaturateSigned(Extract(x, 0)), 0);
2739 result = Insert(result, SaturateSigned(Extract(x, 1)), 1);
2740 result = Insert(result, SaturateSigned(Extract(x, 2)), 2);
2741 result = Insert(result, SaturateSigned(Extract(x, 3)), 3);
2742 result = Insert(result, SaturateSigned(Extract(y, 0)), 4);
2743 result = Insert(result, SaturateSigned(Extract(y, 1)), 5);
2744 result = Insert(result, SaturateSigned(Extract(y, 2)), 6);
2745 result = Insert(result, SaturateSigned(Extract(y, 3)), 7);
2746
2747 return result;
2748 }
2749 else
2750 {
2751 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
2752 const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::VectorPackSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
2753 auto pack = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
2754 pack->addArg(x.value());
2755 pack->addArg(y.value());
2756 ::basicBlock->appendInst(pack);
2757
2758 return As<SByte8>(Swizzle(As<Int4>(V(result)), 0x0202));
2759 }
2760 }
2761
PackUnsigned(RValue<Short4> x,RValue<Short4> y)2762 RValue<Byte8> PackUnsigned(RValue<Short4> x, RValue<Short4> y)
2763 {
2764 RR_DEBUG_INFO_UPDATE_LOC();
2765 if(emulateIntrinsics)
2766 {
2767 Byte8 result;
2768 result = Insert(result, SaturateUnsigned(Extract(x, 0)), 0);
2769 result = Insert(result, SaturateUnsigned(Extract(x, 1)), 1);
2770 result = Insert(result, SaturateUnsigned(Extract(x, 2)), 2);
2771 result = Insert(result, SaturateUnsigned(Extract(x, 3)), 3);
2772 result = Insert(result, SaturateUnsigned(Extract(y, 0)), 4);
2773 result = Insert(result, SaturateUnsigned(Extract(y, 1)), 5);
2774 result = Insert(result, SaturateUnsigned(Extract(y, 2)), 6);
2775 result = Insert(result, SaturateUnsigned(Extract(y, 3)), 7);
2776
2777 return result;
2778 }
2779 else
2780 {
2781 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
2782 const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::VectorPackUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
2783 auto pack = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
2784 pack->addArg(x.value());
2785 pack->addArg(y.value());
2786 ::basicBlock->appendInst(pack);
2787
2788 return As<Byte8>(Swizzle(As<Int4>(V(result)), 0x0202));
2789 }
2790 }
2791
CmpGT(RValue<Short4> x,RValue<Short4> y)2792 RValue<Short4> CmpGT(RValue<Short4> x, RValue<Short4> y)
2793 {
2794 RR_DEBUG_INFO_UPDATE_LOC();
2795 return RValue<Short4>(createIntCompare(Ice::InstIcmp::Sgt, x.value(), y.value()));
2796 }
2797
CmpEQ(RValue<Short4> x,RValue<Short4> y)2798 RValue<Short4> CmpEQ(RValue<Short4> x, RValue<Short4> y)
2799 {
2800 RR_DEBUG_INFO_UPDATE_LOC();
2801 return RValue<Short4>(Nucleus::createICmpEQ(x.value(), y.value()));
2802 }
2803
type()2804 Type *Short4::type()
2805 {
2806 return T(Type_v4i16);
2807 }
2808
UShort4(RValue<Float4> cast,bool saturate)2809 UShort4::UShort4(RValue<Float4> cast, bool saturate)
2810 {
2811 if(saturate)
2812 {
2813 if(CPUID::SSE4_1)
2814 {
2815 // x86 produces 0x80000000 on 32-bit integer overflow/underflow.
2816 // PackUnsigned takes care of 0x0000 saturation.
2817 Int4 int4(Min(cast, Float4(0xFFFF)));
2818 *this = As<UShort4>(PackUnsigned(int4, int4));
2819 }
2820 else if(CPUID::ARM)
2821 {
2822 // ARM saturates the 32-bit integer result on overflow/undeflow.
2823 Int4 int4(cast);
2824 *this = As<UShort4>(PackUnsigned(int4, int4));
2825 }
2826 else
2827 {
2828 *this = Short4(Int4(Max(Min(cast, Float4(0xFFFF)), Float4(0x0000))));
2829 }
2830 }
2831 else
2832 {
2833 *this = Short4(Int4(cast));
2834 }
2835 }
2836
Extract(RValue<UShort4> val,int i)2837 RValue<UShort> Extract(RValue<UShort4> val, int i)
2838 {
2839 return RValue<UShort>(Nucleus::createExtractElement(val.value(), UShort::type(), i));
2840 }
2841
Insert(RValue<UShort4> val,RValue<UShort> element,int i)2842 RValue<UShort4> Insert(RValue<UShort4> val, RValue<UShort> element, int i)
2843 {
2844 return RValue<UShort4>(Nucleus::createInsertElement(val.value(), element.value(), i));
2845 }
2846
operator <<(RValue<UShort4> lhs,unsigned char rhs)2847 RValue<UShort4> operator<<(RValue<UShort4> lhs, unsigned char rhs)
2848 {
2849 RR_DEBUG_INFO_UPDATE_LOC();
2850 if(emulateIntrinsics)
2851
2852 {
2853 UShort4 result;
2854 result = Insert(result, Extract(lhs, 0) << UShort(rhs), 0);
2855 result = Insert(result, Extract(lhs, 1) << UShort(rhs), 1);
2856 result = Insert(result, Extract(lhs, 2) << UShort(rhs), 2);
2857 result = Insert(result, Extract(lhs, 3) << UShort(rhs), 3);
2858
2859 return result;
2860 }
2861 else
2862 {
2863 return RValue<UShort4>(Nucleus::createShl(lhs.value(), V(::context->getConstantInt32(rhs))));
2864 }
2865 }
2866
operator >>(RValue<UShort4> lhs,unsigned char rhs)2867 RValue<UShort4> operator>>(RValue<UShort4> lhs, unsigned char rhs)
2868 {
2869 RR_DEBUG_INFO_UPDATE_LOC();
2870 if(emulateIntrinsics)
2871 {
2872 UShort4 result;
2873 result = Insert(result, Extract(lhs, 0) >> UShort(rhs), 0);
2874 result = Insert(result, Extract(lhs, 1) >> UShort(rhs), 1);
2875 result = Insert(result, Extract(lhs, 2) >> UShort(rhs), 2);
2876 result = Insert(result, Extract(lhs, 3) >> UShort(rhs), 3);
2877
2878 return result;
2879 }
2880 else
2881 {
2882 return RValue<UShort4>(Nucleus::createLShr(lhs.value(), V(::context->getConstantInt32(rhs))));
2883 }
2884 }
2885
Max(RValue<UShort4> x,RValue<UShort4> y)2886 RValue<UShort4> Max(RValue<UShort4> x, RValue<UShort4> y)
2887 {
2888 RR_DEBUG_INFO_UPDATE_LOC();
2889 Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v8i1);
2890 auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Ule, condition, x.value(), y.value());
2891 ::basicBlock->appendInst(cmp);
2892
2893 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
2894 auto select = Ice::InstSelect::create(::function, result, condition, y.value(), x.value());
2895 ::basicBlock->appendInst(select);
2896
2897 return RValue<UShort4>(V(result));
2898 }
2899
Min(RValue<UShort4> x,RValue<UShort4> y)2900 RValue<UShort4> Min(RValue<UShort4> x, RValue<UShort4> y)
2901 {
2902 Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v8i1);
2903 auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Ugt, condition, x.value(), y.value());
2904 ::basicBlock->appendInst(cmp);
2905
2906 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
2907 auto select = Ice::InstSelect::create(::function, result, condition, y.value(), x.value());
2908 ::basicBlock->appendInst(select);
2909
2910 return RValue<UShort4>(V(result));
2911 }
2912
SaturateUnsigned(RValue<Int> x)2913 RValue<UShort> SaturateUnsigned(RValue<Int> x)
2914 {
2915 RR_DEBUG_INFO_UPDATE_LOC();
2916 return UShort(IfThenElse(x > 0xFFFF, Int(0xFFFF), IfThenElse(x < 0, Int(0), x)));
2917 }
2918
AddSat(RValue<UShort4> x,RValue<UShort4> y)2919 RValue<UShort4> AddSat(RValue<UShort4> x, RValue<UShort4> y)
2920 {
2921 RR_DEBUG_INFO_UPDATE_LOC();
2922 if(emulateIntrinsics)
2923 {
2924 UShort4 result;
2925 result = Insert(result, SaturateUnsigned(Int(Extract(x, 0)) + Int(Extract(y, 0))), 0);
2926 result = Insert(result, SaturateUnsigned(Int(Extract(x, 1)) + Int(Extract(y, 1))), 1);
2927 result = Insert(result, SaturateUnsigned(Int(Extract(x, 2)) + Int(Extract(y, 2))), 2);
2928 result = Insert(result, SaturateUnsigned(Int(Extract(x, 3)) + Int(Extract(y, 3))), 3);
2929
2930 return result;
2931 }
2932 else
2933 {
2934 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
2935 const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::AddSaturateUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
2936 auto paddusw = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
2937 paddusw->addArg(x.value());
2938 paddusw->addArg(y.value());
2939 ::basicBlock->appendInst(paddusw);
2940
2941 return RValue<UShort4>(V(result));
2942 }
2943 }
2944
SubSat(RValue<UShort4> x,RValue<UShort4> y)2945 RValue<UShort4> SubSat(RValue<UShort4> x, RValue<UShort4> y)
2946 {
2947 RR_DEBUG_INFO_UPDATE_LOC();
2948 if(emulateIntrinsics)
2949 {
2950 UShort4 result;
2951 result = Insert(result, SaturateUnsigned(Int(Extract(x, 0)) - Int(Extract(y, 0))), 0);
2952 result = Insert(result, SaturateUnsigned(Int(Extract(x, 1)) - Int(Extract(y, 1))), 1);
2953 result = Insert(result, SaturateUnsigned(Int(Extract(x, 2)) - Int(Extract(y, 2))), 2);
2954 result = Insert(result, SaturateUnsigned(Int(Extract(x, 3)) - Int(Extract(y, 3))), 3);
2955
2956 return result;
2957 }
2958 else
2959 {
2960 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
2961 const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::SubtractSaturateUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
2962 auto psubusw = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
2963 psubusw->addArg(x.value());
2964 psubusw->addArg(y.value());
2965 ::basicBlock->appendInst(psubusw);
2966
2967 return RValue<UShort4>(V(result));
2968 }
2969 }
2970
MulHigh(RValue<UShort4> x,RValue<UShort4> y)2971 RValue<UShort4> MulHigh(RValue<UShort4> x, RValue<UShort4> y)
2972 {
2973 RR_DEBUG_INFO_UPDATE_LOC();
2974 if(emulateIntrinsics)
2975 {
2976 UShort4 result;
2977 result = Insert(result, UShort((UInt(Extract(x, 0)) * UInt(Extract(y, 0))) >> 16), 0);
2978 result = Insert(result, UShort((UInt(Extract(x, 1)) * UInt(Extract(y, 1))) >> 16), 1);
2979 result = Insert(result, UShort((UInt(Extract(x, 2)) * UInt(Extract(y, 2))) >> 16), 2);
2980 result = Insert(result, UShort((UInt(Extract(x, 3)) * UInt(Extract(y, 3))) >> 16), 3);
2981
2982 return result;
2983 }
2984 else
2985 {
2986 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
2987 const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::MultiplyHighUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
2988 auto pmulhuw = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
2989 pmulhuw->addArg(x.value());
2990 pmulhuw->addArg(y.value());
2991 ::basicBlock->appendInst(pmulhuw);
2992
2993 return RValue<UShort4>(V(result));
2994 }
2995 }
2996
MulHigh(RValue<Int4> x,RValue<Int4> y)2997 RValue<Int4> MulHigh(RValue<Int4> x, RValue<Int4> y)
2998 {
2999 RR_DEBUG_INFO_UPDATE_LOC();
3000 // TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
3001
3002 // Scalarized implementation.
3003 Int4 result;
3004 result = Insert(result, Int((Long(Extract(x, 0)) * Long(Extract(y, 0))) >> Long(Int(32))), 0);
3005 result = Insert(result, Int((Long(Extract(x, 1)) * Long(Extract(y, 1))) >> Long(Int(32))), 1);
3006 result = Insert(result, Int((Long(Extract(x, 2)) * Long(Extract(y, 2))) >> Long(Int(32))), 2);
3007 result = Insert(result, Int((Long(Extract(x, 3)) * Long(Extract(y, 3))) >> Long(Int(32))), 3);
3008
3009 return result;
3010 }
3011
MulHigh(RValue<UInt4> x,RValue<UInt4> y)3012 RValue<UInt4> MulHigh(RValue<UInt4> x, RValue<UInt4> y)
3013 {
3014 RR_DEBUG_INFO_UPDATE_LOC();
3015 // TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
3016
3017 if(false) // Partial product based implementation.
3018 {
3019 auto xh = x >> 16;
3020 auto yh = y >> 16;
3021 auto xl = x & UInt4(0x0000FFFF);
3022 auto yl = y & UInt4(0x0000FFFF);
3023 auto xlyh = xl * yh;
3024 auto xhyl = xh * yl;
3025 auto xlyhh = xlyh >> 16;
3026 auto xhylh = xhyl >> 16;
3027 auto xlyhl = xlyh & UInt4(0x0000FFFF);
3028 auto xhyll = xhyl & UInt4(0x0000FFFF);
3029 auto xlylh = (xl * yl) >> 16;
3030 auto oflow = (xlyhl + xhyll + xlylh) >> 16;
3031
3032 return (xh * yh) + (xlyhh + xhylh) + oflow;
3033 }
3034
3035 // Scalarized implementation.
3036 Int4 result;
3037 result = Insert(result, Int((Long(UInt(Extract(As<Int4>(x), 0))) * Long(UInt(Extract(As<Int4>(y), 0)))) >> Long(Int(32))), 0);
3038 result = Insert(result, Int((Long(UInt(Extract(As<Int4>(x), 1))) * Long(UInt(Extract(As<Int4>(y), 1)))) >> Long(Int(32))), 1);
3039 result = Insert(result, Int((Long(UInt(Extract(As<Int4>(x), 2))) * Long(UInt(Extract(As<Int4>(y), 2)))) >> Long(Int(32))), 2);
3040 result = Insert(result, Int((Long(UInt(Extract(As<Int4>(x), 3))) * Long(UInt(Extract(As<Int4>(y), 3)))) >> Long(Int(32))), 3);
3041
3042 return As<UInt4>(result);
3043 }
3044
Average(RValue<UShort4> x,RValue<UShort4> y)3045 RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)
3046 {
3047 RR_DEBUG_INFO_UPDATE_LOC();
3048 UNIMPLEMENTED_NO_BUG("RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)");
3049 return UShort4(0);
3050 }
3051
type()3052 Type *UShort4::type()
3053 {
3054 return T(Type_v4i16);
3055 }
3056
Extract(RValue<Short8> val,int i)3057 RValue<Short> Extract(RValue<Short8> val, int i)
3058 {
3059 RR_DEBUG_INFO_UPDATE_LOC();
3060 return RValue<Short>(Nucleus::createExtractElement(val.value(), Short::type(), i));
3061 }
3062
Insert(RValue<Short8> val,RValue<Short> element,int i)3063 RValue<Short8> Insert(RValue<Short8> val, RValue<Short> element, int i)
3064 {
3065 RR_DEBUG_INFO_UPDATE_LOC();
3066 return RValue<Short8>(Nucleus::createInsertElement(val.value(), element.value(), i));
3067 }
3068
operator <<(RValue<Short8> lhs,unsigned char rhs)3069 RValue<Short8> operator<<(RValue<Short8> lhs, unsigned char rhs)
3070 {
3071 RR_DEBUG_INFO_UPDATE_LOC();
3072 if(emulateIntrinsics)
3073 {
3074 Short8 result;
3075 result = Insert(result, Extract(lhs, 0) << Short(rhs), 0);
3076 result = Insert(result, Extract(lhs, 1) << Short(rhs), 1);
3077 result = Insert(result, Extract(lhs, 2) << Short(rhs), 2);
3078 result = Insert(result, Extract(lhs, 3) << Short(rhs), 3);
3079 result = Insert(result, Extract(lhs, 4) << Short(rhs), 4);
3080 result = Insert(result, Extract(lhs, 5) << Short(rhs), 5);
3081 result = Insert(result, Extract(lhs, 6) << Short(rhs), 6);
3082 result = Insert(result, Extract(lhs, 7) << Short(rhs), 7);
3083
3084 return result;
3085 }
3086 else
3087 {
3088 return RValue<Short8>(Nucleus::createShl(lhs.value(), V(::context->getConstantInt32(rhs))));
3089 }
3090 }
3091
operator >>(RValue<Short8> lhs,unsigned char rhs)3092 RValue<Short8> operator>>(RValue<Short8> lhs, unsigned char rhs)
3093 {
3094 RR_DEBUG_INFO_UPDATE_LOC();
3095 if(emulateIntrinsics)
3096 {
3097 Short8 result;
3098 result = Insert(result, Extract(lhs, 0) >> Short(rhs), 0);
3099 result = Insert(result, Extract(lhs, 1) >> Short(rhs), 1);
3100 result = Insert(result, Extract(lhs, 2) >> Short(rhs), 2);
3101 result = Insert(result, Extract(lhs, 3) >> Short(rhs), 3);
3102 result = Insert(result, Extract(lhs, 4) >> Short(rhs), 4);
3103 result = Insert(result, Extract(lhs, 5) >> Short(rhs), 5);
3104 result = Insert(result, Extract(lhs, 6) >> Short(rhs), 6);
3105 result = Insert(result, Extract(lhs, 7) >> Short(rhs), 7);
3106
3107 return result;
3108 }
3109 else
3110 {
3111 return RValue<Short8>(Nucleus::createAShr(lhs.value(), V(::context->getConstantInt32(rhs))));
3112 }
3113 }
3114
MulAdd(RValue<Short8> x,RValue<Short8> y)3115 RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y)
3116 {
3117 RR_DEBUG_INFO_UPDATE_LOC();
3118 UNIMPLEMENTED_NO_BUG("RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y)");
3119 return Int4(0);
3120 }
3121
MulHigh(RValue<Short8> x,RValue<Short8> y)3122 RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y)
3123 {
3124 RR_DEBUG_INFO_UPDATE_LOC();
3125 UNIMPLEMENTED_NO_BUG("RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y)");
3126 return Short8(0);
3127 }
3128
type()3129 Type *Short8::type()
3130 {
3131 return T(Ice::IceType_v8i16);
3132 }
3133
Extract(RValue<UShort8> val,int i)3134 RValue<UShort> Extract(RValue<UShort8> val, int i)
3135 {
3136 RR_DEBUG_INFO_UPDATE_LOC();
3137 return RValue<UShort>(Nucleus::createExtractElement(val.value(), UShort::type(), i));
3138 }
3139
Insert(RValue<UShort8> val,RValue<UShort> element,int i)3140 RValue<UShort8> Insert(RValue<UShort8> val, RValue<UShort> element, int i)
3141 {
3142 RR_DEBUG_INFO_UPDATE_LOC();
3143 return RValue<UShort8>(Nucleus::createInsertElement(val.value(), element.value(), i));
3144 }
3145
operator <<(RValue<UShort8> lhs,unsigned char rhs)3146 RValue<UShort8> operator<<(RValue<UShort8> lhs, unsigned char rhs)
3147 {
3148 RR_DEBUG_INFO_UPDATE_LOC();
3149 if(emulateIntrinsics)
3150 {
3151 UShort8 result;
3152 result = Insert(result, Extract(lhs, 0) << UShort(rhs), 0);
3153 result = Insert(result, Extract(lhs, 1) << UShort(rhs), 1);
3154 result = Insert(result, Extract(lhs, 2) << UShort(rhs), 2);
3155 result = Insert(result, Extract(lhs, 3) << UShort(rhs), 3);
3156 result = Insert(result, Extract(lhs, 4) << UShort(rhs), 4);
3157 result = Insert(result, Extract(lhs, 5) << UShort(rhs), 5);
3158 result = Insert(result, Extract(lhs, 6) << UShort(rhs), 6);
3159 result = Insert(result, Extract(lhs, 7) << UShort(rhs), 7);
3160
3161 return result;
3162 }
3163 else
3164 {
3165 return RValue<UShort8>(Nucleus::createShl(lhs.value(), V(::context->getConstantInt32(rhs))));
3166 }
3167 }
3168
operator >>(RValue<UShort8> lhs,unsigned char rhs)3169 RValue<UShort8> operator>>(RValue<UShort8> lhs, unsigned char rhs)
3170 {
3171 RR_DEBUG_INFO_UPDATE_LOC();
3172 if(emulateIntrinsics)
3173 {
3174 UShort8 result;
3175 result = Insert(result, Extract(lhs, 0) >> UShort(rhs), 0);
3176 result = Insert(result, Extract(lhs, 1) >> UShort(rhs), 1);
3177 result = Insert(result, Extract(lhs, 2) >> UShort(rhs), 2);
3178 result = Insert(result, Extract(lhs, 3) >> UShort(rhs), 3);
3179 result = Insert(result, Extract(lhs, 4) >> UShort(rhs), 4);
3180 result = Insert(result, Extract(lhs, 5) >> UShort(rhs), 5);
3181 result = Insert(result, Extract(lhs, 6) >> UShort(rhs), 6);
3182 result = Insert(result, Extract(lhs, 7) >> UShort(rhs), 7);
3183
3184 return result;
3185 }
3186 else
3187 {
3188 return RValue<UShort8>(Nucleus::createLShr(lhs.value(), V(::context->getConstantInt32(rhs))));
3189 }
3190 }
3191
MulHigh(RValue<UShort8> x,RValue<UShort8> y)3192 RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)
3193 {
3194 RR_DEBUG_INFO_UPDATE_LOC();
3195 UNIMPLEMENTED_NO_BUG("RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)");
3196 return UShort8(0);
3197 }
3198
type()3199 Type *UShort8::type()
3200 {
3201 return T(Ice::IceType_v8i16);
3202 }
3203
operator ++(Int & val,int)3204 RValue<Int> operator++(Int &val, int) // Post-increment
3205 {
3206 RR_DEBUG_INFO_UPDATE_LOC();
3207 RValue<Int> res = val;
3208 val += 1;
3209 return res;
3210 }
3211
operator ++(Int & val)3212 const Int &operator++(Int &val) // Pre-increment
3213 {
3214 RR_DEBUG_INFO_UPDATE_LOC();
3215 val += 1;
3216 return val;
3217 }
3218
operator --(Int & val,int)3219 RValue<Int> operator--(Int &val, int) // Post-decrement
3220 {
3221 RR_DEBUG_INFO_UPDATE_LOC();
3222 RValue<Int> res = val;
3223 val -= 1;
3224 return res;
3225 }
3226
operator --(Int & val)3227 const Int &operator--(Int &val) // Pre-decrement
3228 {
3229 RR_DEBUG_INFO_UPDATE_LOC();
3230 val -= 1;
3231 return val;
3232 }
3233
RoundInt(RValue<Float> cast)3234 RValue<Int> RoundInt(RValue<Float> cast)
3235 {
3236 RR_DEBUG_INFO_UPDATE_LOC();
3237 if(emulateIntrinsics || CPUID::ARM)
3238 {
3239 // Push the fractional part off the mantissa. Accurate up to +/-2^22.
3240 return Int((cast + Float(0x00C00000)) - Float(0x00C00000));
3241 }
3242 else
3243 {
3244 Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
3245 const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Nearbyint, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
3246 auto nearbyint = Ice::InstIntrinsic::create(::function, 1, result, intrinsic);
3247 nearbyint->addArg(cast.value());
3248 ::basicBlock->appendInst(nearbyint);
3249
3250 return RValue<Int>(V(result));
3251 }
3252 }
3253
type()3254 Type *Int::type()
3255 {
3256 return T(Ice::IceType_i32);
3257 }
3258
type()3259 Type *Long::type()
3260 {
3261 return T(Ice::IceType_i64);
3262 }
3263
UInt(RValue<Float> cast)3264 UInt::UInt(RValue<Float> cast)
3265 {
3266 RR_DEBUG_INFO_UPDATE_LOC();
3267 // Smallest positive value representable in UInt, but not in Int
3268 const unsigned int ustart = 0x80000000u;
3269 const float ustartf = float(ustart);
3270
3271 // If the value is negative, store 0, otherwise store the result of the conversion
3272 storeValue((~(As<Int>(cast) >> 31) &
3273 // Check if the value can be represented as an Int
3274 IfThenElse(cast >= ustartf,
3275 // If the value is too large, subtract ustart and re-add it after conversion.
3276 As<Int>(As<UInt>(Int(cast - Float(ustartf))) + UInt(ustart)),
3277 // Otherwise, just convert normally
3278 Int(cast)))
3279 .value());
3280 }
3281
operator ++(UInt & val,int)3282 RValue<UInt> operator++(UInt &val, int) // Post-increment
3283 {
3284 RR_DEBUG_INFO_UPDATE_LOC();
3285 RValue<UInt> res = val;
3286 val += 1;
3287 return res;
3288 }
3289
operator ++(UInt & val)3290 const UInt &operator++(UInt &val) // Pre-increment
3291 {
3292 RR_DEBUG_INFO_UPDATE_LOC();
3293 val += 1;
3294 return val;
3295 }
3296
operator --(UInt & val,int)3297 RValue<UInt> operator--(UInt &val, int) // Post-decrement
3298 {
3299 RR_DEBUG_INFO_UPDATE_LOC();
3300 RValue<UInt> res = val;
3301 val -= 1;
3302 return res;
3303 }
3304
operator --(UInt & val)3305 const UInt &operator--(UInt &val) // Pre-decrement
3306 {
3307 RR_DEBUG_INFO_UPDATE_LOC();
3308 val -= 1;
3309 return val;
3310 }
3311
3312 // RValue<UInt> RoundUInt(RValue<Float> cast)
3313 // {
3314 // ASSERT(false && "UNIMPLEMENTED"); return RValue<UInt>(V(nullptr));
3315 // }
3316
type()3317 Type *UInt::type()
3318 {
3319 return T(Ice::IceType_i32);
3320 }
3321
3322 // Int2::Int2(RValue<Int> cast)
3323 // {
3324 // Value *extend = Nucleus::createZExt(cast.value(), Long::type());
3325 // Value *vector = Nucleus::createBitCast(extend, Int2::type());
3326 //
3327 // Constant *shuffle[2];
3328 // shuffle[0] = Nucleus::createConstantInt(0);
3329 // shuffle[1] = Nucleus::createConstantInt(0);
3330 //
3331 // Value *replicate = Nucleus::createShuffleVector(vector, UndefValue::get(Int2::type()), Nucleus::createConstantVector(shuffle, 2));
3332 //
3333 // storeValue(replicate);
3334 // }
3335
operator <<(RValue<Int2> lhs,unsigned char rhs)3336 RValue<Int2> operator<<(RValue<Int2> lhs, unsigned char rhs)
3337 {
3338 RR_DEBUG_INFO_UPDATE_LOC();
3339 if(emulateIntrinsics)
3340 {
3341 Int2 result;
3342 result = Insert(result, Extract(lhs, 0) << Int(rhs), 0);
3343 result = Insert(result, Extract(lhs, 1) << Int(rhs), 1);
3344
3345 return result;
3346 }
3347 else
3348 {
3349 return RValue<Int2>(Nucleus::createShl(lhs.value(), V(::context->getConstantInt32(rhs))));
3350 }
3351 }
3352
operator >>(RValue<Int2> lhs,unsigned char rhs)3353 RValue<Int2> operator>>(RValue<Int2> lhs, unsigned char rhs)
3354 {
3355 RR_DEBUG_INFO_UPDATE_LOC();
3356 if(emulateIntrinsics)
3357 {
3358 Int2 result;
3359 result = Insert(result, Extract(lhs, 0) >> Int(rhs), 0);
3360 result = Insert(result, Extract(lhs, 1) >> Int(rhs), 1);
3361
3362 return result;
3363 }
3364 else
3365 {
3366 return RValue<Int2>(Nucleus::createAShr(lhs.value(), V(::context->getConstantInt32(rhs))));
3367 }
3368 }
3369
type()3370 Type *Int2::type()
3371 {
3372 return T(Type_v2i32);
3373 }
3374
operator <<(RValue<UInt2> lhs,unsigned char rhs)3375 RValue<UInt2> operator<<(RValue<UInt2> lhs, unsigned char rhs)
3376 {
3377 RR_DEBUG_INFO_UPDATE_LOC();
3378 if(emulateIntrinsics)
3379 {
3380 UInt2 result;
3381 result = Insert(result, Extract(lhs, 0) << UInt(rhs), 0);
3382 result = Insert(result, Extract(lhs, 1) << UInt(rhs), 1);
3383
3384 return result;
3385 }
3386 else
3387 {
3388 return RValue<UInt2>(Nucleus::createShl(lhs.value(), V(::context->getConstantInt32(rhs))));
3389 }
3390 }
3391
operator >>(RValue<UInt2> lhs,unsigned char rhs)3392 RValue<UInt2> operator>>(RValue<UInt2> lhs, unsigned char rhs)
3393 {
3394 RR_DEBUG_INFO_UPDATE_LOC();
3395 if(emulateIntrinsics)
3396 {
3397 UInt2 result;
3398 result = Insert(result, Extract(lhs, 0) >> UInt(rhs), 0);
3399 result = Insert(result, Extract(lhs, 1) >> UInt(rhs), 1);
3400
3401 return result;
3402 }
3403 else
3404 {
3405 return RValue<UInt2>(Nucleus::createLShr(lhs.value(), V(::context->getConstantInt32(rhs))));
3406 }
3407 }
3408
type()3409 Type *UInt2::type()
3410 {
3411 return T(Type_v2i32);
3412 }
3413
Int4(RValue<Byte4> cast)3414 Int4::Int4(RValue<Byte4> cast)
3415 : XYZW(this)
3416 {
3417 RR_DEBUG_INFO_UPDATE_LOC();
3418 Value *x = Nucleus::createBitCast(cast.value(), Int::type());
3419 Value *a = Nucleus::createInsertElement(loadValue(), x, 0);
3420
3421 Value *e;
3422 int swizzle[16] = { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 };
3423 Value *b = Nucleus::createBitCast(a, Byte16::type());
3424 Value *c = Nucleus::createShuffleVector(b, Nucleus::createNullValue(Byte16::type()), swizzle);
3425
3426 int swizzle2[8] = { 0, 8, 1, 9, 2, 10, 3, 11 };
3427 Value *d = Nucleus::createBitCast(c, Short8::type());
3428 e = Nucleus::createShuffleVector(d, Nucleus::createNullValue(Short8::type()), swizzle2);
3429
3430 Value *f = Nucleus::createBitCast(e, Int4::type());
3431 storeValue(f);
3432 }
3433
Int4(RValue<SByte4> cast)3434 Int4::Int4(RValue<SByte4> cast)
3435 : XYZW(this)
3436 {
3437 RR_DEBUG_INFO_UPDATE_LOC();
3438 Value *x = Nucleus::createBitCast(cast.value(), Int::type());
3439 Value *a = Nucleus::createInsertElement(loadValue(), x, 0);
3440
3441 int swizzle[16] = { 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7 };
3442 Value *b = Nucleus::createBitCast(a, Byte16::type());
3443 Value *c = Nucleus::createShuffleVector(b, b, swizzle);
3444
3445 int swizzle2[8] = { 0, 0, 1, 1, 2, 2, 3, 3 };
3446 Value *d = Nucleus::createBitCast(c, Short8::type());
3447 Value *e = Nucleus::createShuffleVector(d, d, swizzle2);
3448
3449 *this = As<Int4>(e) >> 24;
3450 }
3451
Int4(RValue<Short4> cast)3452 Int4::Int4(RValue<Short4> cast)
3453 : XYZW(this)
3454 {
3455 RR_DEBUG_INFO_UPDATE_LOC();
3456 int swizzle[8] = { 0, 0, 1, 1, 2, 2, 3, 3 };
3457 Value *c = Nucleus::createShuffleVector(cast.value(), cast.value(), swizzle);
3458
3459 *this = As<Int4>(c) >> 16;
3460 }
3461
Int4(RValue<UShort4> cast)3462 Int4::Int4(RValue<UShort4> cast)
3463 : XYZW(this)
3464 {
3465 RR_DEBUG_INFO_UPDATE_LOC();
3466 int swizzle[8] = { 0, 8, 1, 9, 2, 10, 3, 11 };
3467 Value *c = Nucleus::createShuffleVector(cast.value(), Short8(0, 0, 0, 0, 0, 0, 0, 0).loadValue(), swizzle);
3468 Value *d = Nucleus::createBitCast(c, Int4::type());
3469 storeValue(d);
3470 }
3471
Int4(RValue<Int> rhs)3472 Int4::Int4(RValue<Int> rhs)
3473 : XYZW(this)
3474 {
3475 RR_DEBUG_INFO_UPDATE_LOC();
3476 Value *vector = Nucleus::createBitCast(rhs.value(), Int4::type());
3477
3478 int swizzle[4] = { 0, 0, 0, 0 };
3479 Value *replicate = Nucleus::createShuffleVector(vector, vector, swizzle);
3480
3481 storeValue(replicate);
3482 }
3483
operator <<(RValue<Int4> lhs,unsigned char rhs)3484 RValue<Int4> operator<<(RValue<Int4> lhs, unsigned char rhs)
3485 {
3486 RR_DEBUG_INFO_UPDATE_LOC();
3487 if(emulateIntrinsics)
3488 {
3489 Int4 result;
3490 result = Insert(result, Extract(lhs, 0) << Int(rhs), 0);
3491 result = Insert(result, Extract(lhs, 1) << Int(rhs), 1);
3492 result = Insert(result, Extract(lhs, 2) << Int(rhs), 2);
3493 result = Insert(result, Extract(lhs, 3) << Int(rhs), 3);
3494
3495 return result;
3496 }
3497 else
3498 {
3499 return RValue<Int4>(Nucleus::createShl(lhs.value(), V(::context->getConstantInt32(rhs))));
3500 }
3501 }
3502
operator >>(RValue<Int4> lhs,unsigned char rhs)3503 RValue<Int4> operator>>(RValue<Int4> lhs, unsigned char rhs)
3504 {
3505 RR_DEBUG_INFO_UPDATE_LOC();
3506 if(emulateIntrinsics)
3507 {
3508 Int4 result;
3509 result = Insert(result, Extract(lhs, 0) >> Int(rhs), 0);
3510 result = Insert(result, Extract(lhs, 1) >> Int(rhs), 1);
3511 result = Insert(result, Extract(lhs, 2) >> Int(rhs), 2);
3512 result = Insert(result, Extract(lhs, 3) >> Int(rhs), 3);
3513
3514 return result;
3515 }
3516 else
3517 {
3518 return RValue<Int4>(Nucleus::createAShr(lhs.value(), V(::context->getConstantInt32(rhs))));
3519 }
3520 }
3521
CmpEQ(RValue<Int4> x,RValue<Int4> y)3522 RValue<Int4> CmpEQ(RValue<Int4> x, RValue<Int4> y)
3523 {
3524 RR_DEBUG_INFO_UPDATE_LOC();
3525 return RValue<Int4>(Nucleus::createICmpEQ(x.value(), y.value()));
3526 }
3527
CmpLT(RValue<Int4> x,RValue<Int4> y)3528 RValue<Int4> CmpLT(RValue<Int4> x, RValue<Int4> y)
3529 {
3530 RR_DEBUG_INFO_UPDATE_LOC();
3531 return RValue<Int4>(Nucleus::createICmpSLT(x.value(), y.value()));
3532 }
3533
CmpLE(RValue<Int4> x,RValue<Int4> y)3534 RValue<Int4> CmpLE(RValue<Int4> x, RValue<Int4> y)
3535 {
3536 RR_DEBUG_INFO_UPDATE_LOC();
3537 return RValue<Int4>(Nucleus::createICmpSLE(x.value(), y.value()));
3538 }
3539
CmpNEQ(RValue<Int4> x,RValue<Int4> y)3540 RValue<Int4> CmpNEQ(RValue<Int4> x, RValue<Int4> y)
3541 {
3542 RR_DEBUG_INFO_UPDATE_LOC();
3543 return RValue<Int4>(Nucleus::createICmpNE(x.value(), y.value()));
3544 }
3545
CmpNLT(RValue<Int4> x,RValue<Int4> y)3546 RValue<Int4> CmpNLT(RValue<Int4> x, RValue<Int4> y)
3547 {
3548 RR_DEBUG_INFO_UPDATE_LOC();
3549 return RValue<Int4>(Nucleus::createICmpSGE(x.value(), y.value()));
3550 }
3551
CmpNLE(RValue<Int4> x,RValue<Int4> y)3552 RValue<Int4> CmpNLE(RValue<Int4> x, RValue<Int4> y)
3553 {
3554 RR_DEBUG_INFO_UPDATE_LOC();
3555 return RValue<Int4>(Nucleus::createICmpSGT(x.value(), y.value()));
3556 }
3557
Max(RValue<Int4> x,RValue<Int4> y)3558 RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y)
3559 {
3560 RR_DEBUG_INFO_UPDATE_LOC();
3561 Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
3562 auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Sle, condition, x.value(), y.value());
3563 ::basicBlock->appendInst(cmp);
3564
3565 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
3566 auto select = Ice::InstSelect::create(::function, result, condition, y.value(), x.value());
3567 ::basicBlock->appendInst(select);
3568
3569 return RValue<Int4>(V(result));
3570 }
3571
Min(RValue<Int4> x,RValue<Int4> y)3572 RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y)
3573 {
3574 RR_DEBUG_INFO_UPDATE_LOC();
3575 Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
3576 auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Sgt, condition, x.value(), y.value());
3577 ::basicBlock->appendInst(cmp);
3578
3579 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
3580 auto select = Ice::InstSelect::create(::function, result, condition, y.value(), x.value());
3581 ::basicBlock->appendInst(select);
3582
3583 return RValue<Int4>(V(result));
3584 }
3585
RoundInt(RValue<Float4> cast)3586 RValue<Int4> RoundInt(RValue<Float4> cast)
3587 {
3588 RR_DEBUG_INFO_UPDATE_LOC();
3589 if(emulateIntrinsics || CPUID::ARM)
3590 {
3591 // Push the fractional part off the mantissa. Accurate up to +/-2^22.
3592 return Int4((cast + Float4(0x00C00000)) - Float4(0x00C00000));
3593 }
3594 else
3595 {
3596 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
3597 const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Nearbyint, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
3598 auto nearbyint = Ice::InstIntrinsic::create(::function, 1, result, intrinsic);
3599 nearbyint->addArg(cast.value());
3600 ::basicBlock->appendInst(nearbyint);
3601
3602 return RValue<Int4>(V(result));
3603 }
3604 }
3605
RoundIntClamped(RValue<Float4> cast)3606 RValue<Int4> RoundIntClamped(RValue<Float4> cast)
3607 {
3608 RR_DEBUG_INFO_UPDATE_LOC();
3609
3610 // cvtps2dq produces 0x80000000, a negative value, for input larger than
3611 // 2147483520.0, so clamp to 2147483520. Values less than -2147483520.0
3612 // saturate to 0x80000000.
3613 RValue<Float4> clamped = Min(cast, Float4(0x7FFFFF80));
3614
3615 if(emulateIntrinsics || CPUID::ARM)
3616 {
3617 // Push the fractional part off the mantissa. Accurate up to +/-2^22.
3618 return Int4((clamped + Float4(0x00C00000)) - Float4(0x00C00000));
3619 }
3620 else
3621 {
3622 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
3623 const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Nearbyint, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
3624 auto nearbyint = Ice::InstIntrinsic::create(::function, 1, result, intrinsic);
3625 nearbyint->addArg(clamped.value());
3626 ::basicBlock->appendInst(nearbyint);
3627
3628 return RValue<Int4>(V(result));
3629 }
3630 }
3631
PackSigned(RValue<Int4> x,RValue<Int4> y)3632 RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y)
3633 {
3634 RR_DEBUG_INFO_UPDATE_LOC();
3635 if(emulateIntrinsics)
3636 {
3637 Short8 result;
3638 result = Insert(result, SaturateSigned(Extract(x, 0)), 0);
3639 result = Insert(result, SaturateSigned(Extract(x, 1)), 1);
3640 result = Insert(result, SaturateSigned(Extract(x, 2)), 2);
3641 result = Insert(result, SaturateSigned(Extract(x, 3)), 3);
3642 result = Insert(result, SaturateSigned(Extract(y, 0)), 4);
3643 result = Insert(result, SaturateSigned(Extract(y, 1)), 5);
3644 result = Insert(result, SaturateSigned(Extract(y, 2)), 6);
3645 result = Insert(result, SaturateSigned(Extract(y, 3)), 7);
3646
3647 return result;
3648 }
3649 else
3650 {
3651 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
3652 const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::VectorPackSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
3653 auto pack = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
3654 pack->addArg(x.value());
3655 pack->addArg(y.value());
3656 ::basicBlock->appendInst(pack);
3657
3658 return RValue<Short8>(V(result));
3659 }
3660 }
3661
PackUnsigned(RValue<Int4> x,RValue<Int4> y)3662 RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y)
3663 {
3664 RR_DEBUG_INFO_UPDATE_LOC();
3665 if(emulateIntrinsics || !(CPUID::SSE4_1 || CPUID::ARM))
3666 {
3667 RValue<Int4> sx = As<Int4>(x);
3668 RValue<Int4> bx = (sx & ~(sx >> 31)) - Int4(0x8000);
3669
3670 RValue<Int4> sy = As<Int4>(y);
3671 RValue<Int4> by = (sy & ~(sy >> 31)) - Int4(0x8000);
3672
3673 return As<UShort8>(PackSigned(bx, by) + Short8(0x8000u));
3674 }
3675 else
3676 {
3677 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
3678 const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::VectorPackUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
3679 auto pack = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
3680 pack->addArg(x.value());
3681 pack->addArg(y.value());
3682 ::basicBlock->appendInst(pack);
3683
3684 return RValue<UShort8>(V(result));
3685 }
3686 }
3687
SignMask(RValue<Int4> x)3688 RValue<Int> SignMask(RValue<Int4> x)
3689 {
3690 RR_DEBUG_INFO_UPDATE_LOC();
3691 if(emulateIntrinsics || CPUID::ARM)
3692 {
3693 Int4 xx = (x >> 31) & Int4(0x00000001, 0x00000002, 0x00000004, 0x00000008);
3694 return Extract(xx, 0) | Extract(xx, 1) | Extract(xx, 2) | Extract(xx, 3);
3695 }
3696 else
3697 {
3698 Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
3699 const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::SignMask, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
3700 auto movmsk = Ice::InstIntrinsic::create(::function, 1, result, intrinsic);
3701 movmsk->addArg(x.value());
3702 ::basicBlock->appendInst(movmsk);
3703
3704 return RValue<Int>(V(result));
3705 }
3706 }
3707
type()3708 Type *Int4::type()
3709 {
3710 return T(Ice::IceType_v4i32);
3711 }
3712
UInt4(RValue<Float4> cast)3713 UInt4::UInt4(RValue<Float4> cast)
3714 : XYZW(this)
3715 {
3716 RR_DEBUG_INFO_UPDATE_LOC();
3717 // Smallest positive value representable in UInt, but not in Int
3718 const unsigned int ustart = 0x80000000u;
3719 const float ustartf = float(ustart);
3720
3721 // Check if the value can be represented as an Int
3722 Int4 uiValue = CmpNLT(cast, Float4(ustartf));
3723 // If the value is too large, subtract ustart and re-add it after conversion.
3724 uiValue = (uiValue & As<Int4>(As<UInt4>(Int4(cast - Float4(ustartf))) + UInt4(ustart))) |
3725 // Otherwise, just convert normally
3726 (~uiValue & Int4(cast));
3727 // If the value is negative, store 0, otherwise store the result of the conversion
3728 storeValue((~(As<Int4>(cast) >> 31) & uiValue).value());
3729 }
3730
UInt4(RValue<UInt> rhs)3731 UInt4::UInt4(RValue<UInt> rhs)
3732 : XYZW(this)
3733 {
3734 RR_DEBUG_INFO_UPDATE_LOC();
3735 Value *vector = Nucleus::createBitCast(rhs.value(), UInt4::type());
3736
3737 int swizzle[4] = { 0, 0, 0, 0 };
3738 Value *replicate = Nucleus::createShuffleVector(vector, vector, swizzle);
3739
3740 storeValue(replicate);
3741 }
3742
operator <<(RValue<UInt4> lhs,unsigned char rhs)3743 RValue<UInt4> operator<<(RValue<UInt4> lhs, unsigned char rhs)
3744 {
3745 RR_DEBUG_INFO_UPDATE_LOC();
3746 if(emulateIntrinsics)
3747 {
3748 UInt4 result;
3749 result = Insert(result, Extract(lhs, 0) << UInt(rhs), 0);
3750 result = Insert(result, Extract(lhs, 1) << UInt(rhs), 1);
3751 result = Insert(result, Extract(lhs, 2) << UInt(rhs), 2);
3752 result = Insert(result, Extract(lhs, 3) << UInt(rhs), 3);
3753
3754 return result;
3755 }
3756 else
3757 {
3758 return RValue<UInt4>(Nucleus::createShl(lhs.value(), V(::context->getConstantInt32(rhs))));
3759 }
3760 }
3761
operator >>(RValue<UInt4> lhs,unsigned char rhs)3762 RValue<UInt4> operator>>(RValue<UInt4> lhs, unsigned char rhs)
3763 {
3764 RR_DEBUG_INFO_UPDATE_LOC();
3765 if(emulateIntrinsics)
3766 {
3767 UInt4 result;
3768 result = Insert(result, Extract(lhs, 0) >> UInt(rhs), 0);
3769 result = Insert(result, Extract(lhs, 1) >> UInt(rhs), 1);
3770 result = Insert(result, Extract(lhs, 2) >> UInt(rhs), 2);
3771 result = Insert(result, Extract(lhs, 3) >> UInt(rhs), 3);
3772
3773 return result;
3774 }
3775 else
3776 {
3777 return RValue<UInt4>(Nucleus::createLShr(lhs.value(), V(::context->getConstantInt32(rhs))));
3778 }
3779 }
3780
CmpEQ(RValue<UInt4> x,RValue<UInt4> y)3781 RValue<UInt4> CmpEQ(RValue<UInt4> x, RValue<UInt4> y)
3782 {
3783 RR_DEBUG_INFO_UPDATE_LOC();
3784 return RValue<UInt4>(Nucleus::createICmpEQ(x.value(), y.value()));
3785 }
3786
CmpLT(RValue<UInt4> x,RValue<UInt4> y)3787 RValue<UInt4> CmpLT(RValue<UInt4> x, RValue<UInt4> y)
3788 {
3789 RR_DEBUG_INFO_UPDATE_LOC();
3790 return RValue<UInt4>(Nucleus::createICmpULT(x.value(), y.value()));
3791 }
3792
CmpLE(RValue<UInt4> x,RValue<UInt4> y)3793 RValue<UInt4> CmpLE(RValue<UInt4> x, RValue<UInt4> y)
3794 {
3795 RR_DEBUG_INFO_UPDATE_LOC();
3796 return RValue<UInt4>(Nucleus::createICmpULE(x.value(), y.value()));
3797 }
3798
CmpNEQ(RValue<UInt4> x,RValue<UInt4> y)3799 RValue<UInt4> CmpNEQ(RValue<UInt4> x, RValue<UInt4> y)
3800 {
3801 RR_DEBUG_INFO_UPDATE_LOC();
3802 return RValue<UInt4>(Nucleus::createICmpNE(x.value(), y.value()));
3803 }
3804
CmpNLT(RValue<UInt4> x,RValue<UInt4> y)3805 RValue<UInt4> CmpNLT(RValue<UInt4> x, RValue<UInt4> y)
3806 {
3807 RR_DEBUG_INFO_UPDATE_LOC();
3808 return RValue<UInt4>(Nucleus::createICmpUGE(x.value(), y.value()));
3809 }
3810
CmpNLE(RValue<UInt4> x,RValue<UInt4> y)3811 RValue<UInt4> CmpNLE(RValue<UInt4> x, RValue<UInt4> y)
3812 {
3813 RR_DEBUG_INFO_UPDATE_LOC();
3814 return RValue<UInt4>(Nucleus::createICmpUGT(x.value(), y.value()));
3815 }
3816
Max(RValue<UInt4> x,RValue<UInt4> y)3817 RValue<UInt4> Max(RValue<UInt4> x, RValue<UInt4> y)
3818 {
3819 RR_DEBUG_INFO_UPDATE_LOC();
3820 Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
3821 auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Ule, condition, x.value(), y.value());
3822 ::basicBlock->appendInst(cmp);
3823
3824 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
3825 auto select = Ice::InstSelect::create(::function, result, condition, y.value(), x.value());
3826 ::basicBlock->appendInst(select);
3827
3828 return RValue<UInt4>(V(result));
3829 }
3830
Min(RValue<UInt4> x,RValue<UInt4> y)3831 RValue<UInt4> Min(RValue<UInt4> x, RValue<UInt4> y)
3832 {
3833 RR_DEBUG_INFO_UPDATE_LOC();
3834 Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
3835 auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Ugt, condition, x.value(), y.value());
3836 ::basicBlock->appendInst(cmp);
3837
3838 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
3839 auto select = Ice::InstSelect::create(::function, result, condition, y.value(), x.value());
3840 ::basicBlock->appendInst(select);
3841
3842 return RValue<UInt4>(V(result));
3843 }
3844
type()3845 Type *UInt4::type()
3846 {
3847 return T(Ice::IceType_v4i32);
3848 }
3849
type()3850 Type *Half::type()
3851 {
3852 return T(Ice::IceType_i16);
3853 }
3854
Rcp_pp(RValue<Float> x,bool exactAtPow2)3855 RValue<Float> Rcp_pp(RValue<Float> x, bool exactAtPow2)
3856 {
3857 RR_DEBUG_INFO_UPDATE_LOC();
3858 return 1.0f / x;
3859 }
3860
RcpSqrt_pp(RValue<Float> x)3861 RValue<Float> RcpSqrt_pp(RValue<Float> x)
3862 {
3863 RR_DEBUG_INFO_UPDATE_LOC();
3864 return Rcp_pp(Sqrt(x));
3865 }
3866
Sqrt(RValue<Float> x)3867 RValue<Float> Sqrt(RValue<Float> x)
3868 {
3869 RR_DEBUG_INFO_UPDATE_LOC();
3870 Ice::Variable *result = ::function->makeVariable(Ice::IceType_f32);
3871 const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Sqrt, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
3872 auto sqrt = Ice::InstIntrinsic::create(::function, 1, result, intrinsic);
3873 sqrt->addArg(x.value());
3874 ::basicBlock->appendInst(sqrt);
3875
3876 return RValue<Float>(V(result));
3877 }
3878
Round(RValue<Float> x)3879 RValue<Float> Round(RValue<Float> x)
3880 {
3881 RR_DEBUG_INFO_UPDATE_LOC();
3882 return Float4(Round(Float4(x))).x;
3883 }
3884
Trunc(RValue<Float> x)3885 RValue<Float> Trunc(RValue<Float> x)
3886 {
3887 RR_DEBUG_INFO_UPDATE_LOC();
3888 return Float4(Trunc(Float4(x))).x;
3889 }
3890
Frac(RValue<Float> x)3891 RValue<Float> Frac(RValue<Float> x)
3892 {
3893 RR_DEBUG_INFO_UPDATE_LOC();
3894 return Float4(Frac(Float4(x))).x;
3895 }
3896
Floor(RValue<Float> x)3897 RValue<Float> Floor(RValue<Float> x)
3898 {
3899 RR_DEBUG_INFO_UPDATE_LOC();
3900 return Float4(Floor(Float4(x))).x;
3901 }
3902
Ceil(RValue<Float> x)3903 RValue<Float> Ceil(RValue<Float> x)
3904 {
3905 RR_DEBUG_INFO_UPDATE_LOC();
3906 return Float4(Ceil(Float4(x))).x;
3907 }
3908
type()3909 Type *Float::type()
3910 {
3911 return T(Ice::IceType_f32);
3912 }
3913
type()3914 Type *Float2::type()
3915 {
3916 return T(Type_v2f32);
3917 }
3918
Float4(RValue<Float> rhs)3919 Float4::Float4(RValue<Float> rhs)
3920 : XYZW(this)
3921 {
3922 RR_DEBUG_INFO_UPDATE_LOC();
3923 Value *vector = Nucleus::createBitCast(rhs.value(), Float4::type());
3924
3925 int swizzle[4] = { 0, 0, 0, 0 };
3926 Value *replicate = Nucleus::createShuffleVector(vector, vector, swizzle);
3927
3928 storeValue(replicate);
3929 }
3930
Max(RValue<Float4> x,RValue<Float4> y)3931 RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y)
3932 {
3933 RR_DEBUG_INFO_UPDATE_LOC();
3934 Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
3935 auto cmp = Ice::InstFcmp::create(::function, Ice::InstFcmp::Ogt, condition, x.value(), y.value());
3936 ::basicBlock->appendInst(cmp);
3937
3938 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
3939 auto select = Ice::InstSelect::create(::function, result, condition, x.value(), y.value());
3940 ::basicBlock->appendInst(select);
3941
3942 return RValue<Float4>(V(result));
3943 }
3944
Min(RValue<Float4> x,RValue<Float4> y)3945 RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y)
3946 {
3947 RR_DEBUG_INFO_UPDATE_LOC();
3948 Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
3949 auto cmp = Ice::InstFcmp::create(::function, Ice::InstFcmp::Olt, condition, x.value(), y.value());
3950 ::basicBlock->appendInst(cmp);
3951
3952 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
3953 auto select = Ice::InstSelect::create(::function, result, condition, x.value(), y.value());
3954 ::basicBlock->appendInst(select);
3955
3956 return RValue<Float4>(V(result));
3957 }
3958
Rcp_pp(RValue<Float4> x,bool exactAtPow2)3959 RValue<Float4> Rcp_pp(RValue<Float4> x, bool exactAtPow2)
3960 {
3961 RR_DEBUG_INFO_UPDATE_LOC();
3962 return Float4(1.0f) / x;
3963 }
3964
RcpSqrt_pp(RValue<Float4> x)3965 RValue<Float4> RcpSqrt_pp(RValue<Float4> x)
3966 {
3967 RR_DEBUG_INFO_UPDATE_LOC();
3968 return Rcp_pp(Sqrt(x));
3969 }
3970
// Reports whether RcpApprox() is available in this backend. Currently it is not.
bool HasRcpApprox()
{
	// TODO(b/175612820): Update once we implement x86 SSE rcp_ss and rsqrt_ss intrinsics in Subzero
	const bool supported = false;
	return supported;
}
3976
RcpApprox(RValue<Float4> x,bool exactAtPow2)3977 RValue<Float4> RcpApprox(RValue<Float4> x, bool exactAtPow2)
3978 {
3979 // TODO(b/175612820): Update once we implement x86 SSE rcp_ss and rsqrt_ss intrinsics in Subzero
3980 UNREACHABLE("RValue<Float4> RcpApprox()");
3981 return { 0.0f };
3982 }
3983
RcpApprox(RValue<Float> x,bool exactAtPow2)3984 RValue<Float> RcpApprox(RValue<Float> x, bool exactAtPow2)
3985 {
3986 // TODO(b/175612820): Update once we implement x86 SSE rcp_ss and rsqrt_ss intrinsics in Subzero
3987 UNREACHABLE("RValue<Float> RcpApprox()");
3988 return { 0.0f };
3989 }
3990
// Reports whether RcpSqrtApprox() is available in this backend. Currently it is not.
bool HasRcpSqrtApprox()
{
	const bool supported = false;
	return supported;
}
3995
RcpSqrtApprox(RValue<Float4> x)3996 RValue<Float4> RcpSqrtApprox(RValue<Float4> x)
3997 {
3998 // TODO(b/175612820): Update once we implement x86 SSE rcp_ss and rsqrt_ss intrinsics in Subzero
3999 UNREACHABLE("RValue<Float4> RcpSqrtApprox()");
4000 return { 0.0f };
4001 }
4002
RcpSqrtApprox(RValue<Float> x)4003 RValue<Float> RcpSqrtApprox(RValue<Float> x)
4004 {
4005 // TODO(b/175612820): Update once we implement x86 SSE rcp_ss and rsqrt_ss intrinsics in Subzero
4006 UNREACHABLE("RValue<Float> RcpSqrtApprox()");
4007 return { 0.0f };
4008 }
4009
Sqrt(RValue<Float4> x)4010 RValue<Float4> Sqrt(RValue<Float4> x)
4011 {
4012 RR_DEBUG_INFO_UPDATE_LOC();
4013 if(emulateIntrinsics || CPUID::ARM)
4014 {
4015 Float4 result;
4016 result.x = Sqrt(Float(Float4(x).x));
4017 result.y = Sqrt(Float(Float4(x).y));
4018 result.z = Sqrt(Float(Float4(x).z));
4019 result.w = Sqrt(Float(Float4(x).w));
4020
4021 return result;
4022 }
4023 else
4024 {
4025 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
4026 const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Sqrt, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
4027 auto sqrt = Ice::InstIntrinsic::create(::function, 1, result, intrinsic);
4028 sqrt->addArg(x.value());
4029 ::basicBlock->appendInst(sqrt);
4030
4031 return RValue<Float4>(V(result));
4032 }
4033 }
4034
SignMask(RValue<Float4> x)4035 RValue<Int> SignMask(RValue<Float4> x)
4036 {
4037 RR_DEBUG_INFO_UPDATE_LOC();
4038 if(emulateIntrinsics || CPUID::ARM)
4039 {
4040 Int4 xx = (As<Int4>(x) >> 31) & Int4(0x00000001, 0x00000002, 0x00000004, 0x00000008);
4041 return Extract(xx, 0) | Extract(xx, 1) | Extract(xx, 2) | Extract(xx, 3);
4042 }
4043 else
4044 {
4045 Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
4046 const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::SignMask, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
4047 auto movmsk = Ice::InstIntrinsic::create(::function, 1, result, intrinsic);
4048 movmsk->addArg(x.value());
4049 ::basicBlock->appendInst(movmsk);
4050
4051 return RValue<Int>(V(result));
4052 }
4053 }
4054
CmpEQ(RValue<Float4> x,RValue<Float4> y)4055 RValue<Int4> CmpEQ(RValue<Float4> x, RValue<Float4> y)
4056 {
4057 RR_DEBUG_INFO_UPDATE_LOC();
4058 return RValue<Int4>(Nucleus::createFCmpOEQ(x.value(), y.value()));
4059 }
4060
CmpLT(RValue<Float4> x,RValue<Float4> y)4061 RValue<Int4> CmpLT(RValue<Float4> x, RValue<Float4> y)
4062 {
4063 RR_DEBUG_INFO_UPDATE_LOC();
4064 return RValue<Int4>(Nucleus::createFCmpOLT(x.value(), y.value()));
4065 }
4066
CmpLE(RValue<Float4> x,RValue<Float4> y)4067 RValue<Int4> CmpLE(RValue<Float4> x, RValue<Float4> y)
4068 {
4069 RR_DEBUG_INFO_UPDATE_LOC();
4070 return RValue<Int4>(Nucleus::createFCmpOLE(x.value(), y.value()));
4071 }
4072
CmpNEQ(RValue<Float4> x,RValue<Float4> y)4073 RValue<Int4> CmpNEQ(RValue<Float4> x, RValue<Float4> y)
4074 {
4075 RR_DEBUG_INFO_UPDATE_LOC();
4076 return RValue<Int4>(Nucleus::createFCmpONE(x.value(), y.value()));
4077 }
4078
CmpNLT(RValue<Float4> x,RValue<Float4> y)4079 RValue<Int4> CmpNLT(RValue<Float4> x, RValue<Float4> y)
4080 {
4081 RR_DEBUG_INFO_UPDATE_LOC();
4082 return RValue<Int4>(Nucleus::createFCmpOGE(x.value(), y.value()));
4083 }
4084
CmpNLE(RValue<Float4> x,RValue<Float4> y)4085 RValue<Int4> CmpNLE(RValue<Float4> x, RValue<Float4> y)
4086 {
4087 RR_DEBUG_INFO_UPDATE_LOC();
4088 return RValue<Int4>(Nucleus::createFCmpOGT(x.value(), y.value()));
4089 }
4090
CmpUEQ(RValue<Float4> x,RValue<Float4> y)4091 RValue<Int4> CmpUEQ(RValue<Float4> x, RValue<Float4> y)
4092 {
4093 RR_DEBUG_INFO_UPDATE_LOC();
4094 return RValue<Int4>(Nucleus::createFCmpUEQ(x.value(), y.value()));
4095 }
4096
CmpULT(RValue<Float4> x,RValue<Float4> y)4097 RValue<Int4> CmpULT(RValue<Float4> x, RValue<Float4> y)
4098 {
4099 RR_DEBUG_INFO_UPDATE_LOC();
4100 return RValue<Int4>(Nucleus::createFCmpULT(x.value(), y.value()));
4101 }
4102
CmpULE(RValue<Float4> x,RValue<Float4> y)4103 RValue<Int4> CmpULE(RValue<Float4> x, RValue<Float4> y)
4104 {
4105 RR_DEBUG_INFO_UPDATE_LOC();
4106 return RValue<Int4>(Nucleus::createFCmpULE(x.value(), y.value()));
4107 }
4108
CmpUNEQ(RValue<Float4> x,RValue<Float4> y)4109 RValue<Int4> CmpUNEQ(RValue<Float4> x, RValue<Float4> y)
4110 {
4111 RR_DEBUG_INFO_UPDATE_LOC();
4112 return RValue<Int4>(Nucleus::createFCmpUNE(x.value(), y.value()));
4113 }
4114
CmpUNLT(RValue<Float4> x,RValue<Float4> y)4115 RValue<Int4> CmpUNLT(RValue<Float4> x, RValue<Float4> y)
4116 {
4117 RR_DEBUG_INFO_UPDATE_LOC();
4118 return RValue<Int4>(Nucleus::createFCmpUGE(x.value(), y.value()));
4119 }
4120
CmpUNLE(RValue<Float4> x,RValue<Float4> y)4121 RValue<Int4> CmpUNLE(RValue<Float4> x, RValue<Float4> y)
4122 {
4123 RR_DEBUG_INFO_UPDATE_LOC();
4124 return RValue<Int4>(Nucleus::createFCmpUGT(x.value(), y.value()));
4125 }
4126
Round(RValue<Float4> x)4127 RValue<Float4> Round(RValue<Float4> x)
4128 {
4129 RR_DEBUG_INFO_UPDATE_LOC();
4130 if(emulateIntrinsics || CPUID::ARM)
4131 {
4132 // Push the fractional part off the mantissa. Accurate up to +/-2^22.
4133 return (x + Float4(0x00C00000)) - Float4(0x00C00000);
4134 }
4135 else if(CPUID::SSE4_1)
4136 {
4137 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
4138 const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Round, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
4139 auto round = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
4140 round->addArg(x.value());
4141 round->addArg(::context->getConstantInt32(0));
4142 ::basicBlock->appendInst(round);
4143
4144 return RValue<Float4>(V(result));
4145 }
4146 else
4147 {
4148 return Float4(RoundInt(x));
4149 }
4150 }
4151
Trunc(RValue<Float4> x)4152 RValue<Float4> Trunc(RValue<Float4> x)
4153 {
4154 RR_DEBUG_INFO_UPDATE_LOC();
4155 if(CPUID::SSE4_1)
4156 {
4157 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
4158 const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Round, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
4159 auto round = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
4160 round->addArg(x.value());
4161 round->addArg(::context->getConstantInt32(3));
4162 ::basicBlock->appendInst(round);
4163
4164 return RValue<Float4>(V(result));
4165 }
4166 else
4167 {
4168 return Float4(Int4(x));
4169 }
4170 }
4171
Frac(RValue<Float4> x)4172 RValue<Float4> Frac(RValue<Float4> x)
4173 {
4174 RR_DEBUG_INFO_UPDATE_LOC();
4175 Float4 frc;
4176
4177 if(CPUID::SSE4_1)
4178 {
4179 frc = x - Floor(x);
4180 }
4181 else
4182 {
4183 frc = x - Float4(Int4(x)); // Signed fractional part.
4184
4185 frc += As<Float4>(As<Int4>(CmpNLE(Float4(0.0f), frc)) & As<Int4>(Float4(1, 1, 1, 1))); // Add 1.0 if negative.
4186 }
4187
4188 // x - floor(x) can be 1.0 for very small negative x.
4189 // Clamp against the value just below 1.0.
4190 return Min(frc, As<Float4>(Int4(0x3F7FFFFF)));
4191 }
4192
Floor(RValue<Float4> x)4193 RValue<Float4> Floor(RValue<Float4> x)
4194 {
4195 RR_DEBUG_INFO_UPDATE_LOC();
4196 if(CPUID::SSE4_1)
4197 {
4198 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
4199 const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Round, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
4200 auto round = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
4201 round->addArg(x.value());
4202 round->addArg(::context->getConstantInt32(1));
4203 ::basicBlock->appendInst(round);
4204
4205 return RValue<Float4>(V(result));
4206 }
4207 else
4208 {
4209 return x - Frac(x);
4210 }
4211 }
4212
Ceil(RValue<Float4> x)4213 RValue<Float4> Ceil(RValue<Float4> x)
4214 {
4215 RR_DEBUG_INFO_UPDATE_LOC();
4216 if(CPUID::SSE4_1)
4217 {
4218 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
4219 const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Round, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
4220 auto round = Ice::InstIntrinsic::create(::function, 2, result, intrinsic);
4221 round->addArg(x.value());
4222 round->addArg(::context->getConstantInt32(2));
4223 ::basicBlock->appendInst(round);
4224
4225 return RValue<Float4>(V(result));
4226 }
4227 else
4228 {
4229 return -Floor(-x);
4230 }
4231 }
4232
type()4233 Type *Float4::type()
4234 {
4235 return T(Ice::IceType_v4f32);
4236 }
4237
Ticks()4238 RValue<Long> Ticks()
4239 {
4240 RR_DEBUG_INFO_UPDATE_LOC();
4241 UNIMPLEMENTED_NO_BUG("RValue<Long> Ticks()");
4242 return Long(Int(0));
4243 }
4244
ConstantPointer(void const * ptr)4245 RValue<Pointer<Byte>> ConstantPointer(void const *ptr)
4246 {
4247 RR_DEBUG_INFO_UPDATE_LOC();
4248 return RValue<Pointer<Byte>>{ V(sz::getConstantPointer(::context, ptr)) };
4249 }
4250
ConstantData(void const * data,size_t size)4251 RValue<Pointer<Byte>> ConstantData(void const *data, size_t size)
4252 {
4253 RR_DEBUG_INFO_UPDATE_LOC();
4254 return RValue<Pointer<Byte>>{ V(IceConstantData(data, size)) };
4255 }
4256
Call(RValue<Pointer<Byte>> fptr,Type * retTy,std::initializer_list<Value * > args,std::initializer_list<Type * > argTys)4257 Value *Call(RValue<Pointer<Byte>> fptr, Type *retTy, std::initializer_list<Value *> args, std::initializer_list<Type *> argTys)
4258 {
4259 RR_DEBUG_INFO_UPDATE_LOC();
4260 return V(sz::Call(::function, ::basicBlock, T(retTy), V(fptr.value()), V(args), false));
4261 }
4262
Breakpoint()4263 void Breakpoint()
4264 {
4265 RR_DEBUG_INFO_UPDATE_LOC();
4266 const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Trap, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
4267 auto trap = Ice::InstIntrinsic::create(::function, 0, nullptr, intrinsic);
4268 ::basicBlock->appendInst(trap);
4269 }
4270
createFence(std::memory_order memoryOrder)4271 void Nucleus::createFence(std::memory_order memoryOrder)
4272 {
4273 RR_DEBUG_INFO_UPDATE_LOC();
4274 const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::AtomicFence, Ice::Intrinsics::SideEffects_T, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
4275 auto inst = Ice::InstIntrinsic::create(::function, 0, nullptr, intrinsic);
4276 auto order = ::context->getConstantInt32(stdToIceMemoryOrder(memoryOrder));
4277 inst->addArg(order);
4278 ::basicBlock->appendInst(inst);
4279 }
4280
createMaskedLoad(Value * ptr,Type * elTy,Value * mask,unsigned int alignment,bool zeroMaskedLanes)4281 Value *Nucleus::createMaskedLoad(Value *ptr, Type *elTy, Value *mask, unsigned int alignment, bool zeroMaskedLanes)
4282 {
4283 RR_DEBUG_INFO_UPDATE_LOC();
4284 UNIMPLEMENTED_NO_BUG("Subzero createMaskedLoad()");
4285 return nullptr;
4286 }
createMaskedStore(Value * ptr,Value * val,Value * mask,unsigned int alignment)4287 void Nucleus::createMaskedStore(Value *ptr, Value *val, Value *mask, unsigned int alignment)
4288 {
4289 RR_DEBUG_INFO_UPDATE_LOC();
4290 UNIMPLEMENTED_NO_BUG("Subzero createMaskedStore()");
4291 }
4292
Gather(RValue<Pointer<Float>> base,RValue<Int4> offsets,RValue<Int4> mask,unsigned int alignment,bool zeroMaskedLanes)4293 RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
4294 {
4295 RR_DEBUG_INFO_UPDATE_LOC();
4296 return emulated::Gather(base, offsets, mask, alignment, zeroMaskedLanes);
4297 }
4298
Gather(RValue<Pointer<Int>> base,RValue<Int4> offsets,RValue<Int4> mask,unsigned int alignment,bool zeroMaskedLanes)4299 RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
4300 {
4301 RR_DEBUG_INFO_UPDATE_LOC();
4302 return emulated::Gather(base, offsets, mask, alignment, zeroMaskedLanes);
4303 }
4304
Scatter(RValue<Pointer<Float>> base,RValue<Float4> val,RValue<Int4> offsets,RValue<Int4> mask,unsigned int alignment)4305 void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
4306 {
4307 RR_DEBUG_INFO_UPDATE_LOC();
4308 return emulated::Scatter(base, val, offsets, mask, alignment);
4309 }
4310
Scatter(RValue<Pointer<Int>> base,RValue<Int4> val,RValue<Int4> offsets,RValue<Int4> mask,unsigned int alignment)4311 void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
4312 {
4313 RR_DEBUG_INFO_UPDATE_LOC();
4314 return emulated::Scatter(base, val, offsets, mask, alignment);
4315 }
4316
Exp2(RValue<Float> x)4317 RValue<Float> Exp2(RValue<Float> x)
4318 {
4319 RR_DEBUG_INFO_UPDATE_LOC();
4320 return emulated::Exp2(x);
4321 }
4322
Log2(RValue<Float> x)4323 RValue<Float> Log2(RValue<Float> x)
4324 {
4325 RR_DEBUG_INFO_UPDATE_LOC();
4326 return emulated::Log2(x);
4327 }
4328
Sin(RValue<Float4> x)4329 RValue<Float4> Sin(RValue<Float4> x)
4330 {
4331 RR_DEBUG_INFO_UPDATE_LOC();
4332 return optimal::Sin(x);
4333 }
4334
Cos(RValue<Float4> x)4335 RValue<Float4> Cos(RValue<Float4> x)
4336 {
4337 RR_DEBUG_INFO_UPDATE_LOC();
4338 return optimal::Cos(x);
4339 }
4340
Tan(RValue<Float4> x)4341 RValue<Float4> Tan(RValue<Float4> x)
4342 {
4343 RR_DEBUG_INFO_UPDATE_LOC();
4344 return optimal::Tan(x);
4345 }
4346
Asin(RValue<Float4> x,Precision p)4347 RValue<Float4> Asin(RValue<Float4> x, Precision p)
4348 {
4349 RR_DEBUG_INFO_UPDATE_LOC();
4350 if(p == Precision::Full)
4351 {
4352 return emulated::Asin(x);
4353 }
4354 return optimal::Asin_8_terms(x);
4355 }
4356
Acos(RValue<Float4> x,Precision p)4357 RValue<Float4> Acos(RValue<Float4> x, Precision p)
4358 {
4359 RR_DEBUG_INFO_UPDATE_LOC();
4360 // Surprisingly, deqp-vk's precision.acos.highp/mediump tests pass when using the 4-term polynomial approximation
4361 // version of acos, unlike for Asin, which requires higher precision algorithms.
4362 return optimal::Acos_4_terms(x);
4363 }
4364
Atan(RValue<Float4> x)4365 RValue<Float4> Atan(RValue<Float4> x)
4366 {
4367 RR_DEBUG_INFO_UPDATE_LOC();
4368 return optimal::Atan(x);
4369 }
4370
Sinh(RValue<Float4> x)4371 RValue<Float4> Sinh(RValue<Float4> x)
4372 {
4373 RR_DEBUG_INFO_UPDATE_LOC();
4374 return optimal::Sinh(x);
4375 }
4376
Cosh(RValue<Float4> x)4377 RValue<Float4> Cosh(RValue<Float4> x)
4378 {
4379 RR_DEBUG_INFO_UPDATE_LOC();
4380 return optimal::Cosh(x);
4381 }
4382
Tanh(RValue<Float4> x)4383 RValue<Float4> Tanh(RValue<Float4> x)
4384 {
4385 RR_DEBUG_INFO_UPDATE_LOC();
4386 return optimal::Tanh(x);
4387 }
4388
Asinh(RValue<Float4> x)4389 RValue<Float4> Asinh(RValue<Float4> x)
4390 {
4391 RR_DEBUG_INFO_UPDATE_LOC();
4392 return optimal::Asinh(x);
4393 }
4394
Acosh(RValue<Float4> x)4395 RValue<Float4> Acosh(RValue<Float4> x)
4396 {
4397 RR_DEBUG_INFO_UPDATE_LOC();
4398 return optimal::Acosh(x);
4399 }
4400
Atanh(RValue<Float4> x)4401 RValue<Float4> Atanh(RValue<Float4> x)
4402 {
4403 RR_DEBUG_INFO_UPDATE_LOC();
4404 return optimal::Atanh(x);
4405 }
4406
Atan2(RValue<Float4> x,RValue<Float4> y)4407 RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y)
4408 {
4409 RR_DEBUG_INFO_UPDATE_LOC();
4410 return optimal::Atan2(x, y);
4411 }
4412
Pow(RValue<Float4> x,RValue<Float4> y)4413 RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y)
4414 {
4415 RR_DEBUG_INFO_UPDATE_LOC();
4416 return optimal::Pow(x, y);
4417 }
4418
Exp(RValue<Float4> x)4419 RValue<Float4> Exp(RValue<Float4> x)
4420 {
4421 RR_DEBUG_INFO_UPDATE_LOC();
4422 return optimal::Exp(x);
4423 }
4424
Log(RValue<Float4> x)4425 RValue<Float4> Log(RValue<Float4> x)
4426 {
4427 RR_DEBUG_INFO_UPDATE_LOC();
4428 return optimal::Log(x);
4429 }
4430
Exp2(RValue<Float4> x)4431 RValue<Float4> Exp2(RValue<Float4> x)
4432 {
4433 RR_DEBUG_INFO_UPDATE_LOC();
4434 return optimal::Exp2(x);
4435 }
4436
Log2(RValue<Float4> x)4437 RValue<Float4> Log2(RValue<Float4> x)
4438 {
4439 RR_DEBUG_INFO_UPDATE_LOC();
4440 return optimal::Log2(x);
4441 }
4442
Ctlz(RValue<UInt> x,bool isZeroUndef)4443 RValue<UInt> Ctlz(RValue<UInt> x, bool isZeroUndef)
4444 {
4445 RR_DEBUG_INFO_UPDATE_LOC();
4446 if(emulateIntrinsics)
4447 {
4448 UNIMPLEMENTED_NO_BUG("Subzero Ctlz()");
4449 return UInt(0);
4450 }
4451 else
4452 {
4453 Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
4454 const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Ctlz, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
4455 auto ctlz = Ice::InstIntrinsic::create(::function, 1, result, intrinsic);
4456 ctlz->addArg(x.value());
4457 ::basicBlock->appendInst(ctlz);
4458
4459 return RValue<UInt>(V(result));
4460 }
4461 }
4462
Ctlz(RValue<UInt4> x,bool isZeroUndef)4463 RValue<UInt4> Ctlz(RValue<UInt4> x, bool isZeroUndef)
4464 {
4465 RR_DEBUG_INFO_UPDATE_LOC();
4466 if(emulateIntrinsics)
4467 {
4468 UNIMPLEMENTED_NO_BUG("Subzero Ctlz()");
4469 return UInt4(0);
4470 }
4471 else
4472 {
4473 // TODO: implement vectorized version in Subzero
4474 UInt4 result;
4475 result = Insert(result, Ctlz(Extract(x, 0), isZeroUndef), 0);
4476 result = Insert(result, Ctlz(Extract(x, 1), isZeroUndef), 1);
4477 result = Insert(result, Ctlz(Extract(x, 2), isZeroUndef), 2);
4478 result = Insert(result, Ctlz(Extract(x, 3), isZeroUndef), 3);
4479 return result;
4480 }
4481 }
4482
Cttz(RValue<UInt> x,bool isZeroUndef)4483 RValue<UInt> Cttz(RValue<UInt> x, bool isZeroUndef)
4484 {
4485 RR_DEBUG_INFO_UPDATE_LOC();
4486 if(emulateIntrinsics)
4487 {
4488 UNIMPLEMENTED_NO_BUG("Subzero Cttz()");
4489 return UInt(0);
4490 }
4491 else
4492 {
4493 Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
4494 const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Cttz, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
4495 auto ctlz = Ice::InstIntrinsic::create(::function, 1, result, intrinsic);
4496 ctlz->addArg(x.value());
4497 ::basicBlock->appendInst(ctlz);
4498
4499 return RValue<UInt>(V(result));
4500 }
4501 }
4502
Cttz(RValue<UInt4> x,bool isZeroUndef)4503 RValue<UInt4> Cttz(RValue<UInt4> x, bool isZeroUndef)
4504 {
4505 RR_DEBUG_INFO_UPDATE_LOC();
4506 if(emulateIntrinsics)
4507 {
4508 UNIMPLEMENTED_NO_BUG("Subzero Cttz()");
4509 return UInt4(0);
4510 }
4511 else
4512 {
4513 // TODO: implement vectorized version in Subzero
4514 UInt4 result;
4515 result = Insert(result, Cttz(Extract(x, 0), isZeroUndef), 0);
4516 result = Insert(result, Cttz(Extract(x, 1), isZeroUndef), 1);
4517 result = Insert(result, Cttz(Extract(x, 2), isZeroUndef), 2);
4518 result = Insert(result, Cttz(Extract(x, 3), isZeroUndef), 3);
4519 return result;
4520 }
4521 }
4522
MinAtomic(RValue<Pointer<Int>> x,RValue<Int> y,std::memory_order memoryOrder)4523 RValue<Int> MinAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder)
4524 {
4525 RR_DEBUG_INFO_UPDATE_LOC();
4526 return emulated::MinAtomic(x, y, memoryOrder);
4527 }
4528
MinAtomic(RValue<Pointer<UInt>> x,RValue<UInt> y,std::memory_order memoryOrder)4529 RValue<UInt> MinAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder)
4530 {
4531 RR_DEBUG_INFO_UPDATE_LOC();
4532 return emulated::MinAtomic(x, y, memoryOrder);
4533 }
4534
MaxAtomic(RValue<Pointer<Int>> x,RValue<Int> y,std::memory_order memoryOrder)4535 RValue<Int> MaxAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder)
4536 {
4537 RR_DEBUG_INFO_UPDATE_LOC();
4538 return emulated::MaxAtomic(x, y, memoryOrder);
4539 }
4540
MaxAtomic(RValue<Pointer<UInt>> x,RValue<UInt> y,std::memory_order memoryOrder)4541 RValue<UInt> MaxAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder)
4542 {
4543 RR_DEBUG_INFO_UPDATE_LOC();
4544 return emulated::MaxAtomic(x, y, memoryOrder);
4545 }
4546
// When ENABLE_RR_DEBUG_INFO is defined, passes the caller's backtrace to
// emitPrintLocation(). Compiles to an empty function otherwise.
void EmitDebugLocation()
{
#if defined(ENABLE_RR_DEBUG_INFO)
	emitPrintLocation(getCallerBacktrace());
#endif  // defined(ENABLE_RR_DEBUG_INFO)
}
EmitDebugVariable(Value * value)4553 void EmitDebugVariable(Value *value) {}
// Intentionally empty in this backend: there is nothing to flush.
void FlushDebug()
{
}
4555
4556 namespace {
4557 namespace coro {
4558
4559 // Instance data per generated coroutine
4560 // This is the "handle" type used for Coroutine functions
4561 // Lifetime: from yield to when CoroutineEntryDestroy generated function is called.
4562 struct CoroutineData
4563 {
4564 bool useInternalScheduler = false;
4565 bool done = false; // the coroutine should stop at the next yield()
4566 bool terminated = false; // the coroutine has finished.
4567 bool inRoutine = false; // is the coroutine currently executing?
4568 marl::Scheduler::Fiber *mainFiber = nullptr;
4569 marl::Scheduler::Fiber *routineFiber = nullptr;
4570 void *promisePtr = nullptr;
4571 };
4572
createCoroutineData()4573 CoroutineData *createCoroutineData()
4574 {
4575 return new CoroutineData{};
4576 }
4577
destroyCoroutineData(CoroutineData * coroData)4578 void destroyCoroutineData(CoroutineData *coroData)
4579 {
4580 delete coroData;
4581 }
4582
4583 // suspend() pauses execution of the coroutine, and resumes execution from the
4584 // caller's call to await().
4585 // Returns true if await() is called again, or false if coroutine_destroy()
4586 // is called.
suspend(Nucleus::CoroutineHandle handle)4587 bool suspend(Nucleus::CoroutineHandle handle)
4588 {
4589 auto *coroData = reinterpret_cast<CoroutineData *>(handle);
4590 ASSERT(marl::Scheduler::Fiber::current() == coroData->routineFiber);
4591 ASSERT(coroData->inRoutine);
4592 coroData->inRoutine = false;
4593 coroData->mainFiber->notify();
4594 while(!coroData->inRoutine)
4595 {
4596 coroData->routineFiber->wait();
4597 }
4598 return !coroData->done;
4599 }
4600
4601 // resume() is called by await(), blocking until the coroutine calls yield()
4602 // or the coroutine terminates.
resume(Nucleus::CoroutineHandle handle)4603 void resume(Nucleus::CoroutineHandle handle)
4604 {
4605 auto *coroData = reinterpret_cast<CoroutineData *>(handle);
4606 ASSERT(marl::Scheduler::Fiber::current() == coroData->mainFiber);
4607 ASSERT(!coroData->inRoutine);
4608 coroData->inRoutine = true;
4609 coroData->routineFiber->notify();
4610 while(coroData->inRoutine)
4611 {
4612 coroData->mainFiber->wait();
4613 }
4614 }
4615
4616 // stop() is called by coroutine_destroy(), signalling that it's done, then blocks
4617 // until the coroutine ends, and deletes the coroutine data.
stop(Nucleus::CoroutineHandle handle)4618 void stop(Nucleus::CoroutineHandle handle)
4619 {
4620 auto *coroData = reinterpret_cast<CoroutineData *>(handle);
4621 ASSERT(marl::Scheduler::Fiber::current() == coroData->mainFiber);
4622 ASSERT(!coroData->inRoutine);
4623 if(!coroData->terminated)
4624 {
4625 coroData->done = true;
4626 coroData->inRoutine = true;
4627 coroData->routineFiber->notify();
4628 while(!coroData->terminated)
4629 {
4630 coroData->mainFiber->wait();
4631 }
4632 }
4633 if(coroData->useInternalScheduler)
4634 {
4635 ::getOrCreateScheduler().unbind();
4636 }
4637 coro::destroyCoroutineData(coroData); // free the coroutine data.
4638 }
4639
4640 namespace detail {
4641 thread_local rr::Nucleus::CoroutineHandle coroHandle{};
4642 } // namespace detail
4643
setHandleParam(Nucleus::CoroutineHandle handle)4644 void setHandleParam(Nucleus::CoroutineHandle handle)
4645 {
4646 ASSERT(!detail::coroHandle);
4647 detail::coroHandle = handle;
4648 }
4649
getHandleParam()4650 Nucleus::CoroutineHandle getHandleParam()
4651 {
4652 ASSERT(detail::coroHandle);
4653 auto handle = detail::coroHandle;
4654 detail::coroHandle = {};
4655 return handle;
4656 }
4657
isDone(Nucleus::CoroutineHandle handle)4658 bool isDone(Nucleus::CoroutineHandle handle)
4659 {
4660 auto *coroData = reinterpret_cast<CoroutineData *>(handle);
4661 return coroData->done;
4662 }
4663
setPromisePtr(Nucleus::CoroutineHandle handle,void * promisePtr)4664 void setPromisePtr(Nucleus::CoroutineHandle handle, void *promisePtr)
4665 {
4666 auto *coroData = reinterpret_cast<CoroutineData *>(handle);
4667 coroData->promisePtr = promisePtr;
4668 }
4669
getPromisePtr(Nucleus::CoroutineHandle handle)4670 void *getPromisePtr(Nucleus::CoroutineHandle handle)
4671 {
4672 auto *coroData = reinterpret_cast<CoroutineData *>(handle);
4673 return coroData->promisePtr;
4674 }
4675
4676 } // namespace coro
4677 } // namespace
4678
4679 // Used to generate coroutines.
4680 // Lifetime: from yield to acquireCoroutine
4681 class CoroutineGenerator
4682 {
4683 public:
CoroutineGenerator()4684 CoroutineGenerator()
4685 {
4686 }
4687
4688 // Inserts instructions at the top of the current function to make it a coroutine.
generateCoroutineBegin()4689 void generateCoroutineBegin()
4690 {
4691 // Begin building the main coroutine_begin() function.
4692 // We insert these instructions at the top of the entry node,
4693 // before existing reactor-generated instructions.
4694
4695 // CoroutineHandle coroutine_begin(<Arguments>)
4696 // {
4697 // this->handle = coro::getHandleParam();
4698 //
4699 // YieldType promise;
4700 // coro::setPromisePtr(handle, &promise); // For await
4701 //
4702 // ... <REACTOR CODE> ...
4703 //
4704
4705 // this->handle = coro::getHandleParam();
4706 this->handle = sz::Call(::function, ::entryBlock, coro::getHandleParam);
4707
4708 // YieldType promise;
4709 // coro::setPromisePtr(handle, &promise); // For await
4710 this->promise = sz::allocateStackVariable(::function, T(::coroYieldType));
4711 sz::Call(::function, ::entryBlock, coro::setPromisePtr, this->handle, this->promise);
4712 }
4713
4714 // Adds instructions for Yield() calls at the current location of the main coroutine function.
generateYield(Value * val)4715 void generateYield(Value *val)
4716 {
4717 // ... <REACTOR CODE> ...
4718 //
4719 // promise = val;
4720 // if (!coro::suspend(handle)) {
4721 // return false; // coroutine has been stopped by the caller.
4722 // }
4723 //
4724 // ... <REACTOR CODE> ...
4725
4726 // promise = val;
4727 Nucleus::createStore(val, V(this->promise), ::coroYieldType);
4728
4729 // if (!coro::suspend(handle)) {
4730 auto result = sz::Call(::function, ::basicBlock, coro::suspend, this->handle);
4731 auto doneBlock = Nucleus::createBasicBlock();
4732 auto resumeBlock = Nucleus::createBasicBlock();
4733 Nucleus::createCondBr(V(result), resumeBlock, doneBlock);
4734
4735 // return false; // coroutine has been stopped by the caller.
4736 ::basicBlock = doneBlock;
4737 Nucleus::createRetVoid(); // coroutine return value is ignored.
4738
4739 // ... <REACTOR CODE> ...
4740 ::basicBlock = resumeBlock;
4741 }
4742
4743 using FunctionUniquePtr = std::unique_ptr<Ice::Cfg>;
4744
4745 // Generates the await function for the current coroutine.
4746 // Cannot use Nucleus functions that modify ::function and ::basicBlock.
generateAwaitFunction()4747 static FunctionUniquePtr generateAwaitFunction()
4748 {
4749 // bool coroutine_await(CoroutineHandle handle, YieldType* out)
4750 // {
4751 // if (coro::isDone())
4752 // {
4753 // return false;
4754 // }
4755 // else // resume
4756 // {
4757 // YieldType* promise = coro::getPromisePtr(handle);
4758 // *out = *promise;
4759 // coro::resume(handle);
4760 // return true;
4761 // }
4762 // }
4763
4764 // Subzero doesn't support bool types (IceType_i1) as return type
4765 const Ice::Type ReturnType = Ice::IceType_i32;
4766 const Ice::Type YieldPtrType = sz::getPointerType(T(::coroYieldType));
4767 const Ice::Type HandleType = sz::getPointerType(Ice::IceType_void);
4768
4769 Ice::Cfg *awaitFunc = sz::createFunction(::context, ReturnType, std::vector<Ice::Type>{ HandleType, YieldPtrType });
4770 Ice::CfgLocalAllocatorScope scopedAlloc{ awaitFunc };
4771
4772 Ice::Variable *handle = awaitFunc->getArgs()[0];
4773 Ice::Variable *outPtr = awaitFunc->getArgs()[1];
4774
4775 auto doneBlock = awaitFunc->makeNode();
4776 {
4777 // return false;
4778 Ice::InstRet *ret = Ice::InstRet::create(awaitFunc, ::context->getConstantInt32(0));
4779 doneBlock->appendInst(ret);
4780 }
4781
4782 auto resumeBlock = awaitFunc->makeNode();
4783 {
4784 // YieldType* promise = coro::getPromisePtr(handle);
4785 Ice::Variable *promise = sz::Call(awaitFunc, resumeBlock, coro::getPromisePtr, handle);
4786
4787 // *out = *promise;
4788 // Load promise value
4789 Ice::Variable *promiseVal = awaitFunc->makeVariable(T(::coroYieldType));
4790 auto load = Ice::InstLoad::create(awaitFunc, promiseVal, promise);
4791 resumeBlock->appendInst(load);
4792 // Then store it in output param
4793 auto store = Ice::InstStore::create(awaitFunc, promiseVal, outPtr);
4794 resumeBlock->appendInst(store);
4795
4796 // coro::resume(handle);
4797 sz::Call(awaitFunc, resumeBlock, coro::resume, handle);
4798
4799 // return true;
4800 Ice::InstRet *ret = Ice::InstRet::create(awaitFunc, ::context->getConstantInt32(1));
4801 resumeBlock->appendInst(ret);
4802 }
4803
4804 // if (coro::isDone())
4805 // {
4806 // <doneBlock>
4807 // }
4808 // else // resume
4809 // {
4810 // <resumeBlock>
4811 // }
4812 Ice::CfgNode *bb = awaitFunc->getEntryNode();
4813 Ice::Variable *done = sz::Call(awaitFunc, bb, coro::isDone, handle);
4814 auto br = Ice::InstBr::create(awaitFunc, done, doneBlock, resumeBlock);
4815 bb->appendInst(br);
4816
4817 return FunctionUniquePtr{ awaitFunc };
4818 }
4819
4820 // Generates the destroy function for the current coroutine.
4821 // Cannot use Nucleus functions that modify ::function and ::basicBlock.
generateDestroyFunction()4822 static FunctionUniquePtr generateDestroyFunction()
4823 {
4824 // void coroutine_destroy(Nucleus::CoroutineHandle handle)
4825 // {
4826 // coro::stop(handle); // signal and wait for coroutine to stop, and delete coroutine data
4827 // return;
4828 // }
4829
4830 const Ice::Type ReturnType = Ice::IceType_void;
4831 const Ice::Type HandleType = sz::getPointerType(Ice::IceType_void);
4832
4833 Ice::Cfg *destroyFunc = sz::createFunction(::context, ReturnType, std::vector<Ice::Type>{ HandleType });
4834 Ice::CfgLocalAllocatorScope scopedAlloc{ destroyFunc };
4835
4836 Ice::Variable *handle = destroyFunc->getArgs()[0];
4837
4838 auto *bb = destroyFunc->getEntryNode();
4839
4840 // coro::stop(handle); // signal and wait for coroutine to stop, and delete coroutine data
4841 sz::Call(destroyFunc, bb, coro::stop, handle);
4842
4843 // return;
4844 Ice::InstRet *ret = Ice::InstRet::create(destroyFunc);
4845 bb->appendInst(ret);
4846
4847 return FunctionUniquePtr{ destroyFunc };
4848 }
4849
4850 private:
4851 Ice::Variable *handle{};
4852 Ice::Variable *promise{};
4853 };
4854
invokeCoroutineBegin(std::function<Nucleus::CoroutineHandle ()> beginFunc)4855 static Nucleus::CoroutineHandle invokeCoroutineBegin(std::function<Nucleus::CoroutineHandle()> beginFunc)
4856 {
4857 // This doubles up as our coroutine handle
4858 auto coroData = coro::createCoroutineData();
4859
4860 coroData->useInternalScheduler = (marl::Scheduler::get() == nullptr);
4861 if(coroData->useInternalScheduler)
4862 {
4863 ::getOrCreateScheduler().bind();
4864 }
4865
4866 auto run = [=] {
4867 // Store handle in TLS so that the coroutine can grab it right away, before
4868 // any fiber switch occurs.
4869 coro::setHandleParam(coroData);
4870
4871 ASSERT(!coroData->routineFiber);
4872 coroData->routineFiber = marl::Scheduler::Fiber::current();
4873
4874 beginFunc();
4875
4876 ASSERT(coroData->inRoutine);
4877 coroData->done = true; // coroutine is done.
4878 coroData->terminated = true; // signal that the coroutine data is ready for freeing.
4879 coroData->inRoutine = false;
4880 coroData->mainFiber->notify();
4881 };
4882
4883 ASSERT(!coroData->mainFiber);
4884 coroData->mainFiber = marl::Scheduler::Fiber::current();
4885
4886 // block until the first yield or coroutine end
4887 ASSERT(!coroData->inRoutine);
4888 coroData->inRoutine = true;
4889 marl::schedule(marl::Task(run, marl::Task::Flags::SameThread));
4890 while(coroData->inRoutine)
4891 {
4892 coroData->mainFiber->wait();
4893 }
4894
4895 return coroData;
4896 }
4897
createCoroutine(Type * yieldType,const std::vector<Type * > & params)4898 void Nucleus::createCoroutine(Type *yieldType, const std::vector<Type *> ¶ms)
4899 {
4900 // Start by creating a regular function
4901 createFunction(yieldType, params);
4902
4903 // Save in case yield() is called
4904 ASSERT(::coroYieldType == nullptr); // Only one coroutine can be generated at once
4905 ::coroYieldType = yieldType;
4906 }
4907
yield(Value * val)4908 void Nucleus::yield(Value *val)
4909 {
4910 RR_DEBUG_INFO_UPDATE_LOC();
4911 Variable::materializeAll();
4912
4913 // On first yield, we start generating coroutine functions
4914 if(!::coroGen)
4915 {
4916 ::coroGen = std::make_shared<CoroutineGenerator>();
4917 ::coroGen->generateCoroutineBegin();
4918 }
4919
4920 ASSERT(::coroGen);
4921 ::coroGen->generateYield(val);
4922 }
4923
coroutineEntryAwaitStub(Nucleus::CoroutineHandle,void * yieldValue)4924 static bool coroutineEntryAwaitStub(Nucleus::CoroutineHandle, void *yieldValue)
4925 {
4926 return false;
4927 }
4928
coroutineEntryDestroyStub(Nucleus::CoroutineHandle handle)4929 static void coroutineEntryDestroyStub(Nucleus::CoroutineHandle handle)
4930 {
4931 }
4932
acquireCoroutine(const char * name,const Config::Edit & cfgEdit)4933 std::shared_ptr<Routine> Nucleus::acquireCoroutine(const char *name, const Config::Edit &cfgEdit /* = Config::Edit::None */)
4934 {
4935 if(::coroGen)
4936 {
4937 // Finish generating coroutine functions
4938 {
4939 Ice::CfgLocalAllocatorScope scopedAlloc{ ::function };
4940 finalizeFunction();
4941 }
4942
4943 auto awaitFunc = ::coroGen->generateAwaitFunction();
4944 auto destroyFunc = ::coroGen->generateDestroyFunction();
4945
4946 // At this point, we no longer need the CoroutineGenerator.
4947 ::coroGen.reset();
4948 ::coroYieldType = nullptr;
4949
4950 auto routine = rr::acquireRoutine({ ::function, awaitFunc.get(), destroyFunc.get() },
4951 { name, "await", "destroy" },
4952 cfgEdit);
4953
4954 return routine;
4955 }
4956 else
4957 {
4958 {
4959 Ice::CfgLocalAllocatorScope scopedAlloc{ ::function };
4960 finalizeFunction();
4961 }
4962
4963 ::coroYieldType = nullptr;
4964
4965 // Not an actual coroutine (no yields), so return stubs for await and destroy
4966 auto routine = rr::acquireRoutine({ ::function }, { name }, cfgEdit);
4967
4968 auto routineImpl = std::static_pointer_cast<ELFMemoryStreamer>(routine);
4969 routineImpl->setEntry(Nucleus::CoroutineEntryAwait, reinterpret_cast<const void *>(&coroutineEntryAwaitStub));
4970 routineImpl->setEntry(Nucleus::CoroutineEntryDestroy, reinterpret_cast<const void *>(&coroutineEntryDestroyStub));
4971 return routine;
4972 }
4973 }
4974
invokeCoroutineBegin(Routine & routine,std::function<Nucleus::CoroutineHandle ()> func)4975 Nucleus::CoroutineHandle Nucleus::invokeCoroutineBegin(Routine &routine, std::function<Nucleus::CoroutineHandle()> func)
4976 {
4977 const bool isCoroutine = routine.getEntry(Nucleus::CoroutineEntryAwait) != reinterpret_cast<const void *>(&coroutineEntryAwaitStub);
4978
4979 if(isCoroutine)
4980 {
4981 return rr::invokeCoroutineBegin(func);
4982 }
4983 else
4984 {
4985 // For regular routines, just invoke the begin func directly
4986 return func();
4987 }
4988 }
4989
4990 } // namespace rr
4991