1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "LLVMReactor.hpp"
16 
17 #include "CPUID.hpp"
18 #include "Debug.hpp"
19 #include "EmulatedIntrinsics.hpp"
20 #include "LLVMReactorDebugInfo.hpp"
21 #include "Print.hpp"
22 #include "Reactor.hpp"
23 #include "x86.hpp"
24 
25 #include "llvm/IR/Intrinsics.h"
26 #include "llvm/IR/IntrinsicsX86.h"
27 #include "llvm/IR/LegacyPassManager.h"
28 #include "llvm/IR/Verifier.h"
29 #include "llvm/Support/Alignment.h"
30 #include "llvm/Support/ManagedStatic.h"
31 #include "llvm/Transforms/Coroutines.h"
32 #include "llvm/Transforms/IPO.h"
33 #include "llvm/Transforms/Scalar.h"
34 
35 #include <fstream>
36 #include <iostream>
37 #include <mutex>
38 #include <numeric>
39 #include <thread>
40 #include <unordered_map>
41 
42 #if defined(__i386__) || defined(__x86_64__)
43 #	include <xmmintrin.h>
44 #endif
45 
46 #include <math.h>
47 
48 #if defined(__x86_64__) && defined(_WIN32)
extern "C" void X86CompilationCallback()
50 {
51 	UNIMPLEMENTED_NO_BUG("X86CompilationCallback");
52 }
53 #endif
54 
55 #if !LLVM_ENABLE_THREADS
56 #	error "LLVM_ENABLE_THREADS needs to be enabled"
57 #endif
58 
59 #if LLVM_VERSION_MAJOR < 11
60 namespace llvm {
61 using FixedVectorType = VectorType;
62 }  // namespace llvm
63 #endif
64 
65 namespace {
66 
67 // Used to automatically invoke llvm_shutdown() when driver is unloaded
68 llvm::llvm_shutdown_obj llvmShutdownObj;
69 
70 // This has to be a raw pointer because glibc 2.17 doesn't support __cxa_thread_atexit_impl
71 // for destructing objects at exit. See crbug.com/1074222
72 thread_local rr::JITBuilder *jit = nullptr;
73 
74 // Default configuration settings. Must be accessed under mutex lock.
75 std::mutex defaultConfigLock;
rr::Config &defaultConfig()
77 {
78 	// This uses a static in a function to avoid the cost of a global static
79 	// initializer. See http://neugierig.org/software/chromium/notes/2011/08/static-initializers.html
80 	static rr::Config config = rr::Config::Edit()
81 	                               .add(rr::Optimization::Pass::ScalarReplAggregates)
82 	                               .add(rr::Optimization::Pass::InstructionCombining)
83 	                               .apply({});
84 	return config;
85 }
86 
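// Emulates a rounding average of unsigned elements, (x + y + 1) >> 1, by
// computing it in a widened element type so the addition cannot overflow
// (cf. the x86 pavgb/pavgw instructions).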
llvm::Value *lowerPAVG(llvm::Value *x, llvm::Value *y)
88 {
89 	llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
90 
91 	llvm::VectorType *extTy =
92 	    llvm::VectorType::getExtendedElementVectorType(ty);
93 	x = jit->builder->CreateZExt(x, extTy);
94 	y = jit->builder->CreateZExt(y, extTy);
95 
96 	// (x + y + 1) >> 1
97 	llvm::Constant *one = llvm::ConstantInt::get(extTy, 1);
98 	llvm::Value *res = jit->builder->CreateAdd(x, y);
99 	res = jit->builder->CreateAdd(res, one);
100 	res = jit->builder->CreateLShr(res, one);
101 	return jit->builder->CreateTrunc(res, ty);
102 }
103 
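// Element-wise min/max via compare-and-select: keeps x in each lane where
// icmp(pred, x, y) holds, otherwise keeps y.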
llvm::Value *lowerPMINMAX(llvm::Value *x, llvm::Value *y,
105                           llvm::ICmpInst::Predicate pred)
106 {
107 	return jit->builder->CreateSelect(jit->builder->CreateICmp(pred, x, y), x, y);
108 }
109 
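// Element-wise integer comparison producing all-ones (true) or all-zeros
// (false) lanes, by sign-extending the i1 results to dstTy.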
llvm::Value *lowerPCMP(llvm::ICmpInst::Predicate pred, llvm::Value *x,
111                        llvm::Value *y, llvm::Type *dstTy)
112 {
113 	return jit->builder->CreateSExt(jit->builder->CreateICmp(pred, x, y), dstTy, "");
114 }
115 
116 #if defined(__i386__) || defined(__x86_64__)
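// Extends the low elements of 'op' to the wider element type of dstType,
// using sign or zero extension (cf. the x86 pmovsx/pmovzx instructions).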
llvm::Value *lowerPMOV(llvm::Value *op, llvm::Type *dstType, bool sext)
118 {
119 	llvm::VectorType *srcTy = llvm::cast<llvm::VectorType>(op->getType());
120 	llvm::FixedVectorType *dstTy = llvm::cast<llvm::FixedVectorType>(dstType);
121 
122 	llvm::Value *undef = llvm::UndefValue::get(srcTy);
123 	llvm::SmallVector<uint32_t, 16> mask(dstTy->getNumElements());
124 	std::iota(mask.begin(), mask.end(), 0);
125 	llvm::Value *v = jit->builder->CreateShuffleVector(op, undef, mask);
126 
127 	return sext ? jit->builder->CreateSExt(v, dstTy)
128 	            : jit->builder->CreateZExt(v, dstTy);
129 }
130 
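// Element-wise integer absolute value: select(v > 0, v, -v).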
llvm::Value *lowerPABS(llvm::Value *v)
132 {
133 	llvm::Value *zero = llvm::Constant::getNullValue(v->getType());
134 	llvm::Value *cmp = jit->builder->CreateICmp(llvm::ICmpInst::ICMP_SGT, v, zero);
135 	llvm::Value *neg = jit->builder->CreateNeg(v);
136 	return jit->builder->CreateSelect(cmp, v, neg);
137 }
138 #endif  // defined(__i386__) || defined(__x86_64__)
139 
140 #if !defined(__i386__) && !defined(__x86_64__)
llvm::Value *lowerPFMINMAX(llvm::Value *x, llvm::Value *y,
142                            llvm::FCmpInst::Predicate pred)
143 {
144 	return jit->builder->CreateSelect(jit->builder->CreateFCmp(pred, x, y), x, y);
145 }
146 
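// Rounds to the nearest integral value using llvm.nearbyint, which honors the
// current rounding mode (round-to-nearest-even in the default environment).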
llvm::Value *lowerRound(llvm::Value *x)
148 {
149 	llvm::Function *nearbyint = llvm::Intrinsic::getDeclaration(
150 	    jit->module.get(), llvm::Intrinsic::nearbyint, { x->getType() });
151 	return jit->builder->CreateCall(nearbyint, { x });
152 }
153 
llvm::Value *lowerRoundInt(llvm::Value *x, llvm::Type *ty)
155 {
156 	return jit->builder->CreateFPToSI(lowerRound(x), ty);
157 }
158 
llvm::Value *lowerFloor(llvm::Value *x)
160 {
161 	llvm::Function *floor = llvm::Intrinsic::getDeclaration(
162 	    jit->module.get(), llvm::Intrinsic::floor, { x->getType() });
163 	return jit->builder->CreateCall(floor, { x });
164 }
165 
llvm::Value *lowerTrunc(llvm::Value *x)
167 {
168 	llvm::Function *trunc = llvm::Intrinsic::getDeclaration(
169 	    jit->module.get(), llvm::Intrinsic::trunc, { x->getType() });
170 	return jit->builder->CreateCall(trunc, { x });
171 }
172 
llvm::Value *lowerSQRT(llvm::Value *x)
174 {
175 	llvm::Function *sqrt = llvm::Intrinsic::getDeclaration(
176 	    jit->module.get(), llvm::Intrinsic::sqrt, { x->getType() });
177 	return jit->builder->CreateCall(sqrt, { x });
178 }
179 
llvm::Value *lowerRCP(llvm::Value *x)
181 {
182 	llvm::Type *ty = x->getType();
183 	llvm::Constant *one;
184 	if(llvm::FixedVectorType *vectorTy = llvm::dyn_cast<llvm::FixedVectorType>(ty))
185 	{
186 		one = llvm::ConstantVector::getSplat(
187 		    vectorTy->getNumElements(),
188 		    llvm::ConstantFP::get(vectorTy->getElementType(), 1));
189 	}
190 	else
191 	{
192 		one = llvm::ConstantFP::get(ty, 1);
193 	}
194 	return jit->builder->CreateFDiv(one, x);
195 }
196 
llvm::Value *lowerRSQRT(llvm::Value *x)
198 {
199 	return lowerRCP(lowerSQRT(x));
200 }
201 
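// The lowerVector{Shl,AShr,LShr} helpers shift every element by the same
// constant amount by splatting scalarY across the vector.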
llvm::Value *lowerVectorShl(llvm::Value *x, uint64_t scalarY)
203 {
204 	llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
205 	llvm::Value *y = llvm::ConstantVector::getSplat(
206 	    ty->getNumElements(),
207 	    llvm::ConstantInt::get(ty->getElementType(), scalarY));
208 	return jit->builder->CreateShl(x, y);
209 }
210 
llvm::Value *lowerVectorAShr(llvm::Value *x, uint64_t scalarY)
212 {
213 	llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
214 	llvm::Value *y = llvm::ConstantVector::getSplat(
215 	    ty->getNumElements(),
216 	    llvm::ConstantInt::get(ty->getElementType(), scalarY));
217 	return jit->builder->CreateAShr(x, y);
218 }
219 
llvm::Value *lowerVectorLShr(llvm::Value *x, uint64_t scalarY)
221 {
222 	llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
223 	llvm::Value *y = llvm::ConstantVector::getSplat(
224 	    ty->getNumElements(),
225 	    llvm::ConstantInt::get(ty->getElementType(), scalarY));
226 	return jit->builder->CreateLShr(x, y);
227 }
228 
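// Multiplies adjacent signed element pairs in a widened type and adds each
// pair of products, halving the element count (cf. the x86 pmaddwd
// instruction).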
llvm::Value *lowerMulAdd(llvm::Value *x, llvm::Value *y)
230 {
231 	llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
232 	llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
233 
234 	llvm::Value *extX = jit->builder->CreateSExt(x, extTy);
235 	llvm::Value *extY = jit->builder->CreateSExt(y, extTy);
236 	llvm::Value *mult = jit->builder->CreateMul(extX, extY);
237 
238 	llvm::Value *undef = llvm::UndefValue::get(extTy);
239 
240 	llvm::SmallVector<uint32_t, 16> evenIdx;
241 	llvm::SmallVector<uint32_t, 16> oddIdx;
242 	for(uint64_t i = 0, n = ty->getNumElements(); i < n; i += 2)
243 	{
244 		evenIdx.push_back(i);
245 		oddIdx.push_back(i + 1);
246 	}
247 
248 	llvm::Value *lhs = jit->builder->CreateShuffleVector(mult, undef, evenIdx);
249 	llvm::Value *rhs = jit->builder->CreateShuffleVector(mult, undef, oddIdx);
250 	return jit->builder->CreateAdd(lhs, rhs);
251 }
252 
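// Packs two vectors into one with elements of half the width, clamping each
// value to the signed or unsigned range of the narrower element type
// (cf. the x86 packsswb/packuswb instructions).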
llvm::Value *lowerPack(llvm::Value *x, llvm::Value *y, bool isSigned)
254 {
255 	llvm::FixedVectorType *srcTy = llvm::cast<llvm::FixedVectorType>(x->getType());
256 	llvm::VectorType *dstTy = llvm::VectorType::getTruncatedElementVectorType(srcTy);
257 
258 	llvm::IntegerType *dstElemTy =
259 	    llvm::cast<llvm::IntegerType>(dstTy->getElementType());
260 
261 	uint64_t truncNumBits = dstElemTy->getIntegerBitWidth();
262 	ASSERT_MSG(truncNumBits < 64, "shift 64 must be handled separately. truncNumBits: %d", int(truncNumBits));
263 	llvm::Constant *max, *min;
264 	if(isSigned)
265 	{
266 		max = llvm::ConstantInt::get(srcTy, (1LL << (truncNumBits - 1)) - 1, true);
267 		min = llvm::ConstantInt::get(srcTy, (-1LL << (truncNumBits - 1)), true);
268 	}
269 	else
270 	{
271 		max = llvm::ConstantInt::get(srcTy, (1ULL << truncNumBits) - 1, false);
272 		min = llvm::ConstantInt::get(srcTy, 0, false);
273 	}
274 
275 	x = lowerPMINMAX(x, min, llvm::ICmpInst::ICMP_SGT);
276 	x = lowerPMINMAX(x, max, llvm::ICmpInst::ICMP_SLT);
277 	y = lowerPMINMAX(y, min, llvm::ICmpInst::ICMP_SGT);
278 	y = lowerPMINMAX(y, max, llvm::ICmpInst::ICMP_SLT);
279 
280 	x = jit->builder->CreateTrunc(x, dstTy);
281 	y = jit->builder->CreateTrunc(y, dstTy);
282 
283 	llvm::SmallVector<uint32_t, 16> index(srcTy->getNumElements() * 2);
284 	std::iota(index.begin(), index.end(), 0);
285 
286 	return jit->builder->CreateShuffleVector(x, y, index);
287 }
288 
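// Gathers the sign bit of every element into a scalar bit mask, with element
// i mapped to bit i (cf. the x86 movmskps/pmovmskb instructions).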
llvm::Value *lowerSignMask(llvm::Value *x, llvm::Type *retTy)
290 {
291 	llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
292 	llvm::Constant *zero = llvm::ConstantInt::get(ty, 0);
293 	llvm::Value *cmp = jit->builder->CreateICmpSLT(x, zero);
294 
295 	llvm::Value *ret = jit->builder->CreateZExt(
296 	    jit->builder->CreateExtractElement(cmp, static_cast<uint64_t>(0)), retTy);
297 	for(uint64_t i = 1, n = ty->getNumElements(); i < n; ++i)
298 	{
299 		llvm::Value *elem = jit->builder->CreateZExt(
300 		    jit->builder->CreateExtractElement(cmp, i), retTy);
301 		ret = jit->builder->CreateOr(ret, jit->builder->CreateShl(elem, i));
302 	}
303 	return ret;
304 }
305 
llvm::Value *lowerFPSignMask(llvm::Value *x, llvm::Type *retTy)
307 {
308 	llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
309 	llvm::Constant *zero = llvm::ConstantFP::get(ty, 0);
310 	llvm::Value *cmp = jit->builder->CreateFCmpULT(x, zero);
311 
312 	llvm::Value *ret = jit->builder->CreateZExt(
313 	    jit->builder->CreateExtractElement(cmp, static_cast<uint64_t>(0)), retTy);
314 	for(uint64_t i = 1, n = ty->getNumElements(); i < n; ++i)
315 	{
316 		llvm::Value *elem = jit->builder->CreateZExt(
317 		    jit->builder->CreateExtractElement(cmp, i), retTy);
318 		ret = jit->builder->CreateOr(ret, jit->builder->CreateShl(elem, i));
319 	}
320 	return ret;
321 }
322 #endif  // !defined(__i386__) && !defined(__x86_64__)
323 
llvm::Value *lowerPUADDSAT(llvm::Value *x, llvm::Value *y)
325 {
326 	return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::uadd_sat, x, y);
327 }
328 
llvm::Value *lowerPSADDSAT(llvm::Value *x, llvm::Value *y)
330 {
331 	return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::sadd_sat, x, y);
332 }
333 
llvm::Value *lowerPUSUBSAT(llvm::Value *x, llvm::Value *y)
335 {
336 	return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::usub_sat, x, y);
337 }
338 
llvm::Value *lowerPSSUBSAT(llvm::Value *x, llvm::Value *y)
340 {
341 	return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::ssub_sat, x, y);
342 }
343 
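// Returns the high half of the widened product of each element pair; 'sext'
// selects between signed and unsigned multiplication.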
llvm::Value *lowerMulHigh(llvm::Value *x, llvm::Value *y, bool sext)
345 {
346 	llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
347 	llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
348 
349 	llvm::Value *extX, *extY;
350 	if(sext)
351 	{
352 		extX = jit->builder->CreateSExt(x, extTy);
353 		extY = jit->builder->CreateSExt(y, extTy);
354 	}
355 	else
356 	{
357 		extX = jit->builder->CreateZExt(x, extTy);
358 		extY = jit->builder->CreateZExt(y, extTy);
359 	}
360 
361 	llvm::Value *mult = jit->builder->CreateMul(extX, extY);
362 
363 	llvm::IntegerType *intTy = llvm::cast<llvm::IntegerType>(ty->getElementType());
364 	llvm::Value *mulh = jit->builder->CreateAShr(mult, intTy->getBitWidth());
365 	return jit->builder->CreateTrunc(mulh, ty);
366 }
367 
368 }  // namespace
369 
370 namespace rr {
371 
std::string BackendName()
373 {
374 	return std::string("LLVM ") + LLVM_VERSION_STRING;
375 }
376 
377 const Capabilities Caps = {
378 	true,  // CoroutinesSupported
379 };
380 
381 // The abstract Type* types are implemented as LLVM types, except that
382 // 64-bit vectors are emulated using 128-bit ones to avoid use of MMX in x86
383 // and VFP in ARM, and eliminate the overhead of converting them to explicit
384 // 128-bit ones. LLVM types are pointers, so we can represent emulated types
385 // as abstract pointers with small enum values.
386 enum InternalType : uintptr_t
387 {
388 	// Emulated types:
389 	Type_v2i32,
390 	Type_v4i16,
391 	Type_v2i16,
392 	Type_v8i8,
393 	Type_v4i8,
394 	Type_v2f32,
395 	EmulatedTypeCount,
396 	// Returned by asInternalType() to indicate that the abstract Type*
397 	// should be interpreted as LLVM type pointer:
398 	Type_LLVM
399 };
400 
inline InternalType asInternalType(Type *type)
402 {
403 	InternalType t = static_cast<InternalType>(reinterpret_cast<uintptr_t>(type));
404 	return (t < EmulatedTypeCount) ? t : Type_LLVM;
405 }
406 
llvm::Type *T(Type *t)
408 {
409 	// Use 128-bit vectors to implement logically shorter ones.
410 	switch(asInternalType(t))
411 	{
412 		case Type_v2i32: return T(Int4::type());
413 		case Type_v4i16: return T(Short8::type());
414 		case Type_v2i16: return T(Short8::type());
415 		case Type_v8i8: return T(Byte16::type());
416 		case Type_v4i8: return T(Byte16::type());
417 		case Type_v2f32: return T(Float4::type());
418 		case Type_LLVM: return reinterpret_cast<llvm::Type *>(t);
419 		default:
420 			UNREACHABLE("asInternalType(t): %d", int(asInternalType(t)));
421 			return nullptr;
422 	}
423 }
424 
Type *T(InternalType t)
426 {
427 	return reinterpret_cast<Type *>(t);
428 }
429 
inline const std::vector<llvm::Type *> &T(const std::vector<Type *> &t)
431 {
432 	return reinterpret_cast<const std::vector<llvm::Type *> &>(t);
433 }
434 
inline llvm::BasicBlock *B(BasicBlock *t)
436 {
437 	return reinterpret_cast<llvm::BasicBlock *>(t);
438 }
439 
inline BasicBlock *B(llvm::BasicBlock *t)
441 {
442 	return reinterpret_cast<BasicBlock *>(t);
443 }
444 
static size_t typeSize(Type *type)
446 {
447 	switch(asInternalType(type))
448 	{
449 		case Type_v2i32: return 8;
450 		case Type_v4i16: return 8;
451 		case Type_v2i16: return 4;
452 		case Type_v8i8: return 8;
453 		case Type_v4i8: return 4;
454 		case Type_v2f32: return 8;
455 		case Type_LLVM:
456 		{
457 			llvm::Type *t = T(type);
458 
459 			if(t->isPointerTy())
460 			{
461 				return sizeof(void *);
462 			}
463 
464 			// At this point we should only have LLVM 'primitive' types.
465 			unsigned int bits = t->getPrimitiveSizeInBits();
466 			ASSERT_MSG(bits != 0, "bits: %d", int(bits));
467 
468 			// TODO(capn): Booleans are 1 bit integers in LLVM's SSA type system,
469 			// but are typically stored as one byte. The DataLayout structure should
470 			// be used here and many other places if this assumption fails.
471 			return (bits + 7) / 8;
472 		}
473 		break;
474 		default:
475 			UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
476 			return 0;
477 	}
478 }
479 
static unsigned int elementCount(Type *type)
481 {
482 	switch(asInternalType(type))
483 	{
484 		case Type_v2i32: return 2;
485 		case Type_v4i16: return 4;
486 		case Type_v2i16: return 2;
487 		case Type_v8i8: return 8;
488 		case Type_v4i8: return 4;
489 		case Type_v2f32: return 2;
490 		case Type_LLVM: return llvm::cast<llvm::FixedVectorType>(T(type))->getNumElements();
491 		default:
492 			UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
493 			return 0;
494 	}
495 }
496 
static llvm::Function *createFunction(const char *name, llvm::Type *retTy, const std::vector<llvm::Type *> &params)
498 {
499 	llvm::FunctionType *functionType = llvm::FunctionType::get(retTy, params, false);
500 	auto func = llvm::Function::Create(functionType, llvm::GlobalValue::InternalLinkage, name, jit->module.get());
501 
502 	func->setLinkage(llvm::GlobalValue::ExternalLinkage);
503 	func->setDoesNotThrow();
504 	func->setCallingConv(llvm::CallingConv::C);
505 
506 	if(__has_feature(memory_sanitizer))
507 	{
508 		func->addFnAttr(llvm::Attribute::SanitizeMemory);
509 	}
510 
511 	return func;
512 }
513 
Nucleus::Nucleus()
515 {
516 #if !__has_feature(memory_sanitizer)
	// thread_local variables in shared libraries are initialized at load-time,
	// but this is not observed by MemorySanitizer if the loader itself was not
	// instrumented, leading to false-positive uninitialized variable errors.
520 	ASSERT(jit == nullptr);
521 	ASSERT(Variable::unmaterializedVariables == nullptr);
522 #endif
523 
524 	jit = new JITBuilder(Nucleus::getDefaultConfig());
525 	Variable::unmaterializedVariables = new Variable::UnmaterializedVariables{};
526 }
527 
Nucleus::~Nucleus()
529 {
530 	delete Variable::unmaterializedVariables;
531 	Variable::unmaterializedVariables = nullptr;
532 
533 	delete jit;
534 	jit = nullptr;
535 }
536 
void Nucleus::setDefaultConfig(const Config &cfg)
538 {
539 	std::unique_lock<std::mutex> lock(::defaultConfigLock);
540 	::defaultConfig() = cfg;
541 }
542 
void Nucleus::adjustDefaultConfig(const Config::Edit &cfgEdit)
544 {
545 	std::unique_lock<std::mutex> lock(::defaultConfigLock);
546 	auto &config = ::defaultConfig();
547 	config = cfgEdit.apply(config);
548 }
549 
Config Nucleus::getDefaultConfig()
551 {
552 	std::unique_lock<std::mutex> lock(::defaultConfigLock);
553 	return ::defaultConfig();
554 }
555 
std::shared_ptr<Routine> Nucleus::acquireRoutine(const char *name, const Config::Edit &cfgEdit /* = Config::Edit::None */)
557 {
558 	if(jit->builder->GetInsertBlock()->empty() || !jit->builder->GetInsertBlock()->back().isTerminator())
559 	{
560 		llvm::Type *type = jit->function->getReturnType();
561 
562 		if(type->isVoidTy())
563 		{
564 			createRetVoid();
565 		}
566 		else
567 		{
568 			createRet(V(llvm::UndefValue::get(type)));
569 		}
570 	}
571 
572 	std::shared_ptr<Routine> routine;
573 
574 	auto acquire = [&](rr::JITBuilder *jit) {
575 		// ::jit is thread-local, so when this is executed on a separate thread (see JIT_IN_SEPARATE_THREAD)
576 		// it needs to only use the jit variable passed in as an argument.
577 
578 		auto cfg = cfgEdit.apply(jit->config);
579 
580 #ifdef ENABLE_RR_DEBUG_INFO
581 		if(jit->debugInfo != nullptr)
582 		{
583 			jit->debugInfo->Finalize();
584 		}
585 #endif  // ENABLE_RR_DEBUG_INFO
586 
587 		if(false)
588 		{
589 			std::error_code error;
590 			llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-unopt.txt", error);
591 			jit->module->print(file, 0);
592 		}
593 
594 #if defined(ENABLE_RR_LLVM_IR_VERIFICATION) || !defined(NDEBUG)
595 		{
596 			llvm::legacy::PassManager pm;
597 			pm.add(llvm::createVerifierPass());
598 			pm.run(*jit->module);
599 		}
600 #endif  // defined(ENABLE_RR_LLVM_IR_VERIFICATION) || !defined(NDEBUG)
601 
602 		jit->optimize(cfg);
603 
604 		if(false)
605 		{
606 			std::error_code error;
607 			llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-opt.txt", error);
608 			jit->module->print(file, 0);
609 		}
610 
611 		routine = jit->acquireRoutine(name, &jit->function, 1, cfg);
612 	};
613 
614 #ifdef JIT_IN_SEPARATE_THREAD
615 	// Perform optimizations and codegen in a separate thread to avoid stack overflow.
616 	// FIXME(b/149829034): This is not a long-term solution. Reactor has no control
617 	// over the threading and stack sizes of its users, so this should be addressed
618 	// at a higher level instead.
619 	std::thread thread(acquire, jit);
620 	thread.join();
621 #else
622 	acquire(jit);
623 #endif
624 
625 	return routine;
626 }
627 
Value *Nucleus::allocateStackVariable(Type *type, int arraySize)
629 {
630 	// Need to allocate it in the entry block for mem2reg to work
631 	llvm::BasicBlock &entryBlock = jit->function->getEntryBlock();
632 
633 	llvm::Instruction *declaration;
634 
635 #if LLVM_VERSION_MAJOR >= 11
636 	auto align = jit->module->getDataLayout().getPrefTypeAlign(T(type));
637 #else
638 	auto align = llvm::MaybeAlign(jit->module->getDataLayout().getPrefTypeAlignment(T(type)));
639 #endif
640 
641 	if(arraySize)
642 	{
643 		Value *size = (sizeof(size_t) == 8) ? Nucleus::createConstantLong(arraySize) : Nucleus::createConstantInt(arraySize);
644 		declaration = new llvm::AllocaInst(T(type), 0, V(size), align);
645 	}
646 	else
647 	{
648 		declaration = new llvm::AllocaInst(T(type), 0, (llvm::Value *)nullptr, align);
649 	}
650 
651 	entryBlock.getInstList().push_front(declaration);
652 
653 	return V(declaration);
654 }
655 
BasicBlock *Nucleus::createBasicBlock()
657 {
658 	return B(llvm::BasicBlock::Create(*jit->context, "", jit->function));
659 }
660 
BasicBlock *Nucleus::getInsertBlock()
662 {
663 	return B(jit->builder->GetInsertBlock());
664 }
665 
void Nucleus::setInsertBlock(BasicBlock *basicBlock)
667 {
668 	// assert(jit->builder->GetInsertBlock()->back().isTerminator());
669 
670 	jit->builder->SetInsertPoint(B(basicBlock));
671 }
672 
void Nucleus::createFunction(Type *ReturnType, const std::vector<Type *> &Params)
674 {
675 	jit->function = rr::createFunction("", T(ReturnType), T(Params));
676 
677 #ifdef ENABLE_RR_DEBUG_INFO
678 	jit->debugInfo = std::make_unique<DebugInfo>(jit->builder.get(), jit->context.get(), jit->module.get(), jit->function);
679 #endif  // ENABLE_RR_DEBUG_INFO
680 
681 	jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "", jit->function));
682 }
683 
Value *Nucleus::getArgument(unsigned int index)
685 {
686 	llvm::Function::arg_iterator args = jit->function->arg_begin();
687 
688 	while(index)
689 	{
690 		args++;
691 		index--;
692 	}
693 
694 	return V(&*args);
695 }
696 
void Nucleus::createRetVoid()
698 {
699 	RR_DEBUG_INFO_UPDATE_LOC();
700 
701 	ASSERT_MSG(jit->function->getReturnType() == T(Void::type()), "Return type mismatch");
702 
703 	// Code generated after this point is unreachable, so any variables
704 	// being read can safely return an undefined value. We have to avoid
705 	// materializing variables after the terminator ret instruction.
706 	Variable::killUnmaterialized();
707 
708 	jit->builder->CreateRetVoid();
709 }
710 
void Nucleus::createRet(Value *v)
712 {
713 	RR_DEBUG_INFO_UPDATE_LOC();
714 
715 	ASSERT_MSG(jit->function->getReturnType() == V(v)->getType(), "Return type mismatch");
716 
717 	// Code generated after this point is unreachable, so any variables
718 	// being read can safely return an undefined value. We have to avoid
719 	// materializing variables after the terminator ret instruction.
720 	Variable::killUnmaterialized();
721 
722 	jit->builder->CreateRet(V(v));
723 }
724 
void Nucleus::createBr(BasicBlock *dest)
726 {
727 	RR_DEBUG_INFO_UPDATE_LOC();
728 	Variable::materializeAll();
729 
730 	jit->builder->CreateBr(B(dest));
731 }
732 
void Nucleus::createCondBr(Value *cond, BasicBlock *ifTrue, BasicBlock *ifFalse)
734 {
735 	RR_DEBUG_INFO_UPDATE_LOC();
736 	Variable::materializeAll();
737 	jit->builder->CreateCondBr(V(cond), B(ifTrue), B(ifFalse));
738 }
739 
Value *Nucleus::createAdd(Value *lhs, Value *rhs)
741 {
742 	RR_DEBUG_INFO_UPDATE_LOC();
743 	return V(jit->builder->CreateAdd(V(lhs), V(rhs)));
744 }
745 
Value *Nucleus::createSub(Value *lhs, Value *rhs)
747 {
748 	RR_DEBUG_INFO_UPDATE_LOC();
749 	return V(jit->builder->CreateSub(V(lhs), V(rhs)));
750 }
751 
Value *Nucleus::createMul(Value *lhs, Value *rhs)
753 {
754 	RR_DEBUG_INFO_UPDATE_LOC();
755 	return V(jit->builder->CreateMul(V(lhs), V(rhs)));
756 }
757 
Value *Nucleus::createUDiv(Value *lhs, Value *rhs)
759 {
760 	RR_DEBUG_INFO_UPDATE_LOC();
761 	return V(jit->builder->CreateUDiv(V(lhs), V(rhs)));
762 }
763 
Value *Nucleus::createSDiv(Value *lhs, Value *rhs)
765 {
766 	RR_DEBUG_INFO_UPDATE_LOC();
767 	return V(jit->builder->CreateSDiv(V(lhs), V(rhs)));
768 }
769 
Value *Nucleus::createFAdd(Value *lhs, Value *rhs)
771 {
772 	RR_DEBUG_INFO_UPDATE_LOC();
773 	return V(jit->builder->CreateFAdd(V(lhs), V(rhs)));
774 }
775 
Value *Nucleus::createFSub(Value *lhs, Value *rhs)
777 {
778 	RR_DEBUG_INFO_UPDATE_LOC();
779 	return V(jit->builder->CreateFSub(V(lhs), V(rhs)));
780 }
781 
Value *Nucleus::createFMul(Value *lhs, Value *rhs)
783 {
784 	RR_DEBUG_INFO_UPDATE_LOC();
785 	return V(jit->builder->CreateFMul(V(lhs), V(rhs)));
786 }
787 
Value *Nucleus::createFDiv(Value *lhs, Value *rhs)
789 {
790 	RR_DEBUG_INFO_UPDATE_LOC();
791 	return V(jit->builder->CreateFDiv(V(lhs), V(rhs)));
792 }
793 
Value *Nucleus::createURem(Value *lhs, Value *rhs)
795 {
796 	RR_DEBUG_INFO_UPDATE_LOC();
797 	return V(jit->builder->CreateURem(V(lhs), V(rhs)));
798 }
799 
Value *Nucleus::createSRem(Value *lhs, Value *rhs)
801 {
802 	RR_DEBUG_INFO_UPDATE_LOC();
803 	return V(jit->builder->CreateSRem(V(lhs), V(rhs)));
804 }
805 
Value *Nucleus::createFRem(Value *lhs, Value *rhs)
807 {
808 	RR_DEBUG_INFO_UPDATE_LOC();
809 	return V(jit->builder->CreateFRem(V(lhs), V(rhs)));
810 }
811 
RValue<Float4> operator%(RValue<Float4> lhs, RValue<Float4> rhs)
813 {
814 	return RValue<Float4>(Nucleus::createFRem(lhs.value(), rhs.value()));
815 }
816 
Value *Nucleus::createShl(Value *lhs, Value *rhs)
818 {
819 	RR_DEBUG_INFO_UPDATE_LOC();
820 	return V(jit->builder->CreateShl(V(lhs), V(rhs)));
821 }
822 
Value *Nucleus::createLShr(Value *lhs, Value *rhs)
824 {
825 	RR_DEBUG_INFO_UPDATE_LOC();
826 	return V(jit->builder->CreateLShr(V(lhs), V(rhs)));
827 }
828 
Value *Nucleus::createAShr(Value *lhs, Value *rhs)
830 {
831 	RR_DEBUG_INFO_UPDATE_LOC();
832 	return V(jit->builder->CreateAShr(V(lhs), V(rhs)));
833 }
834 
Value *Nucleus::createAnd(Value *lhs, Value *rhs)
836 {
837 	RR_DEBUG_INFO_UPDATE_LOC();
838 	return V(jit->builder->CreateAnd(V(lhs), V(rhs)));
839 }
840 
Value *Nucleus::createOr(Value *lhs, Value *rhs)
842 {
843 	RR_DEBUG_INFO_UPDATE_LOC();
844 	return V(jit->builder->CreateOr(V(lhs), V(rhs)));
845 }
846 
Value *Nucleus::createXor(Value *lhs, Value *rhs)
848 {
849 	RR_DEBUG_INFO_UPDATE_LOC();
850 	return V(jit->builder->CreateXor(V(lhs), V(rhs)));
851 }
852 
Value *Nucleus::createNeg(Value *v)
854 {
855 	RR_DEBUG_INFO_UPDATE_LOC();
856 	return V(jit->builder->CreateNeg(V(v)));
857 }
858 
Value *Nucleus::createFNeg(Value *v)
860 {
861 	RR_DEBUG_INFO_UPDATE_LOC();
862 	return V(jit->builder->CreateFNeg(V(v)));
863 }
864 
Value *Nucleus::createNot(Value *v)
866 {
867 	RR_DEBUG_INFO_UPDATE_LOC();
868 	return V(jit->builder->CreateNot(V(v)));
869 }
870 
Value *Nucleus::createLoad(Value *ptr, Type *type, bool isVolatile, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
872 {
873 	RR_DEBUG_INFO_UPDATE_LOC();
874 	switch(asInternalType(type))
875 	{
876 		case Type_v2i32:
877 		case Type_v4i16:
878 		case Type_v8i8:
879 		case Type_v2f32:
880 			return createBitCast(
881 			    createInsertElement(
882 			        V(llvm::UndefValue::get(llvm::VectorType::get(T(Long::type()), 2, false))),
883 			        createLoad(createBitCast(ptr, Pointer<Long>::type()), Long::type(), isVolatile, alignment, atomic, memoryOrder),
884 			        0),
885 			    type);
886 		case Type_v2i16:
887 		case Type_v4i8:
888 			if(alignment != 0)  // Not a local variable (all vectors are 128-bit).
889 			{
890 				Value *u = V(llvm::UndefValue::get(llvm::VectorType::get(T(Long::type()), 2, false)));
891 				Value *i = createLoad(createBitCast(ptr, Pointer<Int>::type()), Int::type(), isVolatile, alignment, atomic, memoryOrder);
892 				i = createZExt(i, Long::type());
893 				Value *v = createInsertElement(u, i, 0);
894 				return createBitCast(v, type);
895 			}
896 			// Fallthrough to non-emulated case.
897 		case Type_LLVM:
898 		{
899 			auto elTy = T(type);
900 			ASSERT(V(ptr)->getType()->getContainedType(0) == elTy);
901 
902 			if(!atomic)
903 			{
904 				return V(jit->builder->CreateAlignedLoad(V(ptr), llvm::MaybeAlign(alignment), isVolatile));
905 			}
906 			else if(elTy->isIntegerTy() || elTy->isPointerTy())
907 			{
908 				// Integers and pointers can be atomically loaded by setting
909 				// the ordering constraint on the load instruction.
910 				auto load = jit->builder->CreateAlignedLoad(V(ptr), llvm::MaybeAlign(alignment), isVolatile);
911 				load->setAtomic(atomicOrdering(atomic, memoryOrder));
912 				return V(load);
913 			}
914 			else if(elTy->isFloatTy() || elTy->isDoubleTy())
915 			{
916 				// LLVM claims to support atomic loads of float types as
917 				// above, but certain backends cannot deal with this.
918 				// Load as an integer and bitcast. See b/136037244.
919 				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
920 				auto elAsIntTy = llvm::IntegerType::get(*jit->context, size * 8);
921 				auto ptrCast = jit->builder->CreatePointerCast(V(ptr), elAsIntTy->getPointerTo());
922 				auto load = jit->builder->CreateAlignedLoad(ptrCast, llvm::MaybeAlign(alignment), isVolatile);
923 				load->setAtomic(atomicOrdering(atomic, memoryOrder));
924 				auto loadCast = jit->builder->CreateBitCast(load, elTy);
925 				return V(loadCast);
926 			}
927 			else
928 			{
929 				// More exotic types require falling back to the extern:
930 				// void __atomic_load(size_t size, void *ptr, void *ret, int ordering)
931 				auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
932 				auto intTy = llvm::IntegerType::get(*jit->context, sizeof(int) * 8);
933 				auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
934 				auto i8PtrTy = i8Ty->getPointerTo();
935 				auto voidTy = llvm::Type::getVoidTy(*jit->context);
936 				auto funcTy = llvm::FunctionType::get(voidTy, { sizetTy, i8PtrTy, i8PtrTy, intTy }, false);
937 				auto func = jit->module->getOrInsertFunction("__atomic_load", funcTy);
938 				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
939 				auto out = allocateStackVariable(type);
940 				jit->builder->CreateCall(func, {
941 				                                   llvm::ConstantInt::get(sizetTy, size),
942 				                                   jit->builder->CreatePointerCast(V(ptr), i8PtrTy),
943 				                                   jit->builder->CreatePointerCast(V(out), i8PtrTy),
944 				                                   llvm::ConstantInt::get(intTy, uint64_t(atomicOrdering(true, memoryOrder))),
945 				                               });
946 				return V(jit->builder->CreateLoad(V(out)));
947 			}
948 		}
949 		default:
950 			UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
951 			return nullptr;
952 	}
953 }
954 
Value *Nucleus::createStore(Value *value, Value *ptr, Type *type, bool isVolatile, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
956 {
957 	RR_DEBUG_INFO_UPDATE_LOC();
958 	switch(asInternalType(type))
959 	{
960 		case Type_v2i32:
961 		case Type_v4i16:
962 		case Type_v8i8:
963 		case Type_v2f32:
964 			createStore(
965 			    createExtractElement(
966 			        createBitCast(value, T(llvm::VectorType::get(T(Long::type()), 2, false))), Long::type(), 0),
967 			    createBitCast(ptr, Pointer<Long>::type()),
968 			    Long::type(), isVolatile, alignment, atomic, memoryOrder);
969 			return value;
970 		case Type_v2i16:
971 		case Type_v4i8:
972 			if(alignment != 0)  // Not a local variable (all vectors are 128-bit).
973 			{
974 				createStore(
975 				    createExtractElement(createBitCast(value, Int4::type()), Int::type(), 0),
976 				    createBitCast(ptr, Pointer<Int>::type()),
977 				    Int::type(), isVolatile, alignment, atomic, memoryOrder);
978 				return value;
979 			}
980 			// Fallthrough to non-emulated case.
981 		case Type_LLVM:
982 		{
983 			auto elTy = T(type);
984 			ASSERT(V(ptr)->getType()->getContainedType(0) == elTy);
985 
986 			if(__has_feature(memory_sanitizer) && !REACTOR_ENABLE_MEMORY_SANITIZER_INSTRUMENTATION)
987 			{
988 				// Mark all memory writes as initialized by calling __msan_unpoison
989 				// void __msan_unpoison(const volatile void *a, size_t size)
990 				auto voidTy = llvm::Type::getVoidTy(*jit->context);
991 				auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
992 				auto voidPtrTy = i8Ty->getPointerTo();
993 				auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
994 				auto funcTy = llvm::FunctionType::get(voidTy, { voidPtrTy, sizetTy }, false);
995 				auto func = jit->module->getOrInsertFunction("__msan_unpoison", funcTy);
996 				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
997 
998 				jit->builder->CreateCall(func, { jit->builder->CreatePointerCast(V(ptr), voidPtrTy),
999 				                                 llvm::ConstantInt::get(sizetTy, size) });
1000 			}
1001 
1002 			if(!atomic)
1003 			{
1004 				jit->builder->CreateAlignedStore(V(value), V(ptr), llvm::MaybeAlign(alignment), isVolatile);
1005 			}
1006 			else if(elTy->isIntegerTy() || elTy->isPointerTy())
1007 			{
1008 				// Integers and pointers can be atomically stored by setting
1009 				// the ordering constraint on the store instruction.
1010 				auto store = jit->builder->CreateAlignedStore(V(value), V(ptr), llvm::MaybeAlign(alignment), isVolatile);
1011 				store->setAtomic(atomicOrdering(atomic, memoryOrder));
1012 			}
1013 			else if(elTy->isFloatTy() || elTy->isDoubleTy())
1014 			{
1015 				// LLVM claims to support atomic stores of float types as
1016 				// above, but certain backends cannot deal with this.
				// Store as a bitcast integer. See b/136037244.
1018 				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
1019 				auto elAsIntTy = llvm::IntegerType::get(*jit->context, size * 8);
1020 				auto valCast = jit->builder->CreateBitCast(V(value), elAsIntTy);
1021 				auto ptrCast = jit->builder->CreatePointerCast(V(ptr), elAsIntTy->getPointerTo());
1022 				auto store = jit->builder->CreateAlignedStore(valCast, ptrCast, llvm::MaybeAlign(alignment), isVolatile);
1023 				store->setAtomic(atomicOrdering(atomic, memoryOrder));
1024 			}
1025 			else
1026 			{
1027 				// More exotic types require falling back to the extern:
1028 				// void __atomic_store(size_t size, void *ptr, void *val, int ordering)
1029 				auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
1030 				auto intTy = llvm::IntegerType::get(*jit->context, sizeof(int) * 8);
1031 				auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
1032 				auto i8PtrTy = i8Ty->getPointerTo();
1033 				auto voidTy = llvm::Type::getVoidTy(*jit->context);
1034 				auto funcTy = llvm::FunctionType::get(voidTy, { sizetTy, i8PtrTy, i8PtrTy, intTy }, false);
1035 				auto func = jit->module->getOrInsertFunction("__atomic_store", funcTy);
1036 				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
1037 				auto copy = allocateStackVariable(type);
1038 				jit->builder->CreateStore(V(value), V(copy));
1039 				jit->builder->CreateCall(func, {
1040 				                                   llvm::ConstantInt::get(sizetTy, size),
1041 				                                   jit->builder->CreatePointerCast(V(ptr), i8PtrTy),
1042 				                                   jit->builder->CreatePointerCast(V(copy), i8PtrTy),
1043 				                                   llvm::ConstantInt::get(intTy, uint64_t(atomicOrdering(true, memoryOrder))),
1044 				                               });
1045 			}
1046 
1047 			return value;
1048 		}
1049 		default:
1050 			UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
1051 			return nullptr;
1052 	}
1053 }
1054 
Value *Nucleus::createMaskedLoad(Value *ptr, Type *elTy, Value *mask, unsigned int alignment, bool zeroMaskedLanes)
1056 {
1057 	RR_DEBUG_INFO_UPDATE_LOC();
1058 
1059 	ASSERT(V(ptr)->getType()->isPointerTy());
1060 	ASSERT(V(mask)->getType()->isVectorTy());
1061 
1062 	auto numEls = llvm::cast<llvm::FixedVectorType>(V(mask)->getType())->getNumElements();
1063 	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
1064 	auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
1065 	auto elVecTy = llvm::VectorType::get(T(elTy), numEls, false);
1066 	auto elVecPtrTy = elVecTy->getPointerTo();
1067 	auto i8Mask = jit->builder->CreateIntCast(V(mask), llvm::VectorType::get(i1Ty, numEls, false), false);  // vec<int, int, ...> -> vec<bool, bool, ...>
1068 	auto passthrough = zeroMaskedLanes ? llvm::Constant::getNullValue(elVecTy) : llvm::UndefValue::get(elVecTy);
1069 	auto align = llvm::ConstantInt::get(i32Ty, alignment);
1070 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_load, { elVecTy, elVecPtrTy });
1071 	return V(jit->builder->CreateCall(func, { V(ptr), align, i8Mask, passthrough }));
1072 }
1073 
void Nucleus::createMaskedStore(Value *ptr, Value *val, Value *mask, unsigned int alignment)
1075 {
1076 	RR_DEBUG_INFO_UPDATE_LOC();
1077 
1078 	ASSERT(V(ptr)->getType()->isPointerTy());
1079 	ASSERT(V(val)->getType()->isVectorTy());
1080 	ASSERT(V(mask)->getType()->isVectorTy());
1081 
1082 	auto numEls = llvm::cast<llvm::FixedVectorType>(V(mask)->getType())->getNumElements();
1083 	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
1084 	auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
1085 	auto elVecTy = V(val)->getType();
1086 	auto elVecPtrTy = elVecTy->getPointerTo();
1087 	auto i1Mask = jit->builder->CreateIntCast(V(mask), llvm::VectorType::get(i1Ty, numEls, false), false);  // vec<int, int, ...> -> vec<bool, bool, ...>
1088 	auto align = llvm::ConstantInt::get(i32Ty, alignment);
1089 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_store, { elVecTy, elVecPtrTy });
1090 	jit->builder->CreateCall(func, { V(val), V(ptr), align, i1Mask });
1091 
1092 	if(__has_feature(memory_sanitizer) && !REACTOR_ENABLE_MEMORY_SANITIZER_INSTRUMENTATION)
1093 	{
1094 		// Mark memory writes as initialized by calling __msan_unpoison
1095 		// void __msan_unpoison(const volatile void *a, size_t size)
1096 		auto voidTy = llvm::Type::getVoidTy(*jit->context);
1097 		auto voidPtrTy = voidTy->getPointerTo();
1098 		auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
1099 		auto funcTy = llvm::FunctionType::get(voidTy, { voidPtrTy, sizetTy }, false);
1100 		auto func = jit->module->getOrInsertFunction("__msan_unpoison", funcTy);
1101 		auto size = jit->module->getDataLayout().getTypeStoreSize(llvm::cast<llvm::VectorType>(elVecTy)->getElementType());
1102 
1103 		for(unsigned i = 0; i < numEls; i++)
1104 		{
1105 			// Check mask for this element
1106 			auto idx = llvm::ConstantInt::get(i32Ty, i);
1107 			auto thenBlock = llvm::BasicBlock::Create(*jit->context, "", jit->function);
1108 			auto mergeBlock = llvm::BasicBlock::Create(*jit->context, "", jit->function);
1109 			jit->builder->CreateCondBr(jit->builder->CreateExtractElement(i1Mask, idx), thenBlock, mergeBlock);
1110 			jit->builder->SetInsertPoint(thenBlock);
1111 
1112 			// Insert __msan_unpoison call in conditional block
1113 			auto elPtr = jit->builder->CreateGEP(V(ptr), idx);
1114 			jit->builder->CreateCall(func, { jit->builder->CreatePointerCast(elPtr, voidPtrTy),
1115 			                                 llvm::ConstantInt::get(sizetTy, size) });
1116 
1117 			jit->builder->CreateBr(mergeBlock);
1118 			jit->builder->SetInsertPoint(mergeBlock);
1119 		}
1120 	}
1121 }
1122 
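// Loads one element per enabled mask lane from base + offsets[i] (byte
// offsets), returning the passthrough value (zero or undef) in disabled lanes.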
static llvm::Value *createGather(llvm::Value *base, llvm::Type *elTy, llvm::Value *offsets, llvm::Value *mask, unsigned int alignment, bool zeroMaskedLanes)
1124 {
1125 	ASSERT(base->getType()->isPointerTy());
1126 	ASSERT(offsets->getType()->isVectorTy());
1127 	ASSERT(mask->getType()->isVectorTy());
1128 
1129 	auto numEls = llvm::cast<llvm::FixedVectorType>(mask->getType())->getNumElements();
1130 	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
1131 	auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
1132 	auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
1133 	auto i8PtrTy = i8Ty->getPointerTo();
1134 	auto elPtrTy = elTy->getPointerTo();
1135 	auto elVecTy = llvm::VectorType::get(elTy, numEls, false);
1136 	auto elPtrVecTy = llvm::VectorType::get(elPtrTy, numEls, false);
1137 	auto i8Base = jit->builder->CreatePointerCast(base, i8PtrTy);
1138 	auto i8Ptrs = jit->builder->CreateGEP(i8Base, offsets);
1139 	auto elPtrs = jit->builder->CreatePointerCast(i8Ptrs, elPtrVecTy);
1140 	auto i1Mask = jit->builder->CreateIntCast(mask, llvm::VectorType::get(i1Ty, numEls, false), false);  // vec<int, int, ...> -> vec<bool, bool, ...>
1141 	auto passthrough = zeroMaskedLanes ? llvm::Constant::getNullValue(elVecTy) : llvm::UndefValue::get(elVecTy);
1142 
1143 	if(!__has_feature(memory_sanitizer))
1144 	{
1145 		auto align = llvm::ConstantInt::get(i32Ty, alignment);
1146 		auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_gather, { elVecTy, elPtrVecTy });
1147 		return jit->builder->CreateCall(func, { elPtrs, align, i1Mask, passthrough });
1148 	}
1149 	else  // __has_feature(memory_sanitizer)
1150 	{
1151 		// MemorySanitizer currently does not support instrumenting llvm::Intrinsic::masked_gather
1152 		// Work around it by emulating gather with element-wise loads.
1153 		// TODO(b/172238865): Remove when supported by MemorySanitizer.
1154 
1155 		Value *result = Nucleus::allocateStackVariable(T(elVecTy));
1156 		Nucleus::createStore(V(passthrough), result, T(elVecTy));
1157 
1158 		for(unsigned i = 0; i < numEls; i++)
1159 		{
1160 			// Check mask for this element
1161 			Value *elementMask = Nucleus::createExtractElement(V(i1Mask), T(i1Ty), i);
1162 
1163 			If(RValue<Bool>(elementMask))
1164 			{
1165 				Value *elPtr = Nucleus::createExtractElement(V(elPtrs), T(elPtrTy), i);
1166 				Value *el = Nucleus::createLoad(elPtr, T(elTy), /*isVolatile */ false, alignment, /* atomic */ false, std::memory_order_relaxed);
1167 
1168 				Value *v = Nucleus::createLoad(result, T(elVecTy));
1169 				v = Nucleus::createInsertElement(v, el, i);
1170 				Nucleus::createStore(v, result, T(elVecTy));
1171 			}
1172 		}
1173 
1174 		return V(Nucleus::createLoad(result, T(elVecTy)));
1175 	}
1176 }
1177 
RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
1179 {
1180 	return As<Float4>(V(createGather(V(base.value()), T(Float::type()), V(offsets.value()), V(mask.value()), alignment, zeroMaskedLanes)));
1181 }
1182 
RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
1184 {
1185 	return As<Int4>(V(createGather(V(base.value()), T(Int::type()), V(offsets.value()), V(mask.value()), alignment, zeroMaskedLanes)));
1186 }
1187 
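// Stores one element of 'val' per enabled mask lane to base + offsets[i]
// (byte offsets); disabled lanes are left untouched.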
static void createScatter(llvm::Value *base, llvm::Value *val, llvm::Value *offsets, llvm::Value *mask, unsigned int alignment)
1189 {
1190 	ASSERT(base->getType()->isPointerTy());
1191 	ASSERT(val->getType()->isVectorTy());
1192 	ASSERT(offsets->getType()->isVectorTy());
1193 	ASSERT(mask->getType()->isVectorTy());
1194 
1195 	auto numEls = llvm::cast<llvm::FixedVectorType>(mask->getType())->getNumElements();
1196 	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
1197 	auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
1198 	auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
1199 	auto i8PtrTy = i8Ty->getPointerTo();
1200 	auto elVecTy = val->getType();
1201 	auto elTy = llvm::cast<llvm::VectorType>(elVecTy)->getElementType();
1202 	auto elPtrTy = elTy->getPointerTo();
1203 	auto elPtrVecTy = llvm::VectorType::get(elPtrTy, numEls, false);
1204 
1205 	auto i8Base = jit->builder->CreatePointerCast(base, i8PtrTy);
1206 	auto i8Ptrs = jit->builder->CreateGEP(i8Base, offsets);
1207 	auto elPtrs = jit->builder->CreatePointerCast(i8Ptrs, elPtrVecTy);
1208 	auto i1Mask = jit->builder->CreateIntCast(mask, llvm::VectorType::get(i1Ty, numEls, false), false);  // vec<int, int, ...> -> vec<bool, bool, ...>
1209 
1210 	if(!__has_feature(memory_sanitizer))
1211 	{
1212 		auto align = llvm::ConstantInt::get(i32Ty, alignment);
1213 		auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_scatter, { elVecTy, elPtrVecTy });
1214 		jit->builder->CreateCall(func, { val, elPtrs, align, i1Mask });
1215 	}
1216 	else  // __has_feature(memory_sanitizer)
1217 	{
1218 		// MemorySanitizer currently does not support instrumenting llvm::Intrinsic::masked_scatter
1219 		// Work around it by emulating scatter with element-wise stores.
1220 		// TODO(b/172238865): Remove when supported by MemorySanitizer.
1221 
1222 		for(unsigned i = 0; i < numEls; i++)
1223 		{
1224 			// Check mask for this element
1225 			auto idx = llvm::ConstantInt::get(i32Ty, i);
1226 			auto thenBlock = llvm::BasicBlock::Create(*jit->context, "", jit->function);
1227 			auto mergeBlock = llvm::BasicBlock::Create(*jit->context, "", jit->function);
1228 			jit->builder->CreateCondBr(jit->builder->CreateExtractElement(i1Mask, idx), thenBlock, mergeBlock);
1229 			jit->builder->SetInsertPoint(thenBlock);
1230 
1231 			auto el = jit->builder->CreateExtractElement(val, idx);
1232 			auto elPtr = jit->builder->CreateExtractElement(elPtrs, idx);
1233 			Nucleus::createStore(V(el), V(elPtr), T(elTy), /*isVolatile */ false, alignment, /* atomic */ false, std::memory_order_relaxed);
1234 
1235 			jit->builder->CreateBr(mergeBlock);
1236 			jit->builder->SetInsertPoint(mergeBlock);
1237 		}
1238 	}
1239 }
1240 
void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
1242 {
1243 	return createScatter(V(base.value()), V(val.value()), V(offsets.value()), V(mask.value()), alignment);
1244 }
1245 
void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
1247 {
1248 	return createScatter(V(base.value()), V(val.value()), V(offsets.value()), V(mask.value()), alignment);
1249 }
1250 
void Nucleus::createFence(std::memory_order memoryOrder)
1252 {
1253 	RR_DEBUG_INFO_UPDATE_LOC();
1254 	jit->builder->CreateFence(atomicOrdering(true, memoryOrder));
1255 }
1256 
Value *Nucleus::createGEP(Value *ptr, Type *type, Value *index, bool unsignedIndex)
1258 {
1259 	RR_DEBUG_INFO_UPDATE_LOC();
1260 	ASSERT(V(ptr)->getType()->getContainedType(0) == T(type));
1261 	if(sizeof(void *) == 8)
1262 	{
1263 		// LLVM manual: "When indexing into an array, pointer or vector,
1264 		// integers of any width are allowed, and they are not required to
1265 		// be constant. These integers are treated as signed values where
1266 		// relevant."
1267 		//
1268 		// Thus if we want indexes to be treated as unsigned we have to
1269 		// zero-extend them ourselves.
1270 		//
1271 		// Note that this is not because we want to address anywhere near
1272 		// 4 GB of data. Instead this is important for performance because
1273 		// x86 supports automatic zero-extending of 32-bit registers to
		// 64-bit. Thus indexing into an array using a uint32 is
		// actually faster than using an int32.
1276 		index = unsignedIndex ? createZExt(index, Long::type()) : createSExt(index, Long::type());
1277 	}
1278 
1279 	// For non-emulated types we can rely on LLVM's GEP to calculate the
1280 	// effective address correctly.
1281 	if(asInternalType(type) == Type_LLVM)
1282 	{
1283 		return V(jit->builder->CreateGEP(V(ptr), V(index)));
1284 	}
1285 
	// For emulated types we have to multiply the index by the intended
	// type size ourselves to obtain the byte offset.
1288 	index = (sizeof(void *) == 8) ? createMul(index, createConstantLong((int64_t)typeSize(type))) : createMul(index, createConstantInt((int)typeSize(type)));
1289 
1290 	// Cast to a byte pointer, apply the byte offset, and cast back to the
1291 	// original pointer type.
1292 	return createBitCast(
1293 	    V(jit->builder->CreateGEP(V(createBitCast(ptr, T(llvm::PointerType::get(T(Byte::type()), 0)))), V(index))),
1294 	    T(llvm::PointerType::get(T(type), 0)));
1295 }
1296 
Value *Nucleus::createAtomicAdd(Value *ptr, Value *value, std::memory_order memoryOrder)
1298 {
1299 	RR_DEBUG_INFO_UPDATE_LOC();
1300 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Add, V(ptr), V(value),
1301 #if LLVM_VERSION_MAJOR >= 11
1302 	                                       llvm::MaybeAlign(),
1303 #endif
1304 	                                       atomicOrdering(true, memoryOrder)));
1305 }
1306 
Value *Nucleus::createAtomicSub(Value *ptr, Value *value, std::memory_order memoryOrder)
1308 {
1309 	RR_DEBUG_INFO_UPDATE_LOC();
1310 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Sub, V(ptr), V(value),
1311 #if LLVM_VERSION_MAJOR >= 11
1312 	                                       llvm::MaybeAlign(),
1313 #endif
1314 	                                       atomicOrdering(true, memoryOrder)));
1315 }
1316 
Value *Nucleus::createAtomicAnd(Value *ptr, Value *value, std::memory_order memoryOrder)
1318 {
1319 	RR_DEBUG_INFO_UPDATE_LOC();
1320 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::And, V(ptr), V(value),
1321 #if LLVM_VERSION_MAJOR >= 11
1322 	                                       llvm::MaybeAlign(),
1323 #endif
1324 	                                       atomicOrdering(true, memoryOrder)));
1325 }
1326 
Value *Nucleus::createAtomicOr(Value *ptr, Value *value, std::memory_order memoryOrder)
1328 {
1329 	RR_DEBUG_INFO_UPDATE_LOC();
1330 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Or, V(ptr), V(value),
1331 #if LLVM_VERSION_MAJOR >= 11
1332 	                                       llvm::MaybeAlign(),
1333 #endif
1334 	                                       atomicOrdering(true, memoryOrder)));
1335 }
1336 
Value *Nucleus::createAtomicXor(Value *ptr, Value *value, std::memory_order memoryOrder)
1338 {
1339 	RR_DEBUG_INFO_UPDATE_LOC();
1340 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Xor, V(ptr), V(value),
1341 #if LLVM_VERSION_MAJOR >= 11
1342 	                                       llvm::MaybeAlign(),
1343 #endif
1344 	                                       atomicOrdering(true, memoryOrder)));
1345 }
1346 
Value *Nucleus::createAtomicMin(Value *ptr, Value *value, std::memory_order memoryOrder)
1348 {
1349 	RR_DEBUG_INFO_UPDATE_LOC();
1350 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Min, V(ptr), V(value),
1351 #if LLVM_VERSION_MAJOR >= 11
1352 	                                       llvm::MaybeAlign(),
1353 #endif
1354 	                                       atomicOrdering(true, memoryOrder)));
1355 }
1356 
Value *Nucleus::createAtomicMax(Value *ptr, Value *value, std::memory_order memoryOrder)
1358 {
1359 	RR_DEBUG_INFO_UPDATE_LOC();
1360 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Max, V(ptr), V(value),
1361 #if LLVM_VERSION_MAJOR >= 11
1362 	                                       llvm::MaybeAlign(),
1363 #endif
1364 	                                       atomicOrdering(true, memoryOrder)));
1365 }
1366 
Value *Nucleus::createAtomicUMin(Value *ptr, Value *value, std::memory_order memoryOrder)
1368 {
1369 	RR_DEBUG_INFO_UPDATE_LOC();
1370 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::UMin, V(ptr), V(value),
1371 #if LLVM_VERSION_MAJOR >= 11
1372 	                                       llvm::MaybeAlign(),
1373 #endif
1374 	                                       atomicOrdering(true, memoryOrder)));
1375 }
1376 
Value *Nucleus::createAtomicUMax(Value *ptr, Value *value, std::memory_order memoryOrder)
1378 {
1379 	RR_DEBUG_INFO_UPDATE_LOC();
1380 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::UMax, V(ptr), V(value),
1381 #if LLVM_VERSION_MAJOR >= 11
1382 	                                       llvm::MaybeAlign(),
1383 #endif
1384 	                                       atomicOrdering(true, memoryOrder)));
1385 }
1386 
Value *Nucleus::createAtomicExchange(Value *ptr, Value *value, std::memory_order memoryOrder)
1388 {
1389 	RR_DEBUG_INFO_UPDATE_LOC();
1390 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, V(ptr), V(value),
1391 #if LLVM_VERSION_MAJOR >= 11
1392 	                                       llvm::MaybeAlign(),
1393 #endif
1394 	                                       atomicOrdering(true, memoryOrder)));
1395 }
1396 
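// Compare-and-swap, roughly equivalent to std::atomic_compare_exchange_strong_explicit(): the
// value at 'ptr' is replaced by 'value' only if it currently equals 'compare', and the value that
// was in memory before the operation is returned either way.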
Value *Nucleus::createAtomicCompareExchange(Value *ptr, Value *value, Value *compare, std::memory_order memoryOrderEqual, std::memory_order memoryOrderUnequal)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	// Note: the cmpxchg instruction returns a two-member struct {original value, success flag},
	// so the original value is extracted from element 0 below rather than returned directly.
1401 	return V(jit->builder->CreateExtractValue(
1402 	    jit->builder->CreateAtomicCmpXchg(V(ptr), V(compare), V(value),
1403 #if LLVM_VERSION_MAJOR >= 11
1404 	                                      llvm::MaybeAlign(),
1405 #endif
1406 	                                      atomicOrdering(true, memoryOrderEqual),
1407 	                                      atomicOrdering(true, memoryOrderUnequal)),
1408 	    llvm::ArrayRef<unsigned>(0u)));
1409 }
1410 
createTrunc(Value * v,Type * destType)1411 Value *Nucleus::createTrunc(Value *v, Type *destType)
1412 {
1413 	RR_DEBUG_INFO_UPDATE_LOC();
1414 	return V(jit->builder->CreateTrunc(V(v), T(destType)));
1415 }
1416 
createZExt(Value * v,Type * destType)1417 Value *Nucleus::createZExt(Value *v, Type *destType)
1418 {
1419 	RR_DEBUG_INFO_UPDATE_LOC();
1420 	return V(jit->builder->CreateZExt(V(v), T(destType)));
1421 }
1422 
createSExt(Value * v,Type * destType)1423 Value *Nucleus::createSExt(Value *v, Type *destType)
1424 {
1425 	RR_DEBUG_INFO_UPDATE_LOC();
1426 	return V(jit->builder->CreateSExt(V(v), T(destType)));
1427 }
1428 
createFPToUI(Value * v,Type * destType)1429 Value *Nucleus::createFPToUI(Value *v, Type *destType)
1430 {
1431 	RR_DEBUG_INFO_UPDATE_LOC();
1432 	return V(jit->builder->CreateFPToUI(V(v), T(destType)));
1433 }
1434 
createFPToSI(Value * v,Type * destType)1435 Value *Nucleus::createFPToSI(Value *v, Type *destType)
1436 {
1437 	RR_DEBUG_INFO_UPDATE_LOC();
1438 	return V(jit->builder->CreateFPToSI(V(v), T(destType)));
1439 }
1440 
createSIToFP(Value * v,Type * destType)1441 Value *Nucleus::createSIToFP(Value *v, Type *destType)
1442 {
1443 	RR_DEBUG_INFO_UPDATE_LOC();
1444 	return V(jit->builder->CreateSIToFP(V(v), T(destType)));
1445 }
1446 
createFPTrunc(Value * v,Type * destType)1447 Value *Nucleus::createFPTrunc(Value *v, Type *destType)
1448 {
1449 	RR_DEBUG_INFO_UPDATE_LOC();
1450 	return V(jit->builder->CreateFPTrunc(V(v), T(destType)));
1451 }
1452 
createFPExt(Value * v,Type * destType)1453 Value *Nucleus::createFPExt(Value *v, Type *destType)
1454 {
1455 	RR_DEBUG_INFO_UPDATE_LOC();
1456 	return V(jit->builder->CreateFPExt(V(v), T(destType)));
1457 }
1458 
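// A concrete case of the emulation below (assuming Byte4 is emulated with a wider underlying
// vector type): bitcasting an Int to a Byte4 stores the 32-bit scalar to a stack slot allocated
// with the destination vector type and reloads it as that vector; lanes beyond the scalar's size
// are left undefined, which is acceptable for the emulated narrow types.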
Value *Nucleus::createBitCast(Value *v, Type *destType)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	// Bitcasts must be between types of the same logical size, but with emulated narrow vectors we
	// need to support casting between scalars and wide vectors. Emulate these casts by writing the
	// value to the stack and reading it back as the destination type.
1465 	if(!V(v)->getType()->isVectorTy() && T(destType)->isVectorTy())
1466 	{
1467 		Value *readAddress = allocateStackVariable(destType);
1468 		Value *writeAddress = createBitCast(readAddress, T(llvm::PointerType::get(V(v)->getType(), 0)));
1469 		createStore(v, writeAddress, T(V(v)->getType()));
1470 		return createLoad(readAddress, destType);
1471 	}
1472 	else if(V(v)->getType()->isVectorTy() && !T(destType)->isVectorTy())
1473 	{
1474 		Value *writeAddress = allocateStackVariable(T(V(v)->getType()));
1475 		createStore(v, writeAddress, T(V(v)->getType()));
1476 		Value *readAddress = createBitCast(writeAddress, T(llvm::PointerType::get(T(destType), 0)));
1477 		return createLoad(readAddress, destType);
1478 	}
1479 
1480 	return V(jit->builder->CreateBitCast(V(v), T(destType)));
1481 }
1482 
createICmpEQ(Value * lhs,Value * rhs)1483 Value *Nucleus::createICmpEQ(Value *lhs, Value *rhs)
1484 {
1485 	RR_DEBUG_INFO_UPDATE_LOC();
1486 	return V(jit->builder->CreateICmpEQ(V(lhs), V(rhs)));
1487 }
1488 
createICmpNE(Value * lhs,Value * rhs)1489 Value *Nucleus::createICmpNE(Value *lhs, Value *rhs)
1490 {
1491 	RR_DEBUG_INFO_UPDATE_LOC();
1492 	return V(jit->builder->CreateICmpNE(V(lhs), V(rhs)));
1493 }
1494 
createICmpUGT(Value * lhs,Value * rhs)1495 Value *Nucleus::createICmpUGT(Value *lhs, Value *rhs)
1496 {
1497 	RR_DEBUG_INFO_UPDATE_LOC();
1498 	return V(jit->builder->CreateICmpUGT(V(lhs), V(rhs)));
1499 }
1500 
createICmpUGE(Value * lhs,Value * rhs)1501 Value *Nucleus::createICmpUGE(Value *lhs, Value *rhs)
1502 {
1503 	RR_DEBUG_INFO_UPDATE_LOC();
1504 	return V(jit->builder->CreateICmpUGE(V(lhs), V(rhs)));
1505 }
1506 
createICmpULT(Value * lhs,Value * rhs)1507 Value *Nucleus::createICmpULT(Value *lhs, Value *rhs)
1508 {
1509 	RR_DEBUG_INFO_UPDATE_LOC();
1510 	return V(jit->builder->CreateICmpULT(V(lhs), V(rhs)));
1511 }
1512 
createICmpULE(Value * lhs,Value * rhs)1513 Value *Nucleus::createICmpULE(Value *lhs, Value *rhs)
1514 {
1515 	RR_DEBUG_INFO_UPDATE_LOC();
1516 	return V(jit->builder->CreateICmpULE(V(lhs), V(rhs)));
1517 }
1518 
createICmpSGT(Value * lhs,Value * rhs)1519 Value *Nucleus::createICmpSGT(Value *lhs, Value *rhs)
1520 {
1521 	RR_DEBUG_INFO_UPDATE_LOC();
1522 	return V(jit->builder->CreateICmpSGT(V(lhs), V(rhs)));
1523 }
1524 
createICmpSGE(Value * lhs,Value * rhs)1525 Value *Nucleus::createICmpSGE(Value *lhs, Value *rhs)
1526 {
1527 	RR_DEBUG_INFO_UPDATE_LOC();
1528 	return V(jit->builder->CreateICmpSGE(V(lhs), V(rhs)));
1529 }
1530 
createICmpSLT(Value * lhs,Value * rhs)1531 Value *Nucleus::createICmpSLT(Value *lhs, Value *rhs)
1532 {
1533 	RR_DEBUG_INFO_UPDATE_LOC();
1534 	return V(jit->builder->CreateICmpSLT(V(lhs), V(rhs)));
1535 }
1536 
createICmpSLE(Value * lhs,Value * rhs)1537 Value *Nucleus::createICmpSLE(Value *lhs, Value *rhs)
1538 {
1539 	RR_DEBUG_INFO_UPDATE_LOC();
1540 	return V(jit->builder->CreateICmpSLE(V(lhs), V(rhs)));
1541 }
1542 
createFCmpOEQ(Value * lhs,Value * rhs)1543 Value *Nucleus::createFCmpOEQ(Value *lhs, Value *rhs)
1544 {
1545 	RR_DEBUG_INFO_UPDATE_LOC();
1546 	return V(jit->builder->CreateFCmpOEQ(V(lhs), V(rhs)));
1547 }
1548 
createFCmpOGT(Value * lhs,Value * rhs)1549 Value *Nucleus::createFCmpOGT(Value *lhs, Value *rhs)
1550 {
1551 	RR_DEBUG_INFO_UPDATE_LOC();
1552 	return V(jit->builder->CreateFCmpOGT(V(lhs), V(rhs)));
1553 }
1554 
createFCmpOGE(Value * lhs,Value * rhs)1555 Value *Nucleus::createFCmpOGE(Value *lhs, Value *rhs)
1556 {
1557 	RR_DEBUG_INFO_UPDATE_LOC();
1558 	return V(jit->builder->CreateFCmpOGE(V(lhs), V(rhs)));
1559 }
1560 
createFCmpOLT(Value * lhs,Value * rhs)1561 Value *Nucleus::createFCmpOLT(Value *lhs, Value *rhs)
1562 {
1563 	RR_DEBUG_INFO_UPDATE_LOC();
1564 	return V(jit->builder->CreateFCmpOLT(V(lhs), V(rhs)));
1565 }
1566 
createFCmpOLE(Value * lhs,Value * rhs)1567 Value *Nucleus::createFCmpOLE(Value *lhs, Value *rhs)
1568 {
1569 	RR_DEBUG_INFO_UPDATE_LOC();
1570 	return V(jit->builder->CreateFCmpOLE(V(lhs), V(rhs)));
1571 }
1572 
createFCmpONE(Value * lhs,Value * rhs)1573 Value *Nucleus::createFCmpONE(Value *lhs, Value *rhs)
1574 {
1575 	RR_DEBUG_INFO_UPDATE_LOC();
1576 	return V(jit->builder->CreateFCmpONE(V(lhs), V(rhs)));
1577 }
1578 
createFCmpORD(Value * lhs,Value * rhs)1579 Value *Nucleus::createFCmpORD(Value *lhs, Value *rhs)
1580 {
1581 	RR_DEBUG_INFO_UPDATE_LOC();
1582 	return V(jit->builder->CreateFCmpORD(V(lhs), V(rhs)));
1583 }
1584 
createFCmpUNO(Value * lhs,Value * rhs)1585 Value *Nucleus::createFCmpUNO(Value *lhs, Value *rhs)
1586 {
1587 	RR_DEBUG_INFO_UPDATE_LOC();
1588 	return V(jit->builder->CreateFCmpUNO(V(lhs), V(rhs)));
1589 }
1590 
createFCmpUEQ(Value * lhs,Value * rhs)1591 Value *Nucleus::createFCmpUEQ(Value *lhs, Value *rhs)
1592 {
1593 	RR_DEBUG_INFO_UPDATE_LOC();
1594 	return V(jit->builder->CreateFCmpUEQ(V(lhs), V(rhs)));
1595 }
1596 
createFCmpUGT(Value * lhs,Value * rhs)1597 Value *Nucleus::createFCmpUGT(Value *lhs, Value *rhs)
1598 {
1599 	RR_DEBUG_INFO_UPDATE_LOC();
1600 	return V(jit->builder->CreateFCmpUGT(V(lhs), V(rhs)));
1601 }
1602 
createFCmpUGE(Value * lhs,Value * rhs)1603 Value *Nucleus::createFCmpUGE(Value *lhs, Value *rhs)
1604 {
1605 	RR_DEBUG_INFO_UPDATE_LOC();
1606 	return V(jit->builder->CreateFCmpUGE(V(lhs), V(rhs)));
1607 }
1608 
createFCmpULT(Value * lhs,Value * rhs)1609 Value *Nucleus::createFCmpULT(Value *lhs, Value *rhs)
1610 {
1611 	RR_DEBUG_INFO_UPDATE_LOC();
1612 	return V(jit->builder->CreateFCmpULT(V(lhs), V(rhs)));
1613 }
1614 
createFCmpULE(Value * lhs,Value * rhs)1615 Value *Nucleus::createFCmpULE(Value *lhs, Value *rhs)
1616 {
1617 	RR_DEBUG_INFO_UPDATE_LOC();
1618 	return V(jit->builder->CreateFCmpULE(V(lhs), V(rhs)));
1619 }
1620 
createFCmpUNE(Value * lhs,Value * rhs)1621 Value *Nucleus::createFCmpUNE(Value *lhs, Value *rhs)
1622 {
1623 	RR_DEBUG_INFO_UPDATE_LOC();
1624 	return V(jit->builder->CreateFCmpUNE(V(lhs), V(rhs)));
1625 }
1626 
createExtractElement(Value * vector,Type * type,int index)1627 Value *Nucleus::createExtractElement(Value *vector, Type *type, int index)
1628 {
1629 	RR_DEBUG_INFO_UPDATE_LOC();
1630 	ASSERT(V(vector)->getType()->getContainedType(0) == T(type));
1631 	return V(jit->builder->CreateExtractElement(V(vector), V(createConstantInt(index))));
1632 }
1633 
createInsertElement(Value * vector,Value * element,int index)1634 Value *Nucleus::createInsertElement(Value *vector, Value *element, int index)
1635 {
1636 	RR_DEBUG_INFO_UPDATE_LOC();
1637 	return V(jit->builder->CreateInsertElement(V(vector), V(element), V(createConstantInt(index))));
1638 }
1639 
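// The 'select' indices follow LLVM shufflevector semantics: they index into the concatenation of
// both operands, so for <N x T> inputs, indices 0..N-1 pick from v1 and N..2N-1 pick from v2.
// For example, with 4-element vectors, select = {0, 4, 1, 5} interleaves the low halves of v1 and v2.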
Value *Nucleus::createShuffleVector(Value *v1, Value *v2, const int *select)
1641 {
1642 	RR_DEBUG_INFO_UPDATE_LOC();
1643 
1644 	int size = llvm::cast<llvm::FixedVectorType>(V(v1)->getType())->getNumElements();
1645 	const int maxSize = 16;
1646 	llvm::Constant *swizzle[maxSize];
1647 	ASSERT(size <= maxSize);
1648 
1649 	for(int i = 0; i < size; i++)
1650 	{
1651 		swizzle[i] = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*jit->context), select[i]);
1652 	}
1653 
1654 	llvm::Value *shuffle = llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant *>(swizzle, size));
1655 
1656 	return V(jit->builder->CreateShuffleVector(V(v1), V(v2), shuffle));
1657 }
1658 
createSelect(Value * c,Value * ifTrue,Value * ifFalse)1659 Value *Nucleus::createSelect(Value *c, Value *ifTrue, Value *ifFalse)
1660 {
1661 	RR_DEBUG_INFO_UPDATE_LOC();
1662 	return V(jit->builder->CreateSelect(V(c), V(ifTrue), V(ifFalse)));
1663 }
1664 
createSwitch(Value * control,BasicBlock * defaultBranch,unsigned numCases)1665 SwitchCases *Nucleus::createSwitch(Value *control, BasicBlock *defaultBranch, unsigned numCases)
1666 {
1667 	RR_DEBUG_INFO_UPDATE_LOC();
1668 	return reinterpret_cast<SwitchCases *>(jit->builder->CreateSwitch(V(control), B(defaultBranch), numCases));
1669 }
1670 
addSwitchCase(SwitchCases * switchCases,int label,BasicBlock * branch)1671 void Nucleus::addSwitchCase(SwitchCases *switchCases, int label, BasicBlock *branch)
1672 {
1673 	RR_DEBUG_INFO_UPDATE_LOC();
1674 	llvm::SwitchInst *sw = reinterpret_cast<llvm::SwitchInst *>(switchCases);
1675 	sw->addCase(llvm::ConstantInt::get(llvm::Type::getInt32Ty(*jit->context), label, true), B(branch));
1676 }
1677 
createUnreachable()1678 void Nucleus::createUnreachable()
1679 {
1680 	RR_DEBUG_INFO_UPDATE_LOC();
1681 	jit->builder->CreateUnreachable();
1682 }
1683 
getType(Value * value)1684 Type *Nucleus::getType(Value *value)
1685 {
1686 	return T(V(value)->getType());
1687 }
1688 
getContainedType(Type * vectorType)1689 Type *Nucleus::getContainedType(Type *vectorType)
1690 {
1691 	return T(T(vectorType)->getContainedType(0));
1692 }
1693 
getPointerType(Type * ElementType)1694 Type *Nucleus::getPointerType(Type *ElementType)
1695 {
1696 	return T(llvm::PointerType::get(T(ElementType), 0));
1697 }
1698 
getNaturalIntType()1699 static llvm::Type *getNaturalIntType()
1700 {
1701 	return llvm::Type::getIntNTy(*jit->context, sizeof(int) * 8);
1702 }
1703 
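// Storage types for printf-style arguments mirror C's default argument promotions for variadic
// calls: integers narrower than int are widened to int, and float is widened to double.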
Type *Nucleus::getPrintfStorageType(Type *valueType)
1705 {
1706 	llvm::Type *valueTy = T(valueType);
1707 	if(valueTy->isIntegerTy())
1708 	{
1709 		return T(getNaturalIntType());
1710 	}
1711 	if(valueTy->isFloatTy())
1712 	{
1713 		return T(llvm::Type::getDoubleTy(*jit->context));
1714 	}
1715 
1716 	UNIMPLEMENTED_NO_BUG("getPrintfStorageType: add more cases as needed");
1717 	return {};
1718 }
1719 
createNullValue(Type * Ty)1720 Value *Nucleus::createNullValue(Type *Ty)
1721 {
1722 	RR_DEBUG_INFO_UPDATE_LOC();
1723 	return V(llvm::Constant::getNullValue(T(Ty)));
1724 }
1725 
createConstantLong(int64_t i)1726 Value *Nucleus::createConstantLong(int64_t i)
1727 {
1728 	RR_DEBUG_INFO_UPDATE_LOC();
1729 	return V(llvm::ConstantInt::get(llvm::Type::getInt64Ty(*jit->context), i, true));
1730 }
1731 
createConstantInt(int i)1732 Value *Nucleus::createConstantInt(int i)
1733 {
1734 	RR_DEBUG_INFO_UPDATE_LOC();
1735 	return V(llvm::ConstantInt::get(llvm::Type::getInt32Ty(*jit->context), i, true));
1736 }
1737 
createConstantInt(unsigned int i)1738 Value *Nucleus::createConstantInt(unsigned int i)
1739 {
1740 	RR_DEBUG_INFO_UPDATE_LOC();
1741 	return V(llvm::ConstantInt::get(llvm::Type::getInt32Ty(*jit->context), i, false));
1742 }
1743 
createConstantBool(bool b)1744 Value *Nucleus::createConstantBool(bool b)
1745 {
1746 	RR_DEBUG_INFO_UPDATE_LOC();
1747 	return V(llvm::ConstantInt::get(llvm::Type::getInt1Ty(*jit->context), b));
1748 }
1749 
createConstantByte(signed char i)1750 Value *Nucleus::createConstantByte(signed char i)
1751 {
1752 	RR_DEBUG_INFO_UPDATE_LOC();
1753 	return V(llvm::ConstantInt::get(llvm::Type::getInt8Ty(*jit->context), i, true));
1754 }
1755 
createConstantByte(unsigned char i)1756 Value *Nucleus::createConstantByte(unsigned char i)
1757 {
1758 	RR_DEBUG_INFO_UPDATE_LOC();
1759 	return V(llvm::ConstantInt::get(llvm::Type::getInt8Ty(*jit->context), i, false));
1760 }
1761 
createConstantShort(short i)1762 Value *Nucleus::createConstantShort(short i)
1763 {
1764 	RR_DEBUG_INFO_UPDATE_LOC();
1765 	return V(llvm::ConstantInt::get(llvm::Type::getInt16Ty(*jit->context), i, true));
1766 }
1767 
createConstantShort(unsigned short i)1768 Value *Nucleus::createConstantShort(unsigned short i)
1769 {
1770 	RR_DEBUG_INFO_UPDATE_LOC();
1771 	return V(llvm::ConstantInt::get(llvm::Type::getInt16Ty(*jit->context), i, false));
1772 }
1773 
createConstantFloat(float x)1774 Value *Nucleus::createConstantFloat(float x)
1775 {
1776 	RR_DEBUG_INFO_UPDATE_LOC();
1777 	return V(llvm::ConstantFP::get(T(Float::type()), x));
1778 }
1779 
createNullPointer(Type * Ty)1780 Value *Nucleus::createNullPointer(Type *Ty)
1781 {
1782 	RR_DEBUG_INFO_UPDATE_LOC();
1783 	return V(llvm::ConstantPointerNull::get(llvm::PointerType::get(T(Ty), 0)));
1784 }
1785 
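// When a Reactor type is emulated with a wider underlying vector (numConstants < numElements),
// the provided constants are repeated to fill the remaining lanes. For example, a 4-element
// constant paired with an 8-lane underlying vector yields {c0, c1, c2, c3, c0, c1, c2, c3}.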
Value *Nucleus::createConstantVector(const int64_t *constants, Type *type)
1787 {
1788 	RR_DEBUG_INFO_UPDATE_LOC();
1789 	ASSERT(llvm::isa<llvm::VectorType>(T(type)));
1790 	const int numConstants = elementCount(type);                                           // Number of provided constants for the (emulated) type.
1791 	const int numElements = llvm::cast<llvm::FixedVectorType>(T(type))->getNumElements();  // Number of elements of the underlying vector type.
1792 	ASSERT(numElements <= 16 && numConstants <= numElements);
1793 	llvm::Constant *constantVector[16];
1794 
1795 	for(int i = 0; i < numElements; i++)
1796 	{
1797 		constantVector[i] = llvm::ConstantInt::get(T(type)->getContainedType(0), constants[i % numConstants]);
1798 	}
1799 
1800 	return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant *>(constantVector, numElements)));
1801 }
1802 
createConstantVector(const double * constants,Type * type)1803 Value *Nucleus::createConstantVector(const double *constants, Type *type)
1804 {
1805 	RR_DEBUG_INFO_UPDATE_LOC();
1806 	ASSERT(llvm::isa<llvm::VectorType>(T(type)));
1807 	const int numConstants = elementCount(type);                                           // Number of provided constants for the (emulated) type.
1808 	const int numElements = llvm::cast<llvm::FixedVectorType>(T(type))->getNumElements();  // Number of elements of the underlying vector type.
1809 	ASSERT(numElements <= 8 && numConstants <= numElements);
1810 	llvm::Constant *constantVector[8];
1811 
1812 	for(int i = 0; i < numElements; i++)
1813 	{
1814 		constantVector[i] = llvm::ConstantFP::get(T(type)->getContainedType(0), constants[i % numConstants]);
1815 	}
1816 
1817 	return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant *>(constantVector, numElements)));
1818 }
1819 
createConstantString(const char * v)1820 Value *Nucleus::createConstantString(const char *v)
1821 {
1822 	// NOTE: Do not call RR_DEBUG_INFO_UPDATE_LOC() here to avoid recursion when called from rr::Printv
1823 	auto ptr = jit->builder->CreateGlobalStringPtr(v);
1824 	return V(ptr);
1825 }
1826 
setOptimizerCallback(OptimizerCallback * callback)1827 void Nucleus::setOptimizerCallback(OptimizerCallback *callback)
1828 {
1829 	// The LLVM backend does not produce optimizer reports.
1830 	(void)callback;
1831 }
1832 
type()1833 Type *Void::type()
1834 {
1835 	return T(llvm::Type::getVoidTy(*jit->context));
1836 }
1837 
type()1838 Type *Bool::type()
1839 {
1840 	return T(llvm::Type::getInt1Ty(*jit->context));
1841 }
1842 
type()1843 Type *Byte::type()
1844 {
1845 	return T(llvm::Type::getInt8Ty(*jit->context));
1846 }
1847 
type()1848 Type *SByte::type()
1849 {
1850 	return T(llvm::Type::getInt8Ty(*jit->context));
1851 }
1852 
type()1853 Type *Short::type()
1854 {
1855 	return T(llvm::Type::getInt16Ty(*jit->context));
1856 }
1857 
type()1858 Type *UShort::type()
1859 {
1860 	return T(llvm::Type::getInt16Ty(*jit->context));
1861 }
1862 
type()1863 Type *Byte4::type()
1864 {
1865 	return T(Type_v4i8);
1866 }
1867 
type()1868 Type *SByte4::type()
1869 {
1870 	return T(Type_v4i8);
1871 }
1872 
AddSat(RValue<Byte8> x,RValue<Byte8> y)1873 RValue<Byte8> AddSat(RValue<Byte8> x, RValue<Byte8> y)
1874 {
1875 	RR_DEBUG_INFO_UPDATE_LOC();
1876 #if defined(__i386__) || defined(__x86_64__)
1877 	return x86::paddusb(x, y);
1878 #else
1879 	return As<Byte8>(V(lowerPUADDSAT(V(x.value()), V(y.value()))));
1880 #endif
1881 }
1882 
SubSat(RValue<Byte8> x,RValue<Byte8> y)1883 RValue<Byte8> SubSat(RValue<Byte8> x, RValue<Byte8> y)
1884 {
1885 	RR_DEBUG_INFO_UPDATE_LOC();
1886 #if defined(__i386__) || defined(__x86_64__)
1887 	return x86::psubusb(x, y);
1888 #else
1889 	return As<Byte8>(V(lowerPUSUBSAT(V(x.value()), V(y.value()))));
1890 #endif
1891 }
1892 
SignMask(RValue<Byte8> x)1893 RValue<Int> SignMask(RValue<Byte8> x)
1894 {
1895 	RR_DEBUG_INFO_UPDATE_LOC();
1896 #if defined(__i386__) || defined(__x86_64__)
1897 	return x86::pmovmskb(x);
1898 #else
1899 	return As<Int>(V(lowerSignMask(V(x.value()), T(Int::type()))));
1900 #endif
1901 }
1902 
1903 //	RValue<Byte8> CmpGT(RValue<Byte8> x, RValue<Byte8> y)
1904 //	{
1905 //#if defined(__i386__) || defined(__x86_64__)
1906 //		return x86::pcmpgtb(x, y);   // FIXME: Signedness
1907 //#else
1908 //		return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Byte8::type()))));
1909 //#endif
1910 //	}
1911 
CmpEQ(RValue<Byte8> x,RValue<Byte8> y)1912 RValue<Byte8> CmpEQ(RValue<Byte8> x, RValue<Byte8> y)
1913 {
1914 	RR_DEBUG_INFO_UPDATE_LOC();
1915 #if defined(__i386__) || defined(__x86_64__)
1916 	return x86::pcmpeqb(x, y);
1917 #else
1918 	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Byte8::type()))));
1919 #endif
1920 }
1921 
type()1922 Type *Byte8::type()
1923 {
1924 	return T(Type_v8i8);
1925 }
1926 
AddSat(RValue<SByte8> x,RValue<SByte8> y)1927 RValue<SByte8> AddSat(RValue<SByte8> x, RValue<SByte8> y)
1928 {
1929 	RR_DEBUG_INFO_UPDATE_LOC();
1930 #if defined(__i386__) || defined(__x86_64__)
1931 	return x86::paddsb(x, y);
1932 #else
1933 	return As<SByte8>(V(lowerPSADDSAT(V(x.value()), V(y.value()))));
1934 #endif
1935 }
1936 
SubSat(RValue<SByte8> x,RValue<SByte8> y)1937 RValue<SByte8> SubSat(RValue<SByte8> x, RValue<SByte8> y)
1938 {
1939 	RR_DEBUG_INFO_UPDATE_LOC();
1940 #if defined(__i386__) || defined(__x86_64__)
1941 	return x86::psubsb(x, y);
1942 #else
1943 	return As<SByte8>(V(lowerPSSUBSAT(V(x.value()), V(y.value()))));
1944 #endif
1945 }
1946 
SignMask(RValue<SByte8> x)1947 RValue<Int> SignMask(RValue<SByte8> x)
1948 {
1949 	RR_DEBUG_INFO_UPDATE_LOC();
1950 #if defined(__i386__) || defined(__x86_64__)
1951 	return x86::pmovmskb(As<Byte8>(x));
1952 #else
1953 	return As<Int>(V(lowerSignMask(V(x.value()), T(Int::type()))));
1954 #endif
1955 }
1956 
CmpGT(RValue<SByte8> x,RValue<SByte8> y)1957 RValue<Byte8> CmpGT(RValue<SByte8> x, RValue<SByte8> y)
1958 {
1959 	RR_DEBUG_INFO_UPDATE_LOC();
1960 #if defined(__i386__) || defined(__x86_64__)
1961 	return x86::pcmpgtb(x, y);
1962 #else
1963 	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Byte8::type()))));
1964 #endif
1965 }
1966 
CmpEQ(RValue<SByte8> x,RValue<SByte8> y)1967 RValue<Byte8> CmpEQ(RValue<SByte8> x, RValue<SByte8> y)
1968 {
1969 	RR_DEBUG_INFO_UPDATE_LOC();
1970 #if defined(__i386__) || defined(__x86_64__)
1971 	return x86::pcmpeqb(As<Byte8>(x), As<Byte8>(y));
1972 #else
1973 	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Byte8::type()))));
1974 #endif
1975 }
1976 
type()1977 Type *SByte8::type()
1978 {
1979 	return T(Type_v8i8);
1980 }
1981 
type()1982 Type *Byte16::type()
1983 {
1984 	return T(llvm::VectorType::get(T(Byte::type()), 16, false));
1985 }
1986 
type()1987 Type *SByte16::type()
1988 {
1989 	return T(llvm::VectorType::get(T(SByte::type()), 16, false));
1990 }
1991 
type()1992 Type *Short2::type()
1993 {
1994 	return T(Type_v2i16);
1995 }
1996 
type()1997 Type *UShort2::type()
1998 {
1999 	return T(Type_v2i16);
2000 }
2001 
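// Narrowing conversion from Int4: the vector is reinterpreted as Short8 and the even-indexed
// 16-bit lanes are gathered, which on little-endian targets are the low halves of the 32-bit
// lanes. Each element is thus truncated modulo 2^16, with no saturation.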
Short4::Short4(RValue<Int4> cast)
2003 {
2004 	RR_DEBUG_INFO_UPDATE_LOC();
2005 	int select[8] = { 0, 2, 4, 6, 0, 2, 4, 6 };
2006 	Value *short8 = Nucleus::createBitCast(cast.value(), Short8::type());
2007 
2008 	Value *packed = Nucleus::createShuffleVector(short8, short8, select);
2009 	Value *short4 = As<Short4>(Int2(As<Int4>(packed))).value();
2010 
2011 	storeValue(short4);
2012 }
2013 
2014 //	Short4::Short4(RValue<Float> cast)
2015 //	{
2016 //	}
2017 
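// Conversion from Float4 goes through Int4 and then narrows to 16 bits with signed saturation
// (packssdw on x86, lowerPack() otherwise), rather than plain truncation.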
Short4::Short4(RValue<Float4> cast)
2019 {
2020 	RR_DEBUG_INFO_UPDATE_LOC();
2021 	Int4 v4i32 = Int4(cast);
2022 #if defined(__i386__) || defined(__x86_64__)
2023 	v4i32 = As<Int4>(x86::packssdw(v4i32, v4i32));
2024 #else
2025 	Value *v = v4i32.loadValue();
2026 	v4i32 = As<Int4>(V(lowerPack(V(v), V(v), true)));
2027 #endif
2028 
2029 	storeValue(As<Short4>(Int2(v4i32)).value());
2030 }
2031 
operator <<(RValue<Short4> lhs,unsigned char rhs)2032 RValue<Short4> operator<<(RValue<Short4> lhs, unsigned char rhs)
2033 {
2034 	RR_DEBUG_INFO_UPDATE_LOC();
2035 #if defined(__i386__) || defined(__x86_64__)
2036 	//	return RValue<Short4>(Nucleus::createShl(lhs.value(), rhs.value()));
2037 
2038 	return x86::psllw(lhs, rhs);
2039 #else
2040 	return As<Short4>(V(lowerVectorShl(V(lhs.value()), rhs)));
2041 #endif
2042 }
2043 
operator >>(RValue<Short4> lhs,unsigned char rhs)2044 RValue<Short4> operator>>(RValue<Short4> lhs, unsigned char rhs)
2045 {
2046 	RR_DEBUG_INFO_UPDATE_LOC();
2047 #if defined(__i386__) || defined(__x86_64__)
2048 	return x86::psraw(lhs, rhs);
2049 #else
2050 	return As<Short4>(V(lowerVectorAShr(V(lhs.value()), rhs)));
2051 #endif
2052 }
2053 
Max(RValue<Short4> x,RValue<Short4> y)2054 RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y)
2055 {
2056 	RR_DEBUG_INFO_UPDATE_LOC();
2057 #if defined(__i386__) || defined(__x86_64__)
2058 	return x86::pmaxsw(x, y);
2059 #else
2060 	return RValue<Short4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SGT)));
2061 #endif
2062 }
2063 
Min(RValue<Short4> x,RValue<Short4> y)2064 RValue<Short4> Min(RValue<Short4> x, RValue<Short4> y)
2065 {
2066 	RR_DEBUG_INFO_UPDATE_LOC();
2067 #if defined(__i386__) || defined(__x86_64__)
2068 	return x86::pminsw(x, y);
2069 #else
2070 	return RValue<Short4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SLT)));
2071 #endif
2072 }
2073 
AddSat(RValue<Short4> x,RValue<Short4> y)2074 RValue<Short4> AddSat(RValue<Short4> x, RValue<Short4> y)
2075 {
2076 	RR_DEBUG_INFO_UPDATE_LOC();
2077 #if defined(__i386__) || defined(__x86_64__)
2078 	return x86::paddsw(x, y);
2079 #else
2080 	return As<Short4>(V(lowerPSADDSAT(V(x.value()), V(y.value()))));
2081 #endif
2082 }
2083 
SubSat(RValue<Short4> x,RValue<Short4> y)2084 RValue<Short4> SubSat(RValue<Short4> x, RValue<Short4> y)
2085 {
2086 	RR_DEBUG_INFO_UPDATE_LOC();
2087 #if defined(__i386__) || defined(__x86_64__)
2088 	return x86::psubsw(x, y);
2089 #else
2090 	return As<Short4>(V(lowerPSSUBSAT(V(x.value()), V(y.value()))));
2091 #endif
2092 }
2093 
MulHigh(RValue<Short4> x,RValue<Short4> y)2094 RValue<Short4> MulHigh(RValue<Short4> x, RValue<Short4> y)
2095 {
2096 	RR_DEBUG_INFO_UPDATE_LOC();
2097 #if defined(__i386__) || defined(__x86_64__)
2098 	return x86::pmulhw(x, y);
2099 #else
2100 	return As<Short4>(V(lowerMulHigh(V(x.value()), V(y.value()), true)));
2101 #endif
2102 }
2103 
MulAdd(RValue<Short4> x,RValue<Short4> y)2104 RValue<Int2> MulAdd(RValue<Short4> x, RValue<Short4> y)
2105 {
2106 	RR_DEBUG_INFO_UPDATE_LOC();
2107 #if defined(__i386__) || defined(__x86_64__)
2108 	return x86::pmaddwd(x, y);
2109 #else
2110 	return As<Int2>(V(lowerMulAdd(V(x.value()), V(y.value()))));
2111 #endif
2112 }
2113 
PackSigned(RValue<Short4> x,RValue<Short4> y)2114 RValue<SByte8> PackSigned(RValue<Short4> x, RValue<Short4> y)
2115 {
2116 	RR_DEBUG_INFO_UPDATE_LOC();
2117 #if defined(__i386__) || defined(__x86_64__)
2118 	auto result = x86::packsswb(x, y);
2119 #else
2120 	auto result = V(lowerPack(V(x.value()), V(y.value()), true));
2121 #endif
2122 	return As<SByte8>(Swizzle(As<Int4>(result), 0x0202));
2123 }
2124 
PackUnsigned(RValue<Short4> x,RValue<Short4> y)2125 RValue<Byte8> PackUnsigned(RValue<Short4> x, RValue<Short4> y)
2126 {
2127 	RR_DEBUG_INFO_UPDATE_LOC();
2128 #if defined(__i386__) || defined(__x86_64__)
2129 	auto result = x86::packuswb(x, y);
2130 #else
2131 	auto result = V(lowerPack(V(x.value()), V(y.value()), false));
2132 #endif
2133 	return As<Byte8>(Swizzle(As<Int4>(result), 0x0202));
2134 }
2135 
CmpGT(RValue<Short4> x,RValue<Short4> y)2136 RValue<Short4> CmpGT(RValue<Short4> x, RValue<Short4> y)
2137 {
2138 	RR_DEBUG_INFO_UPDATE_LOC();
2139 #if defined(__i386__) || defined(__x86_64__)
2140 	return x86::pcmpgtw(x, y);
2141 #else
2142 	return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Short4::type()))));
2143 #endif
2144 }
2145 
CmpEQ(RValue<Short4> x,RValue<Short4> y)2146 RValue<Short4> CmpEQ(RValue<Short4> x, RValue<Short4> y)
2147 {
2148 	RR_DEBUG_INFO_UPDATE_LOC();
2149 #if defined(__i386__) || defined(__x86_64__)
2150 	return x86::pcmpeqw(x, y);
2151 #else
2152 	return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Short4::type()))));
2153 #endif
2154 }
2155 
type()2156 Type *Short4::type()
2157 {
2158 	return T(Type_v4i16);
2159 }
2160 
UShort4(RValue<Float4> cast,bool saturate)2161 UShort4::UShort4(RValue<Float4> cast, bool saturate)
2162 {
2163 	RR_DEBUG_INFO_UPDATE_LOC();
2164 	if(saturate)
2165 	{
2166 #if defined(__i386__) || defined(__x86_64__)
2167 		if(CPUID::supportsSSE4_1())
2168 		{
2169 			Int4 int4(Min(cast, Float4(0xFFFF)));  // packusdw takes care of 0x0000 saturation
2170 			*this = As<Short4>(PackUnsigned(int4, int4));
2171 		}
2172 		else
2173 #endif
2174 		{
2175 			*this = Short4(Int4(Max(Min(cast, Float4(0xFFFF)), Float4(0x0000))));
2176 		}
2177 	}
2178 	else
2179 	{
2180 		*this = Short4(Int4(cast));
2181 	}
2182 }
2183 
operator <<(RValue<UShort4> lhs,unsigned char rhs)2184 RValue<UShort4> operator<<(RValue<UShort4> lhs, unsigned char rhs)
2185 {
2186 	RR_DEBUG_INFO_UPDATE_LOC();
2187 #if defined(__i386__) || defined(__x86_64__)
2188 	//	return RValue<Short4>(Nucleus::createShl(lhs.value(), rhs.value()));
2189 
2190 	return As<UShort4>(x86::psllw(As<Short4>(lhs), rhs));
2191 #else
2192 	return As<UShort4>(V(lowerVectorShl(V(lhs.value()), rhs)));
2193 #endif
2194 }
2195 
operator >>(RValue<UShort4> lhs,unsigned char rhs)2196 RValue<UShort4> operator>>(RValue<UShort4> lhs, unsigned char rhs)
2197 {
2198 	RR_DEBUG_INFO_UPDATE_LOC();
2199 #if defined(__i386__) || defined(__x86_64__)
2200 	//	return RValue<Short4>(Nucleus::createLShr(lhs.value(), rhs.value()));
2201 
2202 	return x86::psrlw(lhs, rhs);
2203 #else
2204 	return As<UShort4>(V(lowerVectorLShr(V(lhs.value()), rhs)));
2205 #endif
2206 }
2207 
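// Baseline SSE2 has no unsigned 16-bit min/max (pminuw/pmaxuw require SSE4.1), so Max() and Min()
// below bias both operands into signed range and reuse the signed versions: subtracting 0x8000
// maps unsigned order onto signed order, giving max_u(x, y) == max_s(x - 0x8000, y - 0x8000) + 0x8000,
// and likewise for min.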
RValue<UShort4> Max(RValue<UShort4> x, RValue<UShort4> y)
2209 {
2210 	RR_DEBUG_INFO_UPDATE_LOC();
2211 	return RValue<UShort4>(Max(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
2212 }
2213 
RValue<UShort4> Min(RValue<UShort4> x, RValue<UShort4> y)
2215 {
2216 	RR_DEBUG_INFO_UPDATE_LOC();
2217 	return RValue<UShort4>(Min(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
2218 }
2219 
AddSat(RValue<UShort4> x,RValue<UShort4> y)2220 RValue<UShort4> AddSat(RValue<UShort4> x, RValue<UShort4> y)
2221 {
2222 	RR_DEBUG_INFO_UPDATE_LOC();
2223 #if defined(__i386__) || defined(__x86_64__)
2224 	return x86::paddusw(x, y);
2225 #else
2226 	return As<UShort4>(V(lowerPUADDSAT(V(x.value()), V(y.value()))));
2227 #endif
2228 }
2229 
SubSat(RValue<UShort4> x,RValue<UShort4> y)2230 RValue<UShort4> SubSat(RValue<UShort4> x, RValue<UShort4> y)
2231 {
2232 	RR_DEBUG_INFO_UPDATE_LOC();
2233 #if defined(__i386__) || defined(__x86_64__)
2234 	return x86::psubusw(x, y);
2235 #else
2236 	return As<UShort4>(V(lowerPUSUBSAT(V(x.value()), V(y.value()))));
2237 #endif
2238 }
2239 
MulHigh(RValue<UShort4> x,RValue<UShort4> y)2240 RValue<UShort4> MulHigh(RValue<UShort4> x, RValue<UShort4> y)
2241 {
2242 	RR_DEBUG_INFO_UPDATE_LOC();
2243 #if defined(__i386__) || defined(__x86_64__)
2244 	return x86::pmulhuw(x, y);
2245 #else
2246 	return As<UShort4>(V(lowerMulHigh(V(x.value()), V(y.value()), false)));
2247 #endif
2248 }
2249 
Average(RValue<UShort4> x,RValue<UShort4> y)2250 RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)
2251 {
2252 	RR_DEBUG_INFO_UPDATE_LOC();
2253 #if defined(__i386__) || defined(__x86_64__)
2254 	return x86::pavgw(x, y);
2255 #else
2256 	return As<UShort4>(V(lowerPAVG(V(x.value()), V(y.value()))));
2257 #endif
2258 }
2259 
type()2260 Type *UShort4::type()
2261 {
2262 	return T(Type_v4i16);
2263 }
2264 
operator <<(RValue<Short8> lhs,unsigned char rhs)2265 RValue<Short8> operator<<(RValue<Short8> lhs, unsigned char rhs)
2266 {
2267 	RR_DEBUG_INFO_UPDATE_LOC();
2268 #if defined(__i386__) || defined(__x86_64__)
2269 	return x86::psllw(lhs, rhs);
2270 #else
2271 	return As<Short8>(V(lowerVectorShl(V(lhs.value()), rhs)));
2272 #endif
2273 }
2274 
operator >>(RValue<Short8> lhs,unsigned char rhs)2275 RValue<Short8> operator>>(RValue<Short8> lhs, unsigned char rhs)
2276 {
2277 	RR_DEBUG_INFO_UPDATE_LOC();
2278 #if defined(__i386__) || defined(__x86_64__)
2279 	return x86::psraw(lhs, rhs);
2280 #else
2281 	return As<Short8>(V(lowerVectorAShr(V(lhs.value()), rhs)));
2282 #endif
2283 }
2284 
MulAdd(RValue<Short8> x,RValue<Short8> y)2285 RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y)
2286 {
2287 	RR_DEBUG_INFO_UPDATE_LOC();
2288 #if defined(__i386__) || defined(__x86_64__)
2289 	return x86::pmaddwd(x, y);
2290 #else
2291 	return As<Int4>(V(lowerMulAdd(V(x.value()), V(y.value()))));
2292 #endif
2293 }
2294 
MulHigh(RValue<Short8> x,RValue<Short8> y)2295 RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y)
2296 {
2297 	RR_DEBUG_INFO_UPDATE_LOC();
2298 #if defined(__i386__) || defined(__x86_64__)
2299 	return x86::pmulhw(x, y);
2300 #else
2301 	return As<Short8>(V(lowerMulHigh(V(x.value()), V(y.value()), true)));
2302 #endif
2303 }
2304 
type()2305 Type *Short8::type()
2306 {
2307 	return T(llvm::VectorType::get(T(Short::type()), 8, false));
2308 }
2309 
operator <<(RValue<UShort8> lhs,unsigned char rhs)2310 RValue<UShort8> operator<<(RValue<UShort8> lhs, unsigned char rhs)
2311 {
2312 	RR_DEBUG_INFO_UPDATE_LOC();
2313 #if defined(__i386__) || defined(__x86_64__)
2314 	return As<UShort8>(x86::psllw(As<Short8>(lhs), rhs));
2315 #else
2316 	return As<UShort8>(V(lowerVectorShl(V(lhs.value()), rhs)));
2317 #endif
2318 }
2319 
operator >>(RValue<UShort8> lhs,unsigned char rhs)2320 RValue<UShort8> operator>>(RValue<UShort8> lhs, unsigned char rhs)
2321 {
2322 	RR_DEBUG_INFO_UPDATE_LOC();
2323 #if defined(__i386__) || defined(__x86_64__)
2324 	return x86::psrlw(lhs, rhs);  // FIXME: Fallback required
2325 #else
2326 	return As<UShort8>(V(lowerVectorLShr(V(lhs.value()), rhs)));
2327 #endif
2328 }
2329 
MulHigh(RValue<UShort8> x,RValue<UShort8> y)2330 RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)
2331 {
2332 	RR_DEBUG_INFO_UPDATE_LOC();
2333 #if defined(__i386__) || defined(__x86_64__)
2334 	return x86::pmulhuw(x, y);
2335 #else
2336 	return As<UShort8>(V(lowerMulHigh(V(x.value()), V(y.value()), false)));
2337 #endif
2338 }
2339 
type()2340 Type *UShort8::type()
2341 {
2342 	return T(llvm::VectorType::get(T(UShort::type()), 8, false));
2343 }
2344 
operator ++(Int & val,int)2345 RValue<Int> operator++(Int &val, int)  // Post-increment
2346 {
2347 	RR_DEBUG_INFO_UPDATE_LOC();
2348 	RValue<Int> res = val;
2349 
2350 	Value *inc = Nucleus::createAdd(res.value(), Nucleus::createConstantInt(1));
2351 	val.storeValue(inc);
2352 
2353 	return res;
2354 }
2355 
operator ++(Int & val)2356 const Int &operator++(Int &val)  // Pre-increment
2357 {
2358 	RR_DEBUG_INFO_UPDATE_LOC();
2359 	Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantInt(1));
2360 	val.storeValue(inc);
2361 
2362 	return val;
2363 }
2364 
operator --(Int & val,int)2365 RValue<Int> operator--(Int &val, int)  // Post-decrement
2366 {
2367 	RR_DEBUG_INFO_UPDATE_LOC();
2368 	RValue<Int> res = val;
2369 
2370 	Value *inc = Nucleus::createSub(res.value(), Nucleus::createConstantInt(1));
2371 	val.storeValue(inc);
2372 
2373 	return res;
2374 }
2375 
operator --(Int & val)2376 const Int &operator--(Int &val)  // Pre-decrement
2377 {
2378 	RR_DEBUG_INFO_UPDATE_LOC();
2379 	Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantInt(1));
2380 	val.storeValue(inc);
2381 
2382 	return val;
2383 }
2384 
RoundInt(RValue<Float> cast)2385 RValue<Int> RoundInt(RValue<Float> cast)
2386 {
2387 	RR_DEBUG_INFO_UPDATE_LOC();
2388 #if defined(__i386__) || defined(__x86_64__)
2389 	return x86::cvtss2si(cast);
2390 #else
2391 	return RValue<Int>(V(lowerRoundInt(V(cast.value()), T(Int::type()))));
2392 #endif
2393 }
2394 
type()2395 Type *Int::type()
2396 {
2397 	return T(llvm::Type::getInt32Ty(*jit->context));
2398 }
2399 
type()2400 Type *Long::type()
2401 {
2402 	return T(llvm::Type::getInt64Ty(*jit->context));
2403 }
2404 
UInt(RValue<Float> cast)2405 UInt::UInt(RValue<Float> cast)
2406 {
2407 	RR_DEBUG_INFO_UPDATE_LOC();
2408 	Value *integer = Nucleus::createFPToUI(cast.value(), UInt::type());
2409 	storeValue(integer);
2410 }
2411 
operator ++(UInt & val,int)2412 RValue<UInt> operator++(UInt &val, int)  // Post-increment
2413 {
2414 	RR_DEBUG_INFO_UPDATE_LOC();
2415 	RValue<UInt> res = val;
2416 
2417 	Value *inc = Nucleus::createAdd(res.value(), Nucleus::createConstantInt(1));
2418 	val.storeValue(inc);
2419 
2420 	return res;
2421 }
2422 
operator ++(UInt & val)2423 const UInt &operator++(UInt &val)  // Pre-increment
2424 {
2425 	RR_DEBUG_INFO_UPDATE_LOC();
2426 	Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantInt(1));
2427 	val.storeValue(inc);
2428 
2429 	return val;
2430 }
2431 
operator --(UInt & val,int)2432 RValue<UInt> operator--(UInt &val, int)  // Post-decrement
2433 {
2434 	RR_DEBUG_INFO_UPDATE_LOC();
2435 	RValue<UInt> res = val;
2436 
2437 	Value *inc = Nucleus::createSub(res.value(), Nucleus::createConstantInt(1));
2438 	val.storeValue(inc);
2439 
2440 	return res;
2441 }
2442 
operator --(UInt & val)2443 const UInt &operator--(UInt &val)  // Pre-decrement
2444 {
2445 	RR_DEBUG_INFO_UPDATE_LOC();
2446 	Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantInt(1));
2447 	val.storeValue(inc);
2448 
2449 	return val;
2450 }
2451 
2452 //	RValue<UInt> RoundUInt(RValue<Float> cast)
2453 //	{
2454 //#if defined(__i386__) || defined(__x86_64__)
2455 //		return x86::cvtss2si(val);   // FIXME: Unsigned
2456 //#else
2457 //		return IfThenElse(cast > 0.0f, Int(cast + 0.5f), Int(cast - 0.5f));
2458 //#endif
2459 //	}
2460 
type()2461 Type *UInt::type()
2462 {
2463 	return T(llvm::Type::getInt32Ty(*jit->context));
2464 }
2465 
2466 //	Int2::Int2(RValue<Int> cast)
2467 //	{
2468 //		Value *extend = Nucleus::createZExt(cast.value(), Long::type());
2469 //		Value *vector = Nucleus::createBitCast(extend, Int2::type());
2470 //
2471 //		int shuffle[2] = {0, 0};
2472 //		Value *replicate = Nucleus::createShuffleVector(vector, vector, shuffle);
2473 //
2474 //		storeValue(replicate);
2475 //	}
2476 
operator <<(RValue<Int2> lhs,unsigned char rhs)2477 RValue<Int2> operator<<(RValue<Int2> lhs, unsigned char rhs)
2478 {
2479 	RR_DEBUG_INFO_UPDATE_LOC();
2480 #if defined(__i386__) || defined(__x86_64__)
2481 	//	return RValue<Int2>(Nucleus::createShl(lhs.value(), rhs.value()));
2482 
2483 	return x86::pslld(lhs, rhs);
2484 #else
2485 	return As<Int2>(V(lowerVectorShl(V(lhs.value()), rhs)));
2486 #endif
2487 }
2488 
operator >>(RValue<Int2> lhs,unsigned char rhs)2489 RValue<Int2> operator>>(RValue<Int2> lhs, unsigned char rhs)
2490 {
2491 	RR_DEBUG_INFO_UPDATE_LOC();
2492 #if defined(__i386__) || defined(__x86_64__)
2493 	//	return RValue<Int2>(Nucleus::createAShr(lhs.value(), rhs.value()));
2494 
2495 	return x86::psrad(lhs, rhs);
2496 #else
2497 	return As<Int2>(V(lowerVectorAShr(V(lhs.value()), rhs)));
2498 #endif
2499 }
2500 
type()2501 Type *Int2::type()
2502 {
2503 	return T(Type_v2i32);
2504 }
2505 
operator <<(RValue<UInt2> lhs,unsigned char rhs)2506 RValue<UInt2> operator<<(RValue<UInt2> lhs, unsigned char rhs)
2507 {
2508 	RR_DEBUG_INFO_UPDATE_LOC();
2509 #if defined(__i386__) || defined(__x86_64__)
2510 	//	return RValue<UInt2>(Nucleus::createShl(lhs.value(), rhs.value()));
2511 
2512 	return As<UInt2>(x86::pslld(As<Int2>(lhs), rhs));
2513 #else
2514 	return As<UInt2>(V(lowerVectorShl(V(lhs.value()), rhs)));
2515 #endif
2516 }
2517 
operator >>(RValue<UInt2> lhs,unsigned char rhs)2518 RValue<UInt2> operator>>(RValue<UInt2> lhs, unsigned char rhs)
2519 {
2520 	RR_DEBUG_INFO_UPDATE_LOC();
2521 #if defined(__i386__) || defined(__x86_64__)
2522 	//	return RValue<UInt2>(Nucleus::createLShr(lhs.value(), rhs.value()));
2523 
2524 	return x86::psrld(lhs, rhs);
2525 #else
2526 	return As<UInt2>(V(lowerVectorLShr(V(lhs.value()), rhs)));
2527 #endif
2528 }
2529 
type()2530 Type *UInt2::type()
2531 {
2532 	return T(Type_v2i32);
2533 }
2534 
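// Zero-extension of four bytes to four ints. Without SSE4.1's pmovzxbd, the bytes are widened by
// interleaving with zero twice: first bytes with a zero Byte16 (yielding zero-extended shorts),
// then shorts with a zero Short8 (yielding zero-extended ints), via shuffles equivalent to
// punpcklbw/punpcklwd against zero.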
Int4::Int4(RValue<Byte4> cast)
2536     : XYZW(this)
2537 {
2538 	RR_DEBUG_INFO_UPDATE_LOC();
2539 #if defined(__i386__) || defined(__x86_64__)
2540 	if(CPUID::supportsSSE4_1())
2541 	{
2542 		*this = x86::pmovzxbd(As<Byte16>(cast));
2543 	}
2544 	else
2545 #endif
2546 	{
2547 		int swizzle[16] = { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 };
2548 		Value *a = Nucleus::createBitCast(cast.value(), Byte16::type());
2549 		Value *b = Nucleus::createShuffleVector(a, Nucleus::createNullValue(Byte16::type()), swizzle);
2550 
2551 		int swizzle2[8] = { 0, 8, 1, 9, 2, 10, 3, 11 };
2552 		Value *c = Nucleus::createBitCast(b, Short8::type());
2553 		Value *d = Nucleus::createShuffleVector(c, Nucleus::createNullValue(Short8::type()), swizzle2);
2554 
2555 		*this = As<Int4>(d);
2556 	}
2557 }
2558 
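// Sign-extension of four signed bytes to four ints. Without SSE4.1's pmovsxbd, each byte is
// replicated into every byte of its 32-bit lane by two duplicating shuffles, after which an
// arithmetic shift right by 24 leaves the sign-extended value of that byte in each lane.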
Int4::Int4(RValue<SByte4> cast)
2560     : XYZW(this)
2561 {
2562 	RR_DEBUG_INFO_UPDATE_LOC();
2563 #if defined(__i386__) || defined(__x86_64__)
2564 	if(CPUID::supportsSSE4_1())
2565 	{
2566 		*this = x86::pmovsxbd(As<SByte16>(cast));
2567 	}
2568 	else
2569 #endif
2570 	{
2571 		int swizzle[16] = { 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7 };
2572 		Value *a = Nucleus::createBitCast(cast.value(), Byte16::type());
2573 		Value *b = Nucleus::createShuffleVector(a, a, swizzle);
2574 
2575 		int swizzle2[8] = { 0, 0, 1, 1, 2, 2, 3, 3 };
2576 		Value *c = Nucleus::createBitCast(b, Short8::type());
2577 		Value *d = Nucleus::createShuffleVector(c, c, swizzle2);
2578 
2579 		*this = As<Int4>(d) >> 24;
2580 	}
2581 }
2582 
Int4(RValue<Short4> cast)2583 Int4::Int4(RValue<Short4> cast)
2584     : XYZW(this)
2585 {
2586 	RR_DEBUG_INFO_UPDATE_LOC();
2587 #if defined(__i386__) || defined(__x86_64__)
2588 	if(CPUID::supportsSSE4_1())
2589 	{
2590 		*this = x86::pmovsxwd(As<Short8>(cast));
2591 	}
2592 	else
2593 #endif
2594 	{
2595 		int swizzle[8] = { 0, 0, 1, 1, 2, 2, 3, 3 };
2596 		Value *c = Nucleus::createShuffleVector(cast.value(), cast.value(), swizzle);
2597 		*this = As<Int4>(c) >> 16;
2598 	}
2599 }
2600 
Int4(RValue<UShort4> cast)2601 Int4::Int4(RValue<UShort4> cast)
2602     : XYZW(this)
2603 {
2604 	RR_DEBUG_INFO_UPDATE_LOC();
2605 #if defined(__i386__) || defined(__x86_64__)
2606 	if(CPUID::supportsSSE4_1())
2607 	{
2608 		*this = x86::pmovzxwd(As<UShort8>(cast));
2609 	}
2610 	else
2611 #endif
2612 	{
2613 		int swizzle[8] = { 0, 8, 1, 9, 2, 10, 3, 11 };
2614 		Value *c = Nucleus::createShuffleVector(cast.value(), Short8(0, 0, 0, 0, 0, 0, 0, 0).loadValue(), swizzle);
2615 		*this = As<Int4>(c);
2616 	}
2617 }
2618 
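// Scalar broadcast: the Int is inserted into lane 0 and a {0, 0, 0, 0} shuffle splats it across
// all four lanes. UInt4 and Float4 use the same pattern below.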
Int4::Int4(RValue<Int> rhs)
2620     : XYZW(this)
2621 {
2622 	RR_DEBUG_INFO_UPDATE_LOC();
2623 	Value *vector = loadValue();
2624 	Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);
2625 
2626 	int swizzle[4] = { 0, 0, 0, 0 };
2627 	Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
2628 
2629 	storeValue(replicate);
2630 }
2631 
operator <<(RValue<Int4> lhs,unsigned char rhs)2632 RValue<Int4> operator<<(RValue<Int4> lhs, unsigned char rhs)
2633 {
2634 	RR_DEBUG_INFO_UPDATE_LOC();
2635 #if defined(__i386__) || defined(__x86_64__)
2636 	return x86::pslld(lhs, rhs);
2637 #else
2638 	return As<Int4>(V(lowerVectorShl(V(lhs.value()), rhs)));
2639 #endif
2640 }
2641 
operator >>(RValue<Int4> lhs,unsigned char rhs)2642 RValue<Int4> operator>>(RValue<Int4> lhs, unsigned char rhs)
2643 {
2644 	RR_DEBUG_INFO_UPDATE_LOC();
2645 #if defined(__i386__) || defined(__x86_64__)
2646 	return x86::psrad(lhs, rhs);
2647 #else
2648 	return As<Int4>(V(lowerVectorAShr(V(lhs.value()), rhs)));
2649 #endif
2650 }
2651 
CmpEQ(RValue<Int4> x,RValue<Int4> y)2652 RValue<Int4> CmpEQ(RValue<Int4> x, RValue<Int4> y)
2653 {
2654 	RR_DEBUG_INFO_UPDATE_LOC();
2655 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value(), y.value()), Int4::type()));
2656 }
2657 
CmpLT(RValue<Int4> x,RValue<Int4> y)2658 RValue<Int4> CmpLT(RValue<Int4> x, RValue<Int4> y)
2659 {
2660 	RR_DEBUG_INFO_UPDATE_LOC();
2661 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLT(x.value(), y.value()), Int4::type()));
2662 }
2663 
CmpLE(RValue<Int4> x,RValue<Int4> y)2664 RValue<Int4> CmpLE(RValue<Int4> x, RValue<Int4> y)
2665 {
2666 	RR_DEBUG_INFO_UPDATE_LOC();
2667 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLE(x.value(), y.value()), Int4::type()));
2668 }
2669 
CmpNEQ(RValue<Int4> x,RValue<Int4> y)2670 RValue<Int4> CmpNEQ(RValue<Int4> x, RValue<Int4> y)
2671 {
2672 	RR_DEBUG_INFO_UPDATE_LOC();
2673 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value(), y.value()), Int4::type()));
2674 }
2675 
CmpNLT(RValue<Int4> x,RValue<Int4> y)2676 RValue<Int4> CmpNLT(RValue<Int4> x, RValue<Int4> y)
2677 {
2678 	RR_DEBUG_INFO_UPDATE_LOC();
2679 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGE(x.value(), y.value()), Int4::type()));
2680 }
2681 
CmpNLE(RValue<Int4> x,RValue<Int4> y)2682 RValue<Int4> CmpNLE(RValue<Int4> x, RValue<Int4> y)
2683 {
2684 	RR_DEBUG_INFO_UPDATE_LOC();
2685 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGT(x.value(), y.value()), Int4::type()));
2686 }
2687 
Max(RValue<Int4> x,RValue<Int4> y)2688 RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y)
2689 {
2690 	RR_DEBUG_INFO_UPDATE_LOC();
2691 #if defined(__i386__) || defined(__x86_64__)
2692 	if(CPUID::supportsSSE4_1())
2693 	{
2694 		return x86::pmaxsd(x, y);
2695 	}
2696 	else
2697 #endif
2698 	{
2699 		RValue<Int4> greater = CmpNLE(x, y);
2700 		return (x & greater) | (y & ~greater);
2701 	}
2702 }
2703 
Min(RValue<Int4> x,RValue<Int4> y)2704 RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y)
2705 {
2706 	RR_DEBUG_INFO_UPDATE_LOC();
2707 #if defined(__i386__) || defined(__x86_64__)
2708 	if(CPUID::supportsSSE4_1())
2709 	{
2710 		return x86::pminsd(x, y);
2711 	}
2712 	else
2713 #endif
2714 	{
2715 		RValue<Int4> less = CmpLT(x, y);
2716 		return (x & less) | (y & ~less);
2717 	}
2718 }
2719 
RoundInt(RValue<Float4> cast)2720 RValue<Int4> RoundInt(RValue<Float4> cast)
2721 {
2722 	RR_DEBUG_INFO_UPDATE_LOC();
2723 #if defined(__i386__) || defined(__x86_64__)
2724 	return x86::cvtps2dq(cast);
2725 #else
2726 	return As<Int4>(V(lowerRoundInt(V(cast.value()), T(Int4::type()))));
2727 #endif
2728 }
2729 
RoundIntClamped(RValue<Float4> cast)2730 RValue<Int4> RoundIntClamped(RValue<Float4> cast)
2731 {
2732 	RR_DEBUG_INFO_UPDATE_LOC();
2733 #if defined(__i386__) || defined(__x86_64__)
2734 	// cvtps2dq produces 0x80000000, a negative value, for input larger than
2735 	// 2147483520.0, so clamp to 2147483520. Values less than -2147483520.0
2736 	// saturate to 0x80000000.
2737 	return x86::cvtps2dq(Min(cast, Float4(0x7FFFFF80)));
2738 #else
2739 	// ARM saturates to the largest positive or negative integer. Unit tests
2740 	// verify that lowerRoundInt() behaves as desired.
2741 	return As<Int4>(V(lowerRoundInt(V(cast.value()), T(Int4::type()))));
2742 #endif
2743 }
2744 
MulHigh(RValue<Int4> x,RValue<Int4> y)2745 RValue<Int4> MulHigh(RValue<Int4> x, RValue<Int4> y)
2746 {
2747 	RR_DEBUG_INFO_UPDATE_LOC();
2748 	// TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
2749 	return As<Int4>(V(lowerMulHigh(V(x.value()), V(y.value()), true)));
2750 }
2751 
MulHigh(RValue<UInt4> x,RValue<UInt4> y)2752 RValue<UInt4> MulHigh(RValue<UInt4> x, RValue<UInt4> y)
2753 {
2754 	RR_DEBUG_INFO_UPDATE_LOC();
2755 	// TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
2756 	return As<UInt4>(V(lowerMulHigh(V(x.value()), V(y.value()), false)));
2757 }
2758 
PackSigned(RValue<Int4> x,RValue<Int4> y)2759 RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y)
2760 {
2761 	RR_DEBUG_INFO_UPDATE_LOC();
2762 #if defined(__i386__) || defined(__x86_64__)
2763 	return x86::packssdw(x, y);
2764 #else
2765 	return As<Short8>(V(lowerPack(V(x.value()), V(y.value()), true)));
2766 #endif
2767 }
2768 
PackUnsigned(RValue<Int4> x,RValue<Int4> y)2769 RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y)
2770 {
2771 	RR_DEBUG_INFO_UPDATE_LOC();
2772 #if defined(__i386__) || defined(__x86_64__)
2773 	return x86::packusdw(x, y);
2774 #else
2775 	return As<UShort8>(V(lowerPack(V(x.value()), V(y.value()), false)));
2776 #endif
2777 }
2778 
SignMask(RValue<Int4> x)2779 RValue<Int> SignMask(RValue<Int4> x)
2780 {
2781 	RR_DEBUG_INFO_UPDATE_LOC();
2782 #if defined(__i386__) || defined(__x86_64__)
2783 	return x86::movmskps(As<Float4>(x));
2784 #else
2785 	return As<Int>(V(lowerSignMask(V(x.value()), T(Int::type()))));
2786 #endif
2787 }
2788 
type()2789 Type *Int4::type()
2790 {
2791 	return T(llvm::VectorType::get(T(Int::type()), 4, false));
2792 }
2793 
UInt4(RValue<Float4> cast)2794 UInt4::UInt4(RValue<Float4> cast)
2795     : XYZW(this)
2796 {
2797 	RR_DEBUG_INFO_UPDATE_LOC();
2798 	Value *xyzw = Nucleus::createFPToUI(cast.value(), UInt4::type());
2799 	storeValue(xyzw);
2800 }
2801 
UInt4(RValue<UInt> rhs)2802 UInt4::UInt4(RValue<UInt> rhs)
2803     : XYZW(this)
2804 {
2805 	RR_DEBUG_INFO_UPDATE_LOC();
2806 	Value *vector = loadValue();
2807 	Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);
2808 
2809 	int swizzle[4] = { 0, 0, 0, 0 };
2810 	Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
2811 
2812 	storeValue(replicate);
2813 }
2814 
operator <<(RValue<UInt4> lhs,unsigned char rhs)2815 RValue<UInt4> operator<<(RValue<UInt4> lhs, unsigned char rhs)
2816 {
2817 	RR_DEBUG_INFO_UPDATE_LOC();
2818 #if defined(__i386__) || defined(__x86_64__)
2819 	return As<UInt4>(x86::pslld(As<Int4>(lhs), rhs));
2820 #else
2821 	return As<UInt4>(V(lowerVectorShl(V(lhs.value()), rhs)));
2822 #endif
2823 }
2824 
operator >>(RValue<UInt4> lhs,unsigned char rhs)2825 RValue<UInt4> operator>>(RValue<UInt4> lhs, unsigned char rhs)
2826 {
2827 	RR_DEBUG_INFO_UPDATE_LOC();
2828 #if defined(__i386__) || defined(__x86_64__)
2829 	return x86::psrld(lhs, rhs);
2830 #else
2831 	return As<UInt4>(V(lowerVectorLShr(V(lhs.value()), rhs)));
2832 #endif
2833 }
2834 
CmpEQ(RValue<UInt4> x,RValue<UInt4> y)2835 RValue<UInt4> CmpEQ(RValue<UInt4> x, RValue<UInt4> y)
2836 {
2837 	RR_DEBUG_INFO_UPDATE_LOC();
2838 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value(), y.value()), Int4::type()));
2839 }
2840 
CmpLT(RValue<UInt4> x,RValue<UInt4> y)2841 RValue<UInt4> CmpLT(RValue<UInt4> x, RValue<UInt4> y)
2842 {
2843 	RR_DEBUG_INFO_UPDATE_LOC();
2844 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULT(x.value(), y.value()), Int4::type()));
2845 }
2846 
CmpLE(RValue<UInt4> x,RValue<UInt4> y)2847 RValue<UInt4> CmpLE(RValue<UInt4> x, RValue<UInt4> y)
2848 {
2849 	RR_DEBUG_INFO_UPDATE_LOC();
2850 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULE(x.value(), y.value()), Int4::type()));
2851 }
2852 
CmpNEQ(RValue<UInt4> x,RValue<UInt4> y)2853 RValue<UInt4> CmpNEQ(RValue<UInt4> x, RValue<UInt4> y)
2854 {
2855 	RR_DEBUG_INFO_UPDATE_LOC();
2856 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value(), y.value()), Int4::type()));
2857 }
2858 
CmpNLT(RValue<UInt4> x,RValue<UInt4> y)2859 RValue<UInt4> CmpNLT(RValue<UInt4> x, RValue<UInt4> y)
2860 {
2861 	RR_DEBUG_INFO_UPDATE_LOC();
2862 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGE(x.value(), y.value()), Int4::type()));
2863 }
2864 
CmpNLE(RValue<UInt4> x,RValue<UInt4> y)2865 RValue<UInt4> CmpNLE(RValue<UInt4> x, RValue<UInt4> y)
2866 {
2867 	RR_DEBUG_INFO_UPDATE_LOC();
2868 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGT(x.value(), y.value()), Int4::type()));
2869 }
2870 
Max(RValue<UInt4> x,RValue<UInt4> y)2871 RValue<UInt4> Max(RValue<UInt4> x, RValue<UInt4> y)
2872 {
2873 	RR_DEBUG_INFO_UPDATE_LOC();
2874 #if defined(__i386__) || defined(__x86_64__)
2875 	if(CPUID::supportsSSE4_1())
2876 	{
2877 		return x86::pmaxud(x, y);
2878 	}
2879 	else
2880 #endif
2881 	{
2882 		RValue<UInt4> greater = CmpNLE(x, y);
2883 		return (x & greater) | (y & ~greater);
2884 	}
2885 }
2886 
Min(RValue<UInt4> x,RValue<UInt4> y)2887 RValue<UInt4> Min(RValue<UInt4> x, RValue<UInt4> y)
2888 {
2889 	RR_DEBUG_INFO_UPDATE_LOC();
2890 #if defined(__i386__) || defined(__x86_64__)
2891 	if(CPUID::supportsSSE4_1())
2892 	{
2893 		return x86::pminud(x, y);
2894 	}
2895 	else
2896 #endif
2897 	{
2898 		RValue<UInt4> less = CmpLT(x, y);
2899 		return (x & less) | (y & ~less);
2900 	}
2901 }
2902 
type()2903 Type *UInt4::type()
2904 {
2905 	return T(llvm::VectorType::get(T(UInt::type()), 4, false));
2906 }
2907 
type()2908 Type *Half::type()
2909 {
2910 	return T(llvm::Type::getInt16Ty(*jit->context));
2911 }
2912 
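// The exactAtPow2 paths below rely on rcpss/rcpps having a relative error that depends only on the
// input mantissa: the error at every power of two then equals the error at 1.0, so multiplying by
// 1.0f / rcp(1.0f) (evaluated on the host while building the routine) cancels it exactly for
// power-of-two inputs.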
RValue<Float> Rcp_pp(RValue<Float> x, bool exactAtPow2)
2914 {
2915 	RR_DEBUG_INFO_UPDATE_LOC();
2916 #if defined(__i386__) || defined(__x86_64__)
2917 	if(exactAtPow2)
2918 	{
2919 		// rcpss uses a piecewise-linear approximation which minimizes the relative error
2920 		// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
2921 		return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
2922 	}
2923 	return x86::rcpss(x);
2924 #else
2925 	return As<Float>(V(lowerRCP(V(x.value()))));
2926 #endif
2927 }
2928 
RcpSqrt_pp(RValue<Float> x)2929 RValue<Float> RcpSqrt_pp(RValue<Float> x)
2930 {
2931 	RR_DEBUG_INFO_UPDATE_LOC();
2932 #if defined(__i386__) || defined(__x86_64__)
2933 	return x86::rsqrtss(x);
2934 #else
2935 	return As<Float>(V(lowerRSQRT(V(x.value()))));
2936 #endif
2937 }
2938 
HasRcpApprox()2939 bool HasRcpApprox()
2940 {
2941 #if defined(__i386__) || defined(__x86_64__)
2942 	return true;
2943 #else
2944 	return false;
2945 #endif
2946 }
2947 
RcpApprox(RValue<Float4> x,bool exactAtPow2)2948 RValue<Float4> RcpApprox(RValue<Float4> x, bool exactAtPow2)
2949 {
2950 #if defined(__i386__) || defined(__x86_64__)
2951 	if(exactAtPow2)
2952 	{
2953 		// rcpps uses a piecewise-linear approximation which minimizes the relative error
2954 		// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
2955 		return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
2956 	}
2957 	return x86::rcpps(x);
2958 #else
2959 	UNREACHABLE("RValue<Float4> RcpApprox() not available on this platform");
2960 	return { 0.0f };
2961 #endif
2962 }
2963 
RcpApprox(RValue<Float> x,bool exactAtPow2)2964 RValue<Float> RcpApprox(RValue<Float> x, bool exactAtPow2)
2965 {
2966 #if defined(__i386__) || defined(__x86_64__)
2967 	if(exactAtPow2)
2968 	{
2969 		// rcpss uses a piecewise-linear approximation which minimizes the relative error
2970 		// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
2971 		return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
2972 	}
2973 	return x86::rcpss(x);
2974 #else
	UNREACHABLE("RValue<Float> RcpApprox() not available on this platform");
2976 	return { 0.0f };
2977 #endif
2978 }
2979 
HasRcpSqrtApprox()2980 bool HasRcpSqrtApprox()
2981 {
2982 #if defined(__i386__) || defined(__x86_64__)
2983 	return true;
2984 #else
2985 	return false;
2986 #endif
2987 }
2988 
RcpSqrtApprox(RValue<Float4> x)2989 RValue<Float4> RcpSqrtApprox(RValue<Float4> x)
2990 {
2991 #if defined(__i386__) || defined(__x86_64__)
2992 	return x86::rsqrtps(x);
2993 #else
2994 	UNREACHABLE("RValue<Float4> RcpSqrtApprox() not available on this platform");
2995 	return { 0.0f };
2996 #endif
2997 }
2998 
RcpSqrtApprox(RValue<Float> x)2999 RValue<Float> RcpSqrtApprox(RValue<Float> x)
3000 {
3001 #if defined(__i386__) || defined(__x86_64__)
3002 	return x86::rsqrtss(x);
3003 #else
	UNREACHABLE("RValue<Float> RcpSqrtApprox() not available on this platform");
3005 	return { 0.0f };
3006 #endif
3007 }
3008 
Sqrt(RValue<Float> x)3009 RValue<Float> Sqrt(RValue<Float> x)
3010 {
3011 	RR_DEBUG_INFO_UPDATE_LOC();
3012 #if defined(__i386__) || defined(__x86_64__)
3013 	return x86::sqrtss(x);
3014 #else
3015 	return As<Float>(V(lowerSQRT(V(x.value()))));
3016 #endif
3017 }
3018 
Round(RValue<Float> x)3019 RValue<Float> Round(RValue<Float> x)
3020 {
3021 	RR_DEBUG_INFO_UPDATE_LOC();
3022 #if defined(__i386__) || defined(__x86_64__)
3023 	if(CPUID::supportsSSE4_1())
3024 	{
3025 		return x86::roundss(x, 0);
3026 	}
3027 	else
3028 	{
3029 		return Float4(Round(Float4(x))).x;
3030 	}
3031 #else
3032 	return RValue<Float>(V(lowerRound(V(x.value()))));
3033 #endif
3034 }
3035 
Trunc(RValue<Float> x)3036 RValue<Float> Trunc(RValue<Float> x)
3037 {
3038 	RR_DEBUG_INFO_UPDATE_LOC();
3039 #if defined(__i386__) || defined(__x86_64__)
3040 	if(CPUID::supportsSSE4_1())
3041 	{
3042 		return x86::roundss(x, 3);
3043 	}
3044 	else
3045 	{
3046 		return Float(Int(x));  // Rounded toward zero
3047 	}
3048 #else
3049 	return RValue<Float>(V(lowerTrunc(V(x.value()))));
3050 #endif
3051 }
3052 
RValue<Float> Frac(RValue<Float> x)
3054 {
3055 	RR_DEBUG_INFO_UPDATE_LOC();
3056 #if defined(__i386__) || defined(__x86_64__)
3057 	if(CPUID::supportsSSE4_1())
3058 	{
3059 		return x - x86::floorss(x);
3060 	}
3061 	else
3062 	{
3063 		return Float4(Frac(Float4(x))).x;
3064 	}
3065 #else
3066 	// x - floor(x) can be 1.0 for very small negative x.
3067 	// Clamp against the value just below 1.0.
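	// 0x3F7FFFFF is the bit pattern of 0.99999994f, the largest float strictly below 1.0.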
3068 	return Min(x - Floor(x), As<Float>(Int(0x3F7FFFFF)));
3069 #endif
3070 }
3071 
Floor(RValue<Float> x)3072 RValue<Float> Floor(RValue<Float> x)
3073 {
3074 	RR_DEBUG_INFO_UPDATE_LOC();
3075 #if defined(__i386__) || defined(__x86_64__)
3076 	if(CPUID::supportsSSE4_1())
3077 	{
3078 		return x86::floorss(x);
3079 	}
3080 	else
3081 	{
3082 		return Float4(Floor(Float4(x))).x;
3083 	}
3084 #else
3085 	return RValue<Float>(V(lowerFloor(V(x.value()))));
3086 #endif
3087 }
3088 
Ceil(RValue<Float> x)3089 RValue<Float> Ceil(RValue<Float> x)
3090 {
3091 	RR_DEBUG_INFO_UPDATE_LOC();
3092 #if defined(__i386__) || defined(__x86_64__)
3093 	if(CPUID::supportsSSE4_1())
3094 	{
3095 		return x86::ceilss(x);
3096 	}
3097 	else
3098 #endif
3099 	{
3100 		return Float4(Ceil(Float4(x))).x;
3101 	}
3102 }
3103 
type()3104 Type *Float::type()
3105 {
3106 	return T(llvm::Type::getFloatTy(*jit->context));
3107 }
3108 
type()3109 Type *Float2::type()
3110 {
3111 	return T(Type_v2f32);
3112 }
3113 
Exp2(RValue<Float> v)3114 RValue<Float> Exp2(RValue<Float> v)
3115 {
3116 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::exp2, { T(Float::type()) });
3117 	return RValue<Float>(V(jit->builder->CreateCall(func, V(v.value()))));
3118 }
3119 
Log2(RValue<Float> v)3120 RValue<Float> Log2(RValue<Float> v)
3121 {
3122 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::log2, { T(Float::type()) });
3123 	return RValue<Float>(V(jit->builder->CreateCall(func, V(v.value()))));
3124 }
3125 
Float4::Float4(RValue<Float> rhs)
3127     : XYZW(this)
3128 {
3129 	RR_DEBUG_INFO_UPDATE_LOC();
3130 	Value *vector = loadValue();
3131 	Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);
3132 
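	// Shuffling with an all-zero index vector broadcasts lane 0 into all four lanes.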
3133 	int swizzle[4] = { 0, 0, 0, 0 };
3134 	Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
3135 
3136 	storeValue(replicate);
3137 }
3138 
Max(RValue<Float4> x,RValue<Float4> y)3139 RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y)
3140 {
3141 	RR_DEBUG_INFO_UPDATE_LOC();
3142 #if defined(__i386__) || defined(__x86_64__)
3143 	return x86::maxps(x, y);
3144 #else
3145 	return As<Float4>(V(lowerPFMINMAX(V(x.value()), V(y.value()), llvm::FCmpInst::FCMP_OGT)));
3146 #endif
3147 }
3148 
Min(RValue<Float4> x,RValue<Float4> y)3149 RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y)
3150 {
3151 	RR_DEBUG_INFO_UPDATE_LOC();
3152 #if defined(__i386__) || defined(__x86_64__)
3153 	return x86::minps(x, y);
3154 #else
3155 	return As<Float4>(V(lowerPFMINMAX(V(x.value()), V(y.value()), llvm::FCmpInst::FCMP_OLT)));
3156 #endif
3157 }
3158 
Rcp_pp(RValue<Float4> x,bool exactAtPow2)3159 RValue<Float4> Rcp_pp(RValue<Float4> x, bool exactAtPow2)
3160 {
3161 	RR_DEBUG_INFO_UPDATE_LOC();
3162 #if defined(__i386__) || defined(__x86_64__)
3163 	if(exactAtPow2)
3164 	{
3165 		// rcpps uses a piecewise-linear approximation which minimizes the relative error
3166 		// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
3167 		return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
3168 	}
3169 	return x86::rcpps(x);
3170 #else
3171 	return As<Float4>(V(lowerRCP(V(x.value()))));
3172 #endif
3173 }
3174 
RcpSqrt_pp(RValue<Float4> x)3175 RValue<Float4> RcpSqrt_pp(RValue<Float4> x)
3176 {
3177 	RR_DEBUG_INFO_UPDATE_LOC();
3178 #if defined(__i386__) || defined(__x86_64__)
3179 	return x86::rsqrtps(x);
3180 #else
3181 	return As<Float4>(V(lowerRSQRT(V(x.value()))));
3182 #endif
3183 }
3184 
Sqrt(RValue<Float4> x)3185 RValue<Float4> Sqrt(RValue<Float4> x)
3186 {
3187 	RR_DEBUG_INFO_UPDATE_LOC();
3188 #if defined(__i386__) || defined(__x86_64__)
3189 	return x86::sqrtps(x);
3190 #else
3191 	return As<Float4>(V(lowerSQRT(V(x.value()))));
3192 #endif
3193 }
3194 
SignMask(RValue<Float4> x)3195 RValue<Int> SignMask(RValue<Float4> x)
3196 {
3197 	RR_DEBUG_INFO_UPDATE_LOC();
3198 #if defined(__i386__) || defined(__x86_64__)
3199 	return x86::movmskps(x);
3200 #else
3201 	return As<Int>(V(lowerFPSignMask(V(x.value()), T(Int::type()))));
3202 #endif
3203 }
3204 
CmpEQ(RValue<Float4> x,RValue<Float4> y)3205 RValue<Int4> CmpEQ(RValue<Float4> x, RValue<Float4> y)
3206 {
3207 	RR_DEBUG_INFO_UPDATE_LOC();
3208 	//	return As<Int4>(x86::cmpeqps(x, y));
3209 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOEQ(x.value(), y.value()), Int4::type()));
3210 }
3211 
CmpLT(RValue<Float4> x,RValue<Float4> y)3212 RValue<Int4> CmpLT(RValue<Float4> x, RValue<Float4> y)
3213 {
3214 	RR_DEBUG_INFO_UPDATE_LOC();
3215 	//	return As<Int4>(x86::cmpltps(x, y));
3216 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLT(x.value(), y.value()), Int4::type()));
3217 }
3218 
CmpLE(RValue<Float4> x,RValue<Float4> y)3219 RValue<Int4> CmpLE(RValue<Float4> x, RValue<Float4> y)
3220 {
3221 	RR_DEBUG_INFO_UPDATE_LOC();
3222 	//	return As<Int4>(x86::cmpleps(x, y));
3223 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLE(x.value(), y.value()), Int4::type()));
3224 }
3225 
CmpNEQ(RValue<Float4> x,RValue<Float4> y)3226 RValue<Int4> CmpNEQ(RValue<Float4> x, RValue<Float4> y)
3227 {
3228 	RR_DEBUG_INFO_UPDATE_LOC();
3229 	//	return As<Int4>(x86::cmpneqps(x, y));
3230 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpONE(x.value(), y.value()), Int4::type()));
3231 }
3232 
CmpNLT(RValue<Float4> x,RValue<Float4> y)3233 RValue<Int4> CmpNLT(RValue<Float4> x, RValue<Float4> y)
3234 {
3235 	RR_DEBUG_INFO_UPDATE_LOC();
3236 	//	return As<Int4>(x86::cmpnltps(x, y));
3237 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGE(x.value(), y.value()), Int4::type()));
3238 }
3239 
CmpNLE(RValue<Float4> x,RValue<Float4> y)3240 RValue<Int4> CmpNLE(RValue<Float4> x, RValue<Float4> y)
3241 {
3242 	RR_DEBUG_INFO_UPDATE_LOC();
3243 	//	return As<Int4>(x86::cmpnleps(x, y));
3244 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGT(x.value(), y.value()), Int4::type()));
3245 }
3246 
CmpUEQ(RValue<Float4> x,RValue<Float4> y)3247 RValue<Int4> CmpUEQ(RValue<Float4> x, RValue<Float4> y)
3248 {
3249 	RR_DEBUG_INFO_UPDATE_LOC();
3250 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUEQ(x.value(), y.value()), Int4::type()));
3251 }
3252 
CmpULT(RValue<Float4> x,RValue<Float4> y)3253 RValue<Int4> CmpULT(RValue<Float4> x, RValue<Float4> y)
3254 {
3255 	RR_DEBUG_INFO_UPDATE_LOC();
3256 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpULT(x.value(), y.value()), Int4::type()));
3257 }
3258 
CmpULE(RValue<Float4> x,RValue<Float4> y)3259 RValue<Int4> CmpULE(RValue<Float4> x, RValue<Float4> y)
3260 {
3261 	RR_DEBUG_INFO_UPDATE_LOC();
3262 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpULE(x.value(), y.value()), Int4::type()));
3263 }
3264 
CmpUNEQ(RValue<Float4> x,RValue<Float4> y)3265 RValue<Int4> CmpUNEQ(RValue<Float4> x, RValue<Float4> y)
3266 {
3267 	RR_DEBUG_INFO_UPDATE_LOC();
3268 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUNE(x.value(), y.value()), Int4::type()));
3269 }
3270 
CmpUNLT(RValue<Float4> x,RValue<Float4> y)3271 RValue<Int4> CmpUNLT(RValue<Float4> x, RValue<Float4> y)
3272 {
3273 	RR_DEBUG_INFO_UPDATE_LOC();
3274 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUGE(x.value(), y.value()), Int4::type()));
3275 }
3276 
CmpUNLE(RValue<Float4> x,RValue<Float4> y)3277 RValue<Int4> CmpUNLE(RValue<Float4> x, RValue<Float4> y)
3278 {
3279 	RR_DEBUG_INFO_UPDATE_LOC();
3280 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUGT(x.value(), y.value()), Int4::type()));
3281 }
3282 
Round(RValue<Float4> x)3283 RValue<Float4> Round(RValue<Float4> x)
3284 {
3285 	RR_DEBUG_INFO_UPDATE_LOC();
3286 #if defined(__i386__) || defined(__x86_64__)
3287 	if(CPUID::supportsSSE4_1())
3288 	{
3289 		return x86::roundps(x, 0);
3290 	}
3291 	else
3292 	{
3293 		return Float4(RoundInt(x));
3294 	}
3295 #else
3296 	return RValue<Float4>(V(lowerRound(V(x.value()))));
3297 #endif
3298 }
3299 
Trunc(RValue<Float4> x)3300 RValue<Float4> Trunc(RValue<Float4> x)
3301 {
3302 	RR_DEBUG_INFO_UPDATE_LOC();
3303 #if defined(__i386__) || defined(__x86_64__)
3304 	if(CPUID::supportsSSE4_1())
3305 	{
3306 		return x86::roundps(x, 3);
3307 	}
3308 	else
3309 	{
3310 		return Float4(Int4(x));
3311 	}
3312 #else
3313 	return RValue<Float4>(V(lowerTrunc(V(x.value()))));
3314 #endif
3315 }
3316 
Frac(RValue<Float4> x)3317 RValue<Float4> Frac(RValue<Float4> x)
3318 {
3319 	RR_DEBUG_INFO_UPDATE_LOC();
3320 	Float4 frc;
3321 
3322 #if defined(__i386__) || defined(__x86_64__)
3323 	if(CPUID::supportsSSE4_1())
3324 	{
3325 		frc = x - Floor(x);
3326 	}
3327 	else
3328 	{
3329 		frc = x - Float4(Int4(x));  // Signed fractional part.
3330 
3331 		frc += As<Float4>(As<Int4>(CmpNLE(Float4(0.0f), frc)) & As<Int4>(Float4(1.0f)));  // Add 1.0 if negative.
3332 	}
3333 #else
3334 	frc = x - Floor(x);
3335 #endif
3336 
3337 	// x - floor(x) can be 1.0 for very small negative x.
3338 	// Clamp against the value just below 1.0.
3339 	return Min(frc, As<Float4>(Int4(0x3F7FFFFF)));
3340 }
3341 
Floor(RValue<Float4> x)3342 RValue<Float4> Floor(RValue<Float4> x)
3343 {
3344 	RR_DEBUG_INFO_UPDATE_LOC();
3345 #if defined(__i386__) || defined(__x86_64__)
3346 	if(CPUID::supportsSSE4_1())
3347 	{
3348 		return x86::floorps(x);
3349 	}
3350 	else
3351 	{
3352 		return x - Frac(x);
3353 	}
3354 #else
3355 	return RValue<Float4>(V(lowerFloor(V(x.value()))));
3356 #endif
3357 }
3358 
Ceil(RValue<Float4> x)3359 RValue<Float4> Ceil(RValue<Float4> x)
3360 {
3361 	RR_DEBUG_INFO_UPDATE_LOC();
3362 #if defined(__i386__) || defined(__x86_64__)
3363 	if(CPUID::supportsSSE4_1())
3364 	{
3365 		return x86::ceilps(x);
3366 	}
3367 	else
3368 #endif
3369 	{
3370 		return -Floor(-x);
3371 	}
3372 }
3373 
Sin(RValue<Float4> v)3374 RValue<Float4> Sin(RValue<Float4> v)
3375 {
3376 	RR_DEBUG_INFO_UPDATE_LOC();
3377 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::sin, { V(v.value())->getType() });
3378 	return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value()))));
3379 }
3380 
Cos(RValue<Float4> v)3381 RValue<Float4> Cos(RValue<Float4> v)
3382 {
3383 	RR_DEBUG_INFO_UPDATE_LOC();
3384 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cos, { V(v.value())->getType() });
3385 	return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value()))));
3386 }
3387 
Tan(RValue<Float4> v)3388 RValue<Float4> Tan(RValue<Float4> v)
3389 {
3390 	RR_DEBUG_INFO_UPDATE_LOC();
3391 	return Sin(v) / Cos(v);
3392 }
3393 
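// Applies the named scalar math function (e.g. "asinf") to each of the four lanes.
// The callee is referenced by name only; the JIT is expected to resolve the symbol
// (typically against the host C runtime) when the routine is materialized.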
static RValue<Float4> TransformFloat4PerElement(RValue<Float4> v, const char *name)
3395 {
3396 	auto funcTy = llvm::FunctionType::get(T(Float::type()), llvm::ArrayRef<llvm::Type *>(T(Float::type())), false);
3397 	auto func = jit->module->getOrInsertFunction(name, funcTy);
3398 	llvm::Value *out = llvm::UndefValue::get(T(Float4::type()));
3399 	for(uint64_t i = 0; i < 4; i++)
3400 	{
3401 		auto el = jit->builder->CreateCall(func, V(Nucleus::createExtractElement(v.value(), Float::type(), i)));
3402 		out = V(Nucleus::createInsertElement(V(out), V(el), i));
3403 	}
3404 	return RValue<Float4>(V(out));
3405 }
3406 
Asin(RValue<Float4> v,Precision p)3407 RValue<Float4> Asin(RValue<Float4> v, Precision p)
3408 {
3409 	RR_DEBUG_INFO_UPDATE_LOC();
3410 	return TransformFloat4PerElement(v, "asinf");
3411 }
3412 
Acos(RValue<Float4> v,Precision p)3413 RValue<Float4> Acos(RValue<Float4> v, Precision p)
3414 {
3415 	RR_DEBUG_INFO_UPDATE_LOC();
3416 	return TransformFloat4PerElement(v, "acosf");
3417 }
3418 
Atan(RValue<Float4> v)3419 RValue<Float4> Atan(RValue<Float4> v)
3420 {
3421 	RR_DEBUG_INFO_UPDATE_LOC();
3422 	return TransformFloat4PerElement(v, "atanf");
3423 }
3424 
Sinh(RValue<Float4> v)3425 RValue<Float4> Sinh(RValue<Float4> v)
3426 {
3427 	RR_DEBUG_INFO_UPDATE_LOC();
3428 	return emulated::Sinh(v);
3429 }
3430 
Cosh(RValue<Float4> v)3431 RValue<Float4> Cosh(RValue<Float4> v)
3432 {
3433 	RR_DEBUG_INFO_UPDATE_LOC();
3434 	return emulated::Cosh(v);
3435 }
3436 
Tanh(RValue<Float4> v)3437 RValue<Float4> Tanh(RValue<Float4> v)
3438 {
3439 	RR_DEBUG_INFO_UPDATE_LOC();
3440 	return TransformFloat4PerElement(v, "tanhf");
3441 }
3442 
Asinh(RValue<Float4> v)3443 RValue<Float4> Asinh(RValue<Float4> v)
3444 {
3445 	RR_DEBUG_INFO_UPDATE_LOC();
3446 	return TransformFloat4PerElement(v, "asinhf");
3447 }
3448 
Acosh(RValue<Float4> v)3449 RValue<Float4> Acosh(RValue<Float4> v)
3450 {
3451 	RR_DEBUG_INFO_UPDATE_LOC();
3452 	return TransformFloat4PerElement(v, "acoshf");
3453 }
3454 
Atanh(RValue<Float4> v)3455 RValue<Float4> Atanh(RValue<Float4> v)
3456 {
3457 	RR_DEBUG_INFO_UPDATE_LOC();
3458 	return TransformFloat4PerElement(v, "atanhf");
3459 }
3460 
Atan2(RValue<Float4> x,RValue<Float4> y)3461 RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y)
3462 {
3463 	RR_DEBUG_INFO_UPDATE_LOC();
3464 	llvm::SmallVector<llvm::Type *, 2> paramTys;
3465 	paramTys.push_back(T(Float::type()));
3466 	paramTys.push_back(T(Float::type()));
3467 	auto funcTy = llvm::FunctionType::get(T(Float::type()), paramTys, false);
3468 	auto func = jit->module->getOrInsertFunction("atan2f", funcTy);
3469 	llvm::Value *out = llvm::UndefValue::get(T(Float4::type()));
3470 	for(uint64_t i = 0; i < 4; i++)
3471 	{
3472 		auto el = jit->builder->CreateCall(func, { V(Nucleus::createExtractElement(x.value(), Float::type(), i)),
3473 		                                           V(Nucleus::createExtractElement(y.value(), Float::type(), i)) });
3474 		out = V(Nucleus::createInsertElement(V(out), V(el), i));
3475 	}
3476 	return RValue<Float4>(V(out));
3477 }
3478 
Pow(RValue<Float4> x,RValue<Float4> y)3479 RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y)
3480 {
3481 	RR_DEBUG_INFO_UPDATE_LOC();
3482 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::pow, { T(Float4::type()) });
3483 	return RValue<Float4>(V(jit->builder->CreateCall(func, { V(x.value()), V(y.value()) })));
3484 }
3485 
Exp(RValue<Float4> v)3486 RValue<Float4> Exp(RValue<Float4> v)
3487 {
3488 	RR_DEBUG_INFO_UPDATE_LOC();
3489 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::exp, { T(Float4::type()) });
3490 	return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value()))));
3491 }
3492 
Log(RValue<Float4> v)3493 RValue<Float4> Log(RValue<Float4> v)
3494 {
3495 	RR_DEBUG_INFO_UPDATE_LOC();
3496 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::log, { T(Float4::type()) });
3497 	return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value()))));
3498 }
3499 
Exp2(RValue<Float4> v)3500 RValue<Float4> Exp2(RValue<Float4> v)
3501 {
3502 	RR_DEBUG_INFO_UPDATE_LOC();
3503 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::exp2, { T(Float4::type()) });
3504 	return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value()))));
3505 }
3506 
Log2(RValue<Float4> v)3507 RValue<Float4> Log2(RValue<Float4> v)
3508 {
3509 	RR_DEBUG_INFO_UPDATE_LOC();
3510 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::log2, { T(Float4::type()) });
3511 	return RValue<Float4>(V(jit->builder->CreateCall(func, V(v.value()))));
3512 }
3513 
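// The boolean operand of llvm.ctlz / llvm.cttz states whether a zero input may yield
// an undefined result, which lets the backend lower to bsr/bsf-style instructions
// without an extra zero check.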
RValue<UInt> Ctlz(RValue<UInt> v, bool isZeroUndef)
3515 {
3516 	RR_DEBUG_INFO_UPDATE_LOC();
3517 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::ctlz, { T(UInt::type()) });
3518 	return RValue<UInt>(V(jit->builder->CreateCall(func, { V(v.value()),
3519 	                                                       isZeroUndef ? llvm::ConstantInt::getTrue(*jit->context) : llvm::ConstantInt::getFalse(*jit->context) })));
3520 }
3521 
Ctlz(RValue<UInt4> v,bool isZeroUndef)3522 RValue<UInt4> Ctlz(RValue<UInt4> v, bool isZeroUndef)
3523 {
3524 	RR_DEBUG_INFO_UPDATE_LOC();
3525 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::ctlz, { T(UInt4::type()) });
3526 	return RValue<UInt4>(V(jit->builder->CreateCall(func, { V(v.value()),
3527 	                                                        isZeroUndef ? llvm::ConstantInt::getTrue(*jit->context) : llvm::ConstantInt::getFalse(*jit->context) })));
3528 }
3529 
Cttz(RValue<UInt> v,bool isZeroUndef)3530 RValue<UInt> Cttz(RValue<UInt> v, bool isZeroUndef)
3531 {
3532 	RR_DEBUG_INFO_UPDATE_LOC();
3533 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cttz, { T(UInt::type()) });
3534 	return RValue<UInt>(V(jit->builder->CreateCall(func, { V(v.value()),
3535 	                                                       isZeroUndef ? llvm::ConstantInt::getTrue(*jit->context) : llvm::ConstantInt::getFalse(*jit->context) })));
3536 }
3537 
Cttz(RValue<UInt4> v,bool isZeroUndef)3538 RValue<UInt4> Cttz(RValue<UInt4> v, bool isZeroUndef)
3539 {
3540 	RR_DEBUG_INFO_UPDATE_LOC();
3541 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cttz, { T(UInt4::type()) });
3542 	return RValue<UInt4>(V(jit->builder->CreateCall(func, { V(v.value()),
3543 	                                                        isZeroUndef ? llvm::ConstantInt::getTrue(*jit->context) : llvm::ConstantInt::getFalse(*jit->context) })));
3544 }
3545 
MinAtomic(RValue<Pointer<Int>> x,RValue<Int> y,std::memory_order memoryOrder)3546 RValue<Int> MinAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder)
3547 {
3548 	return RValue<Int>(Nucleus::createAtomicMin(x.value(), y.value(), memoryOrder));
3549 }
3550 
MinAtomic(RValue<Pointer<UInt>> x,RValue<UInt> y,std::memory_order memoryOrder)3551 RValue<UInt> MinAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder)
3552 {
3553 	return RValue<UInt>(Nucleus::createAtomicUMin(x.value(), y.value(), memoryOrder));
3554 }
3555 
MaxAtomic(RValue<Pointer<Int>> x,RValue<Int> y,std::memory_order memoryOrder)3556 RValue<Int> MaxAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder)
3557 {
3558 	return RValue<Int>(Nucleus::createAtomicMax(x.value(), y.value(), memoryOrder));
3559 }
3560 
MaxAtomic(RValue<Pointer<UInt>> x,RValue<UInt> y,std::memory_order memoryOrder)3561 RValue<UInt> MaxAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder)
3562 {
3563 	return RValue<UInt>(Nucleus::createAtomicUMax(x.value(), y.value(), memoryOrder));
3564 }
3565 
type()3566 Type *Float4::type()
3567 {
3568 	return T(llvm::VectorType::get(T(Float::type()), 4, false));
3569 }
3570 
Ticks()3571 RValue<Long> Ticks()
3572 {
3573 	RR_DEBUG_INFO_UPDATE_LOC();
3574 	llvm::Function *rdtsc = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::readcyclecounter);
3575 
3576 	return RValue<Long>(V(jit->builder->CreateCall(rdtsc)));
3577 }
3578 
ConstantPointer(void const * ptr)3579 RValue<Pointer<Byte>> ConstantPointer(void const *ptr)
3580 {
3581 	RR_DEBUG_INFO_UPDATE_LOC();
3582 	// Note: this should work for 32-bit pointers as well because 'inttoptr'
3583 	// is defined to truncate (and zero extend) if necessary.
3584 	auto ptrAsInt = llvm::ConstantInt::get(llvm::Type::getInt64Ty(*jit->context), reinterpret_cast<uintptr_t>(ptr));
3585 	return RValue<Pointer<Byte>>(V(jit->builder->CreateIntToPtr(ptrAsInt, T(Pointer<Byte>::type()))));
3586 }
3587 
RValue<Pointer<Byte>> ConstantData(void const *data, size_t size)
3589 {
3590 	RR_DEBUG_INFO_UPDATE_LOC();
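	// The bytes are copied into the module as a private constant, so the caller's
	// buffer only needs to stay valid for the duration of this call.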
3591 	auto str = ::std::string(reinterpret_cast<const char *>(data), size);
3592 	auto ptr = jit->builder->CreateGlobalStringPtr(str);
3593 	return RValue<Pointer<Byte>>(V(ptr));
3594 }
3595 
Call(RValue<Pointer<Byte>> fptr,Type * retTy,std::initializer_list<Value * > args,std::initializer_list<Type * > argTys)3596 Value *Call(RValue<Pointer<Byte>> fptr, Type *retTy, std::initializer_list<Value *> args, std::initializer_list<Type *> argTys)
3597 {
3598 	RR_DEBUG_INFO_UPDATE_LOC();
3599 	llvm::SmallVector<llvm::Type *, 8> paramTys;
3600 	for(auto ty : argTys) { paramTys.push_back(T(ty)); }
3601 	auto funcTy = llvm::FunctionType::get(T(retTy), paramTys, false);
3602 
3603 	auto funcPtrTy = funcTy->getPointerTo();
3604 	auto funcPtr = jit->builder->CreatePointerCast(V(fptr.value()), funcPtrTy);
3605 
3606 	llvm::SmallVector<llvm::Value *, 8> arguments;
3607 	for(auto arg : args) { arguments.push_back(V(arg)); }
3608 	return V(jit->builder->CreateCall(funcTy, funcPtr, arguments));
3609 }
3610 
Breakpoint()3611 void Breakpoint()
3612 {
3613 	RR_DEBUG_INFO_UPDATE_LOC();
3614 	llvm::Function *debugtrap = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::debugtrap);
3615 
3616 	jit->builder->CreateCall(debugtrap);
3617 }
3618 
3619 }  // namespace rr
3620 
3621 namespace rr {
3622 
3623 #if defined(__i386__) || defined(__x86_64__)
3624 namespace x86 {
3625 
3626 // Differs from IRBuilder<>::CreateUnaryIntrinsic() in that it only accepts native instruction intrinsics which have
3627 // implicit types, such as 'x86_sse_rcp_ps' operating on v4f32, while 'sqrt' requires explicitly specifying the operand type.
createInstruction(llvm::Intrinsic::ID id,Value * x)3628 static Value *createInstruction(llvm::Intrinsic::ID id, Value *x)
3629 {
3630 	llvm::Function *intrinsic = llvm::Intrinsic::getDeclaration(jit->module.get(), id);
3631 
3632 	return V(jit->builder->CreateCall(intrinsic, V(x)));
3633 }
3634 
3635 // Differs from IRBuilder<>::CreateBinaryIntrinsic() in that it only accepts native instruction intrinsics which have
3636 // implicit types, such as 'x86_sse_max_ps' operating on v4f32, while 'sadd_sat' requires explicitly specifying the operand types.
createInstruction(llvm::Intrinsic::ID id,Value * x,Value * y)3637 static Value *createInstruction(llvm::Intrinsic::ID id, Value *x, Value *y)
3638 {
3639 	llvm::Function *intrinsic = llvm::Intrinsic::getDeclaration(jit->module.get(), id);
3640 
3641 	return V(jit->builder->CreateCall(intrinsic, { V(x), V(y) }));
3642 }
3643 
cvtss2si(RValue<Float> val)3644 RValue<Int> cvtss2si(RValue<Float> val)
3645 {
3646 	Float4 vector;
3647 	vector.x = val;
3648 
3649 	return RValue<Int>(createInstruction(llvm::Intrinsic::x86_sse_cvtss2si, RValue<Float4>(vector).value()));
3650 }
3651 
cvtps2dq(RValue<Float4> val)3652 RValue<Int4> cvtps2dq(RValue<Float4> val)
3653 {
3654 	return RValue<Int4>(createInstruction(llvm::Intrinsic::x86_sse2_cvtps2dq, val.value()));
3655 }
3656 
rcpss(RValue<Float> val)3657 RValue<Float> rcpss(RValue<Float> val)
3658 {
3659 	Value *vector = Nucleus::createInsertElement(V(llvm::UndefValue::get(T(Float4::type()))), val.value(), 0);
3660 
3661 	return RValue<Float>(Nucleus::createExtractElement(createInstruction(llvm::Intrinsic::x86_sse_rcp_ss, vector), Float::type(), 0));
3662 }
3663 
sqrtss(RValue<Float> val)3664 RValue<Float> sqrtss(RValue<Float> val)
3665 {
3666 	return RValue<Float>(V(jit->builder->CreateUnaryIntrinsic(llvm::Intrinsic::sqrt, V(val.value()))));
3667 }
3668 
rsqrtss(RValue<Float> val)3669 RValue<Float> rsqrtss(RValue<Float> val)
3670 {
3671 	Value *vector = Nucleus::createInsertElement(V(llvm::UndefValue::get(T(Float4::type()))), val.value(), 0);
3672 
3673 	return RValue<Float>(Nucleus::createExtractElement(createInstruction(llvm::Intrinsic::x86_sse_rsqrt_ss, vector), Float::type(), 0));
3674 }
3675 
rcpps(RValue<Float4> val)3676 RValue<Float4> rcpps(RValue<Float4> val)
3677 {
3678 	return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse_rcp_ps, val.value()));
3679 }
3680 
sqrtps(RValue<Float4> val)3681 RValue<Float4> sqrtps(RValue<Float4> val)
3682 {
3683 	return RValue<Float4>(V(jit->builder->CreateUnaryIntrinsic(llvm::Intrinsic::sqrt, V(val.value()))));
3684 }
3685 
rsqrtps(RValue<Float4> val)3686 RValue<Float4> rsqrtps(RValue<Float4> val)
3687 {
3688 	return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse_rsqrt_ps, val.value()));
3689 }
3690 
maxps(RValue<Float4> x,RValue<Float4> y)3691 RValue<Float4> maxps(RValue<Float4> x, RValue<Float4> y)
3692 {
3693 	return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse_max_ps, x.value(), y.value()));
3694 }
3695 
minps(RValue<Float4> x,RValue<Float4> y)3696 RValue<Float4> minps(RValue<Float4> x, RValue<Float4> y)
3697 {
3698 	return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse_min_ps, x.value(), y.value()));
3699 }
3700 
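// The immediate selects the SSE4.1 rounding mode: 0 = nearest (even), 1 = round down
// (floor), 2 = round up (ceil), 3 = round toward zero (truncate).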
RValue<Float> roundss(RValue<Float> val, unsigned char imm)
3702 {
3703 	llvm::Function *roundss = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse41_round_ss);
3704 
3705 	Value *undef = V(llvm::UndefValue::get(T(Float4::type())));
3706 	Value *vector = Nucleus::createInsertElement(undef, val.value(), 0);
3707 
3708 	return RValue<Float>(Nucleus::createExtractElement(V(jit->builder->CreateCall(roundss, { V(undef), V(vector), V(Nucleus::createConstantInt(imm)) })), Float::type(), 0));
3709 }
3710 
floorss(RValue<Float> val)3711 RValue<Float> floorss(RValue<Float> val)
3712 {
3713 	return roundss(val, 1);
3714 }
3715 
ceilss(RValue<Float> val)3716 RValue<Float> ceilss(RValue<Float> val)
3717 {
3718 	return roundss(val, 2);
3719 }
3720 
roundps(RValue<Float4> val,unsigned char imm)3721 RValue<Float4> roundps(RValue<Float4> val, unsigned char imm)
3722 {
3723 	return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse41_round_ps, val.value(), Nucleus::createConstantInt(imm)));
3724 }
3725 
floorps(RValue<Float4> val)3726 RValue<Float4> floorps(RValue<Float4> val)
3727 {
3728 	return roundps(val, 1);
3729 }
3730 
ceilps(RValue<Float4> val)3731 RValue<Float4> ceilps(RValue<Float4> val)
3732 {
3733 	return roundps(val, 2);
3734 }
3735 
pabsd(RValue<Int4> x)3736 RValue<Int4> pabsd(RValue<Int4> x)
3737 {
3738 	return RValue<Int4>(V(lowerPABS(V(x.value()))));
3739 }
3740 
paddsw(RValue<Short4> x,RValue<Short4> y)3741 RValue<Short4> paddsw(RValue<Short4> x, RValue<Short4> y)
3742 {
3743 	return As<Short4>(V(lowerPSADDSAT(V(x.value()), V(y.value()))));
3744 }
3745 
psubsw(RValue<Short4> x,RValue<Short4> y)3746 RValue<Short4> psubsw(RValue<Short4> x, RValue<Short4> y)
3747 {
3748 	return As<Short4>(V(lowerPSSUBSAT(V(x.value()), V(y.value()))));
3749 }
3750 
paddusw(RValue<UShort4> x,RValue<UShort4> y)3751 RValue<UShort4> paddusw(RValue<UShort4> x, RValue<UShort4> y)
3752 {
3753 	return As<UShort4>(V(lowerPUADDSAT(V(x.value()), V(y.value()))));
3754 }
3755 
psubusw(RValue<UShort4> x,RValue<UShort4> y)3756 RValue<UShort4> psubusw(RValue<UShort4> x, RValue<UShort4> y)
3757 {
3758 	return As<UShort4>(V(lowerPUSUBSAT(V(x.value()), V(y.value()))));
3759 }
3760 
paddsb(RValue<SByte8> x,RValue<SByte8> y)3761 RValue<SByte8> paddsb(RValue<SByte8> x, RValue<SByte8> y)
3762 {
3763 	return As<SByte8>(V(lowerPSADDSAT(V(x.value()), V(y.value()))));
3764 }
3765 
psubsb(RValue<SByte8> x,RValue<SByte8> y)3766 RValue<SByte8> psubsb(RValue<SByte8> x, RValue<SByte8> y)
3767 {
3768 	return As<SByte8>(V(lowerPSSUBSAT(V(x.value()), V(y.value()))));
3769 }
3770 
paddusb(RValue<Byte8> x,RValue<Byte8> y)3771 RValue<Byte8> paddusb(RValue<Byte8> x, RValue<Byte8> y)
3772 {
3773 	return As<Byte8>(V(lowerPUADDSAT(V(x.value()), V(y.value()))));
3774 }
3775 
psubusb(RValue<Byte8> x,RValue<Byte8> y)3776 RValue<Byte8> psubusb(RValue<Byte8> x, RValue<Byte8> y)
3777 {
3778 	return As<Byte8>(V(lowerPUSUBSAT(V(x.value()), V(y.value()))));
3779 }
3780 
pavgw(RValue<UShort4> x,RValue<UShort4> y)3781 RValue<UShort4> pavgw(RValue<UShort4> x, RValue<UShort4> y)
3782 {
3783 	return As<UShort4>(V(lowerPAVG(V(x.value()), V(y.value()))));
3784 }
3785 
pmaxsw(RValue<Short4> x,RValue<Short4> y)3786 RValue<Short4> pmaxsw(RValue<Short4> x, RValue<Short4> y)
3787 {
3788 	return As<Short4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SGT)));
3789 }
3790 
pminsw(RValue<Short4> x,RValue<Short4> y)3791 RValue<Short4> pminsw(RValue<Short4> x, RValue<Short4> y)
3792 {
3793 	return As<Short4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SLT)));
3794 }
3795 
pcmpgtw(RValue<Short4> x,RValue<Short4> y)3796 RValue<Short4> pcmpgtw(RValue<Short4> x, RValue<Short4> y)
3797 {
3798 	return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Short4::type()))));
3799 }
3800 
pcmpeqw(RValue<Short4> x,RValue<Short4> y)3801 RValue<Short4> pcmpeqw(RValue<Short4> x, RValue<Short4> y)
3802 {
3803 	return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Short4::type()))));
3804 }
3805 
pcmpgtb(RValue<SByte8> x,RValue<SByte8> y)3806 RValue<Byte8> pcmpgtb(RValue<SByte8> x, RValue<SByte8> y)
3807 {
3808 	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Byte8::type()))));
3809 }
3810 
pcmpeqb(RValue<Byte8> x,RValue<Byte8> y)3811 RValue<Byte8> pcmpeqb(RValue<Byte8> x, RValue<Byte8> y)
3812 {
3813 	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Byte8::type()))));
3814 }
3815 
packssdw(RValue<Int2> x,RValue<Int2> y)3816 RValue<Short4> packssdw(RValue<Int2> x, RValue<Int2> y)
3817 {
3818 	return As<Short4>(createInstruction(llvm::Intrinsic::x86_sse2_packssdw_128, x.value(), y.value()));
3819 }
3820 
packssdw(RValue<Int4> x,RValue<Int4> y)3821 RValue<Short8> packssdw(RValue<Int4> x, RValue<Int4> y)
3822 {
3823 	return RValue<Short8>(createInstruction(llvm::Intrinsic::x86_sse2_packssdw_128, x.value(), y.value()));
3824 }
3825 
packsswb(RValue<Short4> x,RValue<Short4> y)3826 RValue<SByte8> packsswb(RValue<Short4> x, RValue<Short4> y)
3827 {
3828 	return As<SByte8>(createInstruction(llvm::Intrinsic::x86_sse2_packsswb_128, x.value(), y.value()));
3829 }
3830 
packuswb(RValue<Short4> x,RValue<Short4> y)3831 RValue<Byte8> packuswb(RValue<Short4> x, RValue<Short4> y)
3832 {
3833 	return As<Byte8>(createInstruction(llvm::Intrinsic::x86_sse2_packuswb_128, x.value(), y.value()));
3834 }
3835 
RValue<UShort8> packusdw(RValue<Int4> x, RValue<Int4> y)
3837 {
3838 	if(CPUID::supportsSSE4_1())
3839 	{
3840 		return RValue<UShort8>(createInstruction(llvm::Intrinsic::x86_sse41_packusdw, x.value(), y.value()));
3841 	}
3842 	else
3843 	{
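		// Emulation without SSE4.1: clamp negative lanes to zero (x >> 31 is an
		// all-ones mask for negative values), shift values down by 0x8000 so the
		// unsigned range [0, 0xFFFF] maps onto the signed range that packssdw
		// saturates to, then undo the bias with an unsigned add.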
3844 		RValue<Int4> bx = (x & ~(x >> 31)) - Int4(0x8000);
3845 		RValue<Int4> by = (y & ~(y >> 31)) - Int4(0x8000);
3846 
3847 		return As<UShort8>(packssdw(bx, by) + Short8(0x8000u));
3848 	}
3849 }
3850 
psrlw(RValue<UShort4> x,unsigned char y)3851 RValue<UShort4> psrlw(RValue<UShort4> x, unsigned char y)
3852 {
3853 	return As<UShort4>(createInstruction(llvm::Intrinsic::x86_sse2_psrli_w, x.value(), Nucleus::createConstantInt(y)));
3854 }
3855 
psrlw(RValue<UShort8> x,unsigned char y)3856 RValue<UShort8> psrlw(RValue<UShort8> x, unsigned char y)
3857 {
3858 	return RValue<UShort8>(createInstruction(llvm::Intrinsic::x86_sse2_psrli_w, x.value(), Nucleus::createConstantInt(y)));
3859 }
3860 
psraw(RValue<Short4> x,unsigned char y)3861 RValue<Short4> psraw(RValue<Short4> x, unsigned char y)
3862 {
3863 	return As<Short4>(createInstruction(llvm::Intrinsic::x86_sse2_psrai_w, x.value(), Nucleus::createConstantInt(y)));
3864 }
3865 
psraw(RValue<Short8> x,unsigned char y)3866 RValue<Short8> psraw(RValue<Short8> x, unsigned char y)
3867 {
3868 	return RValue<Short8>(createInstruction(llvm::Intrinsic::x86_sse2_psrai_w, x.value(), Nucleus::createConstantInt(y)));
3869 }
3870 
psllw(RValue<Short4> x,unsigned char y)3871 RValue<Short4> psllw(RValue<Short4> x, unsigned char y)
3872 {
3873 	return As<Short4>(createInstruction(llvm::Intrinsic::x86_sse2_pslli_w, x.value(), Nucleus::createConstantInt(y)));
3874 }
3875 
psllw(RValue<Short8> x,unsigned char y)3876 RValue<Short8> psllw(RValue<Short8> x, unsigned char y)
3877 {
3878 	return RValue<Short8>(createInstruction(llvm::Intrinsic::x86_sse2_pslli_w, x.value(), Nucleus::createConstantInt(y)));
3879 }
3880 
pslld(RValue<Int2> x,unsigned char y)3881 RValue<Int2> pslld(RValue<Int2> x, unsigned char y)
3882 {
3883 	return As<Int2>(createInstruction(llvm::Intrinsic::x86_sse2_pslli_d, x.value(), Nucleus::createConstantInt(y)));
3884 }
3885 
pslld(RValue<Int4> x,unsigned char y)3886 RValue<Int4> pslld(RValue<Int4> x, unsigned char y)
3887 {
3888 	return RValue<Int4>(createInstruction(llvm::Intrinsic::x86_sse2_pslli_d, x.value(), Nucleus::createConstantInt(y)));
3889 }
3890 
psrad(RValue<Int2> x,unsigned char y)3891 RValue<Int2> psrad(RValue<Int2> x, unsigned char y)
3892 {
3893 	return As<Int2>(createInstruction(llvm::Intrinsic::x86_sse2_psrai_d, x.value(), Nucleus::createConstantInt(y)));
3894 }
3895 
psrad(RValue<Int4> x,unsigned char y)3896 RValue<Int4> psrad(RValue<Int4> x, unsigned char y)
3897 {
3898 	return RValue<Int4>(createInstruction(llvm::Intrinsic::x86_sse2_psrai_d, x.value(), Nucleus::createConstantInt(y)));
3899 }
3900 
psrld(RValue<UInt2> x,unsigned char y)3901 RValue<UInt2> psrld(RValue<UInt2> x, unsigned char y)
3902 {
3903 	return As<UInt2>(createInstruction(llvm::Intrinsic::x86_sse2_psrli_d, x.value(), Nucleus::createConstantInt(y)));
3904 }
3905 
psrld(RValue<UInt4> x,unsigned char y)3906 RValue<UInt4> psrld(RValue<UInt4> x, unsigned char y)
3907 {
3908 	return RValue<UInt4>(createInstruction(llvm::Intrinsic::x86_sse2_psrli_d, x.value(), Nucleus::createConstantInt(y)));
3909 }
3910 
pmaxsd(RValue<Int4> x,RValue<Int4> y)3911 RValue<Int4> pmaxsd(RValue<Int4> x, RValue<Int4> y)
3912 {
3913 	return RValue<Int4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SGT)));
3914 }
3915 
pminsd(RValue<Int4> x,RValue<Int4> y)3916 RValue<Int4> pminsd(RValue<Int4> x, RValue<Int4> y)
3917 {
3918 	return RValue<Int4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SLT)));
3919 }
3920 
pmaxud(RValue<UInt4> x,RValue<UInt4> y)3921 RValue<UInt4> pmaxud(RValue<UInt4> x, RValue<UInt4> y)
3922 {
3923 	return RValue<UInt4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_UGT)));
3924 }
3925 
pminud(RValue<UInt4> x,RValue<UInt4> y)3926 RValue<UInt4> pminud(RValue<UInt4> x, RValue<UInt4> y)
3927 {
3928 	return RValue<UInt4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_ULT)));
3929 }
3930 
pmulhw(RValue<Short4> x,RValue<Short4> y)3931 RValue<Short4> pmulhw(RValue<Short4> x, RValue<Short4> y)
3932 {
3933 	return As<Short4>(createInstruction(llvm::Intrinsic::x86_sse2_pmulh_w, x.value(), y.value()));
3934 }
3935 
pmulhuw(RValue<UShort4> x,RValue<UShort4> y)3936 RValue<UShort4> pmulhuw(RValue<UShort4> x, RValue<UShort4> y)
3937 {
3938 	return As<UShort4>(createInstruction(llvm::Intrinsic::x86_sse2_pmulhu_w, x.value(), y.value()));
3939 }
3940 
pmaddwd(RValue<Short4> x,RValue<Short4> y)3941 RValue<Int2> pmaddwd(RValue<Short4> x, RValue<Short4> y)
3942 {
3943 	return As<Int2>(createInstruction(llvm::Intrinsic::x86_sse2_pmadd_wd, x.value(), y.value()));
3944 }
3945 
pmulhw(RValue<Short8> x,RValue<Short8> y)3946 RValue<Short8> pmulhw(RValue<Short8> x, RValue<Short8> y)
3947 {
3948 	return RValue<Short8>(createInstruction(llvm::Intrinsic::x86_sse2_pmulh_w, x.value(), y.value()));
3949 }
3950 
pmulhuw(RValue<UShort8> x,RValue<UShort8> y)3951 RValue<UShort8> pmulhuw(RValue<UShort8> x, RValue<UShort8> y)
3952 {
3953 	return RValue<UShort8>(createInstruction(llvm::Intrinsic::x86_sse2_pmulhu_w, x.value(), y.value()));
3954 }
3955 
pmaddwd(RValue<Short8> x,RValue<Short8> y)3956 RValue<Int4> pmaddwd(RValue<Short8> x, RValue<Short8> y)
3957 {
3958 	return RValue<Int4>(createInstruction(llvm::Intrinsic::x86_sse2_pmadd_wd, x.value(), y.value()));
3959 }
3960 
movmskps(RValue<Float4> x)3961 RValue<Int> movmskps(RValue<Float4> x)
3962 {
3963 	return RValue<Int>(createInstruction(llvm::Intrinsic::x86_sse_movmsk_ps, x.value()));
3964 }
3965 
RValue<Int> pmovmskb(RValue<Byte8> x)
3967 {
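	// x86_sse2_pmovmskb_128 returns 16 mask bits for a full 16-byte vector; only the
	// low 8 bits correspond to the Byte8 operand, so mask off the rest.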
3968 	return RValue<Int>(createInstruction(llvm::Intrinsic::x86_sse2_pmovmskb_128, x.value())) & 0xFF;
3969 }
3970 
pmovzxbd(RValue<Byte16> x)3971 RValue<Int4> pmovzxbd(RValue<Byte16> x)
3972 {
3973 	return RValue<Int4>(V(lowerPMOV(V(x.value()), T(Int4::type()), false)));
3974 }
3975 
pmovsxbd(RValue<SByte16> x)3976 RValue<Int4> pmovsxbd(RValue<SByte16> x)
3977 {
3978 	return RValue<Int4>(V(lowerPMOV(V(x.value()), T(Int4::type()), true)));
3979 }
3980 
pmovzxwd(RValue<UShort8> x)3981 RValue<Int4> pmovzxwd(RValue<UShort8> x)
3982 {
3983 	return RValue<Int4>(V(lowerPMOV(V(x.value()), T(Int4::type()), false)));
3984 }
3985 
pmovsxwd(RValue<Short8> x)3986 RValue<Int4> pmovsxwd(RValue<Short8> x)
3987 {
3988 	return RValue<Int4>(V(lowerPMOV(V(x.value()), T(Int4::type()), true)));
3989 }
3990 
3991 }  // namespace x86
3992 #endif  // defined(__i386__) || defined(__x86_64__)
3993 
3994 #ifdef ENABLE_RR_PRINT
VPrintf(const std::vector<Value * > & vals)3995 void VPrintf(const std::vector<Value *> &vals)
3996 {
3997 	auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
3998 	auto i8PtrTy = llvm::Type::getInt8PtrTy(*jit->context);
3999 	auto funcTy = llvm::FunctionType::get(i32Ty, { i8PtrTy }, true);
4000 	auto func = jit->module->getOrInsertFunction("rr::DebugPrintf", funcTy);
4001 	jit->builder->CreateCall(func, V(vals));
4002 }
4003 #endif  // ENABLE_RR_PRINT
4004 
Nop()4005 void Nop()
4006 {
4007 	auto voidTy = llvm::Type::getVoidTy(*jit->context);
4008 	auto funcTy = llvm::FunctionType::get(voidTy, {}, false);
4009 	auto func = jit->module->getOrInsertFunction("nop", funcTy);
4010 	jit->builder->CreateCall(func);
4011 }
4012 
EmitDebugLocation()4013 void EmitDebugLocation()
4014 {
4015 #ifdef ENABLE_RR_DEBUG_INFO
4016 	if(jit->debugInfo != nullptr)
4017 	{
4018 		jit->debugInfo->EmitLocation();
4019 	}
4020 #endif  // ENABLE_RR_DEBUG_INFO
4021 }
4022 
EmitDebugVariable(Value * value)4023 void EmitDebugVariable(Value *value)
4024 {
4025 #ifdef ENABLE_RR_DEBUG_INFO
4026 	if(jit->debugInfo != nullptr)
4027 	{
4028 		jit->debugInfo->EmitVariable(value);
4029 	}
4030 #endif  // ENABLE_RR_DEBUG_INFO
4031 }
4032 
FlushDebug()4033 void FlushDebug()
4034 {
4035 #ifdef ENABLE_RR_DEBUG_INFO
4036 	if(jit->debugInfo != nullptr)
4037 	{
4038 		jit->debugInfo->Flush();
4039 	}
4040 #endif  // ENABLE_RR_DEBUG_INFO
4041 }
4042 
4043 }  // namespace rr
4044 
4045 // ------------------------------  Coroutines ------------------------------
4046 
4047 namespace {
4048 
// Magic values returned by llvm.coro.suspend.
4050 // See: https://llvm.org/docs/Coroutines.html#llvm-coro-suspend-intrinsic
4051 enum SuspendAction
4052 {
4053 	SuspendActionSuspend = -1,
4054 	SuspendActionResume = 0,
4055 	SuspendActionDestroy = 1
4056 };
4057 
void promoteFunctionToCoroutine()
4059 {
4060 	ASSERT(jit->coroutine.id == nullptr);
4061 
4062 	// Types
4063 	auto voidTy = llvm::Type::getVoidTy(*jit->context);
4064 	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
4065 	auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
4066 	auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
4067 	auto i8PtrTy = llvm::Type::getInt8PtrTy(*jit->context);
4068 	auto promiseTy = jit->coroutine.yieldType;
4069 	auto promisePtrTy = promiseTy->getPointerTo();
4070 
4071 	// LLVM intrinsics
4072 	auto coro_id = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_id);
4073 	auto coro_size = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_size, { i32Ty });
4074 	auto coro_begin = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_begin);
4075 	auto coro_resume = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_resume);
4076 	auto coro_end = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_end);
4077 	auto coro_free = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_free);
4078 	auto coro_destroy = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_destroy);
4079 	auto coro_promise = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_promise);
4080 	auto coro_done = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_done);
4081 	auto coro_suspend = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_suspend);
4082 
4083 	auto allocFrameTy = llvm::FunctionType::get(i8PtrTy, { i32Ty }, false);
4084 	auto allocFrame = jit->module->getOrInsertFunction("coroutine_alloc_frame", allocFrameTy);
4085 	auto freeFrameTy = llvm::FunctionType::get(voidTy, { i8PtrTy }, false);
4086 	auto freeFrame = jit->module->getOrInsertFunction("coroutine_free_frame", freeFrameTy);
4087 
4088 	auto oldInsertionPoint = jit->builder->saveIP();
4089 
4090 	// Build the coroutine_await() function:
4091 	//
4092 	//    bool coroutine_await(CoroutineHandle* handle, YieldType* out)
4093 	//    {
4094 	//        if(llvm.coro.done(handle))
4095 	//        {
4096 	//            return false;
4097 	//        }
4098 	//        else
4099 	//        {
	//            *out = *(YieldType*)llvm.coro.promise(handle);
4101 	//            llvm.coro.resume(handle);
4102 	//            return true;
4103 	//        }
4104 	//    }
4105 	//
4106 	{
4107 		auto args = jit->coroutine.await->arg_begin();
4108 		auto handle = args++;
4109 		auto outPtr = args++;
4110 		jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "co_await", jit->coroutine.await));
4111 		auto doneBlock = llvm::BasicBlock::Create(*jit->context, "done", jit->coroutine.await);
4112 		auto resumeBlock = llvm::BasicBlock::Create(*jit->context, "resume", jit->coroutine.await);
4113 
4114 		auto done = jit->builder->CreateCall(coro_done, { handle }, "done");
4115 		jit->builder->CreateCondBr(done, doneBlock, resumeBlock);
4116 
4117 		jit->builder->SetInsertPoint(doneBlock);
4118 		jit->builder->CreateRet(llvm::ConstantInt::getFalse(i1Ty));
4119 
4120 		jit->builder->SetInsertPoint(resumeBlock);
4121 		auto promiseAlignment = llvm::ConstantInt::get(i32Ty, 4);  // TODO: Get correct alignment.
4122 		auto promisePtr = jit->builder->CreateCall(coro_promise, { handle, promiseAlignment, llvm::ConstantInt::get(i1Ty, 0) });
4123 		auto promise = jit->builder->CreateLoad(jit->builder->CreatePointerCast(promisePtr, promisePtrTy));
4124 		jit->builder->CreateStore(promise, outPtr);
4125 		jit->builder->CreateCall(coro_resume, { handle });
4126 		jit->builder->CreateRet(llvm::ConstantInt::getTrue(i1Ty));
4127 	}
4128 
4129 	// Build the coroutine_destroy() function:
4130 	//
4131 	//    void coroutine_destroy(CoroutineHandle* handle)
4132 	//    {
4133 	//        llvm.coro.destroy(handle);
4134 	//    }
4135 	//
4136 	{
4137 		auto handle = jit->coroutine.destroy->arg_begin();
4138 		jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "", jit->coroutine.destroy));
4139 		jit->builder->CreateCall(coro_destroy, { handle });
4140 		jit->builder->CreateRetVoid();
4141 	}
4142 
4143 	// Begin building the main coroutine_begin() function.
4144 	//
4145 	//    CoroutineHandle* coroutine_begin(<Arguments>)
4146 	//    {
4147 	//        YieldType promise;
4148 	//        auto id = llvm.coro.id(0, &promise, nullptr, nullptr);
4149 	//        void* frame = coroutine_alloc_frame(llvm.coro.size.i32());
4150 	//        CoroutineHandle *handle = llvm.coro.begin(id, frame);
4151 	//
4152 	//        ... <REACTOR CODE> ...
4153 	//
4154 	//    end:
4155 	//        SuspendAction action = llvm.coro.suspend(none, true /* final */);  // <-- RESUME POINT
4156 	//        switch(action)
4157 	//        {
4158 	//        case SuspendActionResume:
4159 	//            UNREACHABLE(); // Illegal to resume after final suspend.
4160 	//        case SuspendActionDestroy:
4161 	//            goto destroy;
4162 	//        default: // (SuspendActionSuspend)
4163 	//            goto suspend;
4164 	//        }
4165 	//
4166 	//    destroy:
4167 	//        coroutine_free_frame(llvm.coro.free(id, handle));
4168 	//        goto suspend;
4169 	//
4170 	//    suspend:
4171 	//        llvm.coro.end(handle, false);
4172 	//        return handle;
4173 	//    }
4174 	//
4175 
4176 #ifdef ENABLE_RR_DEBUG_INFO
4177 	jit->debugInfo = std::make_unique<rr::DebugInfo>(jit->builder.get(), jit->context.get(), jit->module.get(), jit->function);
4178 #endif  // ENABLE_RR_DEBUG_INFO
4179 
4180 	jit->coroutine.suspendBlock = llvm::BasicBlock::Create(*jit->context, "suspend", jit->function);
4181 	jit->coroutine.endBlock = llvm::BasicBlock::Create(*jit->context, "end", jit->function);
4182 	jit->coroutine.destroyBlock = llvm::BasicBlock::Create(*jit->context, "destroy", jit->function);
4183 
4184 	jit->builder->SetInsertPoint(jit->coroutine.entryBlock, jit->coroutine.entryBlock->begin());
4185 	jit->coroutine.promise = jit->builder->CreateAlloca(promiseTy, nullptr, "promise");
4186 	jit->coroutine.id = jit->builder->CreateCall(coro_id, {
4187 	                                                          llvm::ConstantInt::get(i32Ty, 0),
4188 	                                                          jit->builder->CreatePointerCast(jit->coroutine.promise, i8PtrTy),
4189 	                                                          llvm::ConstantPointerNull::get(i8PtrTy),
4190 	                                                          llvm::ConstantPointerNull::get(i8PtrTy),
4191 	                                                      });
4192 	auto size = jit->builder->CreateCall(coro_size, {});
4193 	auto frame = jit->builder->CreateCall(allocFrame, { size });
4194 	jit->coroutine.handle = jit->builder->CreateCall(coro_begin, { jit->coroutine.id, frame });
4195 
4196 	// Build the suspend block
4197 	jit->builder->SetInsertPoint(jit->coroutine.suspendBlock);
4198 	jit->builder->CreateCall(coro_end, { jit->coroutine.handle, llvm::ConstantInt::get(i1Ty, 0) });
4199 	jit->builder->CreateRet(jit->coroutine.handle);
4200 
4201 	// Build the end block
4202 	jit->builder->SetInsertPoint(jit->coroutine.endBlock);
4203 	auto action = jit->builder->CreateCall(coro_suspend, {
4204 	                                                         llvm::ConstantTokenNone::get(*jit->context),
4205 	                                                         llvm::ConstantInt::get(i1Ty, 1),  // final: true
4206 	                                                     });
4207 	auto switch_ = jit->builder->CreateSwitch(action, jit->coroutine.suspendBlock, 3);
4208 	// switch_->addCase(llvm::ConstantInt::get(i8Ty, SuspendActionResume), trapBlock); // TODO: Trap attempting to resume after final suspend
4209 	switch_->addCase(llvm::ConstantInt::get(i8Ty, SuspendActionDestroy), jit->coroutine.destroyBlock);
4210 
4211 	// Build the destroy block
4212 	jit->builder->SetInsertPoint(jit->coroutine.destroyBlock);
4213 	auto memory = jit->builder->CreateCall(coro_free, { jit->coroutine.id, jit->coroutine.handle });
4214 	jit->builder->CreateCall(freeFrame, { memory });
4215 	jit->builder->CreateBr(jit->coroutine.suspendBlock);
4216 
4217 	// Switch back to original insert point to continue building the coroutine.
4218 	jit->builder->restoreIP(oldInsertionPoint);
4219 }
4220 
4221 }  // anonymous namespace
4222 
4223 namespace rr {
4224 
void Nucleus::createCoroutine(Type *YieldType, const std::vector<Type *> &Params)
4226 {
4227 	// Coroutines are initially created as a regular function.
4228 	// Upon the first call to Yield(), the function is promoted to a true
4229 	// coroutine.
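	// Until then no llvm.coro.* intrinsics are emitted, so acquireCoroutine() can
	// still finalize this as an ordinary function that returns a null handle.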
4230 	auto voidTy = llvm::Type::getVoidTy(*jit->context);
4231 	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
4232 	auto i8PtrTy = llvm::Type::getInt8PtrTy(*jit->context);
4233 	auto handleTy = i8PtrTy;
4234 	auto boolTy = i1Ty;
4235 	auto promiseTy = T(YieldType);
4236 	auto promisePtrTy = promiseTy->getPointerTo();
4237 
4238 	jit->function = rr::createFunction("coroutine_begin", handleTy, T(Params));
4239 	jit->coroutine.await = rr::createFunction("coroutine_await", boolTy, { handleTy, promisePtrTy });
4240 	jit->coroutine.destroy = rr::createFunction("coroutine_destroy", voidTy, { handleTy });
4241 	jit->coroutine.yieldType = promiseTy;
4242 	jit->coroutine.entryBlock = llvm::BasicBlock::Create(*jit->context, "function", jit->function);
4243 
4244 	jit->builder->SetInsertPoint(jit->coroutine.entryBlock);
4245 }
4246 
void Nucleus::yield(Value *val)
4248 {
4249 	if(jit->coroutine.id == nullptr)
4250 	{
4251 		// First call to yield().
4252 		// Promote the function to a full coroutine.
4253 		promoteFunctionToCoroutine();
4254 		ASSERT(jit->coroutine.id != nullptr);
4255 	}
4256 
4257 	//      promise = val;
4258 	//
4259 	//      auto action = llvm.coro.suspend(none, false /* final */); // <-- RESUME POINT
4260 	//      switch(action)
4261 	//      {
4262 	//      case SuspendActionResume:
4263 	//          goto resume;
4264 	//      case SuspendActionDestroy:
4265 	//          goto destroy;
4266 	//      default: // (SuspendActionSuspend)
4267 	//          goto suspend;
4268 	//      }
4269 	//  resume:
4270 	//
4271 
4272 	RR_DEBUG_INFO_UPDATE_LOC();
4273 	Variable::materializeAll();
4274 
4275 	// Types
4276 	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
4277 	auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
4278 
4279 	// Intrinsics
4280 	auto coro_suspend = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_suspend);
4281 
4282 	// Create a block to resume execution.
4283 	auto resumeBlock = llvm::BasicBlock::Create(*jit->context, "resume", jit->function);
4284 
4285 	// Store the promise (yield value)
4286 	jit->builder->CreateStore(V(val), jit->coroutine.promise);
4287 	auto action = jit->builder->CreateCall(coro_suspend, {
4288 	                                                         llvm::ConstantTokenNone::get(*jit->context),
	                                                         llvm::ConstantInt::get(i1Ty, 0),  // final: false
4290 	                                                     });
4291 	auto switch_ = jit->builder->CreateSwitch(action, jit->coroutine.suspendBlock, 3);
4292 	switch_->addCase(llvm::ConstantInt::get(i8Ty, SuspendActionResume), resumeBlock);
4293 	switch_->addCase(llvm::ConstantInt::get(i8Ty, SuspendActionDestroy), jit->coroutine.destroyBlock);
4294 
4295 	// Continue building in the resume block.
4296 	jit->builder->SetInsertPoint(resumeBlock);
4297 }
4298 
std::shared_ptr<Routine> Nucleus::acquireCoroutine(const char *name, const Config::Edit &cfgEdit /* = Config::Edit::None */)
4300 {
4301 	bool isCoroutine = jit->coroutine.id != nullptr;
4302 	if(isCoroutine)
4303 	{
4304 		jit->builder->CreateBr(jit->coroutine.endBlock);
4305 	}
4306 	else
4307 	{
4308 		// Coroutine without a Yield acts as a regular function.
4309 		// The 'coroutine_begin' function returns a nullptr for the coroutine
4310 		// handle.
4311 		jit->builder->CreateRet(llvm::Constant::getNullValue(jit->function->getReturnType()));
4312 		// The 'coroutine_await' function always returns false (coroutine done).
4313 		jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "", jit->coroutine.await));
4314 		jit->builder->CreateRet(llvm::Constant::getNullValue(jit->coroutine.await->getReturnType()));
		// The 'coroutine_destroy' function does nothing and returns void.
4316 		jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "", jit->coroutine.destroy));
4317 		jit->builder->CreateRetVoid();
4318 	}
4319 
4320 #ifdef ENABLE_RR_DEBUG_INFO
4321 	if(jit->debugInfo != nullptr)
4322 	{
4323 		jit->debugInfo->Finalize();
4324 	}
4325 #endif  // ENABLE_RR_DEBUG_INFO
4326 
4327 	if(false)
4328 	{
4329 		std::error_code error;
4330 		llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-unopt.txt", error);
4331 		jit->module->print(file, 0);
4332 	}
4333 
4334 	if(isCoroutine)
4335 	{
		// Run mandatory coroutine transforms.
4337 		llvm::legacy::PassManager pm;
4338 
4339 		pm.add(llvm::createCoroEarlyLegacyPass());
4340 		pm.add(llvm::createCoroSplitLegacyPass());
4341 		pm.add(llvm::createCoroElideLegacyPass());
4342 		pm.add(llvm::createBarrierNoopPass());
4343 		pm.add(llvm::createCoroCleanupLegacyPass());
4344 
4345 		pm.run(*jit->module);
4346 	}
4347 
4348 #if defined(ENABLE_RR_LLVM_IR_VERIFICATION) || !defined(NDEBUG)
4349 	{
4350 		llvm::legacy::PassManager pm;
4351 		pm.add(llvm::createVerifierPass());
4352 		pm.run(*jit->module);
4353 	}
4354 #endif  // defined(ENABLE_RR_LLVM_IR_VERIFICATION) || !defined(NDEBUG)
4355 
4356 	auto cfg = cfgEdit.apply(jit->config);
4357 	jit->optimize(cfg);
4358 
4359 	if(false)
4360 	{
4361 		std::error_code error;
4362 		llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-opt.txt", error);
4363 		jit->module->print(file, 0);
4364 	}
4365 
4366 	llvm::Function *funcs[Nucleus::CoroutineEntryCount];
4367 	funcs[Nucleus::CoroutineEntryBegin] = jit->function;
4368 	funcs[Nucleus::CoroutineEntryAwait] = jit->coroutine.await;
4369 	funcs[Nucleus::CoroutineEntryDestroy] = jit->coroutine.destroy;
4370 
4371 	auto routine = jit->acquireRoutine(name, funcs, Nucleus::CoroutineEntryCount, cfg);
4372 
4373 	delete jit;
4374 	jit = nullptr;
4375 
4376 	return routine;
4377 }
4378 
Nucleus::CoroutineHandle Nucleus::invokeCoroutineBegin(Routine &routine, std::function<Nucleus::CoroutineHandle()> func)
4380 {
4381 	return func();
4382 }
4383 
4384 }  // namespace rr
4385