// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "VertexRoutine.hpp"

#include "Constants.hpp"
#include "SpirvShader.hpp"
#include "Device/Renderer.hpp"
#include "Device/Vertex.hpp"
#include "System/Debug.hpp"
#include "System/Half.hpp"

namespace sw {

VertexRoutine::VertexRoutine(
    const VertexProcessor::State &state,
    vk::PipelineLayout const *pipelineLayout,
    SpirvShader const *spirvShader)
    : routine(pipelineLayout)
    , state(state)
    , spirvShader(spirvShader)
{
	spirvShader->emitProlog(&routine);
}

VertexRoutine::~VertexRoutine()
{
}

void VertexRoutine::generate()
{
	Pointer<Byte> cache = task + OFFSET(VertexTask, vertexCache);
	Pointer<Byte> vertexCache = cache + OFFSET(VertexCache, vertex);
	Pointer<UInt> tagCache = Pointer<UInt>(cache + OFFSET(VertexCache, tag));

	UInt vertexCount = *Pointer<UInt>(task + OFFSET(VertexTask, vertexCount));

	constants = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData, constants));

	// Check the cache one vertex index at a time. If a hit occurs, copy from the cache to the 'vertex' output buffer.
	// On a cache miss, process a SIMD width of consecutive indices from the input batch. They're written to the cache
	// in reverse order to guarantee that the first one doesn't get evicted and can be written out.

	Do
	{
		UInt index = *batch;
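		// The vertex cache is direct-mapped: the low bits of the vertex index select the entry to check.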
		UInt cacheIndex = index & VertexCache::TAG_MASK;

		If(tagCache[cacheIndex] != index)
		{
			readInput(batch);
			program(batch, vertexCount);
			computeClipFlags();
			computeCullMask();

			writeCache(vertexCache, tagCache, batch);
		}

		Pointer<Byte> cacheEntry = vertexCache + cacheIndex * UInt((int)sizeof(Vertex));

		// For points, each primitive has a single vertex, so duplicate it for all three vertices of the primitive.
		for(int i = 0; i < (state.isPoint ? 3 : 1); i++)
		{
			writeVertex(vertex, cacheEntry);
			vertex += sizeof(Vertex);
		}

		batch = Pointer<UInt>(Pointer<Byte>(batch) + sizeof(uint32_t));
		vertexCount--;
	}
	Until(vertexCount == 0);

	Return();
}

void VertexRoutine::readInput(Pointer<UInt> &batch)
{
	for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i += 4)
	{
		if(spirvShader->inputs[i + 0].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
		   spirvShader->inputs[i + 1].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
		   spirvShader->inputs[i + 2].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
		   spirvShader->inputs[i + 3].Type != SpirvShader::ATTRIBTYPE_UNUSED)
		{
			Pointer<Byte> input = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData, input) + sizeof(void *) * (i / 4));
			UInt stride = *Pointer<UInt>(data + OFFSET(DrawData, stride) + sizeof(uint32_t) * (i / 4));
			Int baseVertex = *Pointer<Int>(data + OFFSET(DrawData, baseVertex));
			UInt robustnessSize(0);
			if(state.robustBufferAccess)
			{
				robustnessSize = *Pointer<UInt>(data + OFFSET(DrawData, robustnessSize) + sizeof(uint32_t) * (i / 4));
			}

			auto value = readStream(input, stride, state.input[i / 4], batch, state.robustBufferAccess, robustnessSize, baseVertex);
			routine.inputs[i + 0] = value.x;
			routine.inputs[i + 1] = value.y;
			routine.inputs[i + 2] = value.z;
			routine.inputs[i + 3] = value.w;
		}
	}
}

void VertexRoutine::computeClipFlags()
{
	auto it = spirvShader->outputBuiltins.find(spv::BuiltInPosition);
	if(it != spirvShader->outputBuiltins.end())
	{
		assert(it->second.SizeInComponents == 4);
		auto &pos = routine.getVariable(it->second.Id);
		auto posX = pos[it->second.FirstComponent + 0];
		auto posY = pos[it->second.FirstComponent + 1];
		auto posZ = pos[it->second.FirstComponent + 2];
		auto posW = pos[it->second.FirstComponent + 3];

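		// A vertex lies outside a clip plane when x > w, y > w, z > w, x < -w, y < -w, or z < 0
		// (Vulkan clip space has a [0, w] depth range). Each comparison below yields a per-lane mask.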
		Int4 maxX = CmpLT(posW, posX);
		Int4 maxY = CmpLT(posW, posY);
		Int4 maxZ = CmpLT(posW, posZ);
		Int4 minX = CmpNLE(-posW, posX);
		Int4 minY = CmpNLE(-posW, posY);
		Int4 minZ = CmpNLE(Float4(0.0f), posZ);

		clipFlags = Pointer<Int>(constants + OFFSET(Constants, maxX))[SignMask(maxX)];
		clipFlags |= Pointer<Int>(constants + OFFSET(Constants, maxY))[SignMask(maxY)];
		clipFlags |= Pointer<Int>(constants + OFFSET(Constants, maxZ))[SignMask(maxZ)];
		clipFlags |= Pointer<Int>(constants + OFFSET(Constants, minX))[SignMask(minX)];
		clipFlags |= Pointer<Int>(constants + OFFSET(Constants, minY))[SignMask(minY)];
		clipFlags |= Pointer<Int>(constants + OFFSET(Constants, minZ))[SignMask(minZ)];

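		// Also flag positions with non-finite components (infinity or NaN). 0x7F7FFFFF is the bit
		// pattern of FLT_MAX, so Abs(pos) <= maxPos holds only for finite values.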
		Float4 maxPos = As<Float4>(Int4(0x7F7FFFFF));
		Int4 finiteX = CmpLE(Abs(posX), maxPos);
		Int4 finiteY = CmpLE(Abs(posY), maxPos);
		Int4 finiteZ = CmpLE(Abs(posZ), maxPos);

		Int4 finiteXYZ = finiteX & finiteY & finiteZ;
		clipFlags |= Pointer<Int>(constants + OFFSET(Constants, fini))[SignMask(finiteXYZ)];
	}
}

void VertexRoutine::computeCullMask()
{
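	// Start with all four SIMD lanes visible; a lane gets culled when any of its cull distances is negative.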
	cullMask = Int(15);

	auto it = spirvShader->outputBuiltins.find(spv::BuiltInCullDistance);
	if(it != spirvShader->outputBuiltins.end())
	{
		auto count = spirvShader->getNumOutputCullDistances();
		for(uint32_t i = 0; i < count; i++)
		{
			auto const &distance = routine.getVariable(it->second.Id)[it->second.FirstComponent + i];
			auto mask = SignMask(CmpGE(distance, SIMD::Float(0)));
			cullMask &= mask;
		}
	}
}

Vector4f VertexRoutine::readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, Pointer<UInt> &batch,
                                   bool robustBufferAccess, UInt &robustnessSize, Int baseVertex)
{
	Vector4f v;
	// Because of the following rule in the Vulkan spec, we do not care if a very large negative
	// baseVertex would overflow all the way back into a valid region of the index buffer:
	// "Out-of-bounds buffer loads will return any of the following values:
	//  - Values from anywhere within the memory range(s) bound to the buffer (possibly including
	//    bytes of memory past the end of the buffer, up to the end of the bound range)."
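	// Gather per-lane byte offsets: (vertex index + baseVertex) * stride, for each of the four vertices.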
	UInt4 offsets = (*Pointer<UInt4>(As<Pointer<UInt4>>(batch)) + As<UInt4>(Int4(baseVertex))) * UInt4(stride);

	Pointer<Byte> source0 = buffer + offsets.x;
	Pointer<Byte> source1 = buffer + offsets.y;
	Pointer<Byte> source2 = buffer + offsets.z;
	Pointer<Byte> source3 = buffer + offsets.w;

	vk::Format format(stream.format);

	UInt4 zero(0);
	if(robustBufferAccess)
	{
		// TODO(b/141124876): Optimize for wide-vector gather operations.
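		// Compare the end of each fetch against the bound range; lanes that would read past it are
		// redirected to a local zeroed value, which robust buffer access permits returning.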
		UInt4 limits = offsets + UInt4(format.bytes());
		Pointer<Byte> zeroSource = As<Pointer<Byte>>(&zero);
		source0 = IfThenElse(limits.x <= robustnessSize, source0, zeroSource);
		source1 = IfThenElse(limits.y <= robustnessSize, source1, zeroSource);
		source2 = IfThenElse(limits.z <= robustnessSize, source2, zeroSource);
		source3 = IfThenElse(limits.w <= robustnessSize, source3, zeroSource);
	}

	int componentCount = format.componentCount();
	bool normalized = !format.isUnnormalizedInteger();
	bool isNativeFloatAttrib = (stream.attribType == SpirvShader::ATTRIBTYPE_FLOAT) || normalized;
	bool bgra = false;

	switch(stream.format)
	{
		case VK_FORMAT_R32_SFLOAT:
		case VK_FORMAT_R32G32_SFLOAT:
		case VK_FORMAT_R32G32B32_SFLOAT:
		case VK_FORMAT_R32G32B32A32_SFLOAT:
		{
			if(componentCount == 0)
			{
				// Null stream, all default components
			}
			else
			{
				if(componentCount == 1)
				{
					v.x.x = *Pointer<Float>(source0);
					v.x.y = *Pointer<Float>(source1);
					v.x.z = *Pointer<Float>(source2);
					v.x.w = *Pointer<Float>(source3);
				}
				else
				{
					v.x = *Pointer<Float4>(source0);
					v.y = *Pointer<Float4>(source1);
					v.z = *Pointer<Float4>(source2);
					v.w = *Pointer<Float4>(source3);

					transpose4xN(v.x, v.y, v.z, v.w, componentCount);
				}
			}
		}
		break;
		case VK_FORMAT_B8G8R8A8_UNORM:
			bgra = true;
			// [[fallthrough]]
		case VK_FORMAT_R8_UNORM:
		case VK_FORMAT_R8G8_UNORM:
		case VK_FORMAT_R8G8B8A8_UNORM:
		case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
			v.x = Float4(*Pointer<Byte4>(source0));
			v.y = Float4(*Pointer<Byte4>(source1));
			v.z = Float4(*Pointer<Byte4>(source2));
			v.w = Float4(*Pointer<Byte4>(source3));

			transpose4xN(v.x, v.y, v.z, v.w, componentCount);

			if(componentCount >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleByte));
			if(componentCount >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleByte));
			if(componentCount >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleByte));
			if(componentCount >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleByte));
			break;
		case VK_FORMAT_R8_UINT:
		case VK_FORMAT_R8G8_UINT:
		case VK_FORMAT_R8G8B8A8_UINT:
		case VK_FORMAT_A8B8G8R8_UINT_PACK32:
			v.x = As<Float4>(Int4(*Pointer<Byte4>(source0)));
			v.y = As<Float4>(Int4(*Pointer<Byte4>(source1)));
			v.z = As<Float4>(Int4(*Pointer<Byte4>(source2)));
			v.w = As<Float4>(Int4(*Pointer<Byte4>(source3)));

			transpose4xN(v.x, v.y, v.z, v.w, componentCount);
			break;
		case VK_FORMAT_R8_SNORM:
		case VK_FORMAT_R8G8_SNORM:
		case VK_FORMAT_R8G8B8A8_SNORM:
		case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
			v.x = Float4(*Pointer<SByte4>(source0));
			v.y = Float4(*Pointer<SByte4>(source1));
			v.z = Float4(*Pointer<SByte4>(source2));
			v.w = Float4(*Pointer<SByte4>(source3));

			transpose4xN(v.x, v.y, v.z, v.w, componentCount);

			if(componentCount >= 1) v.x = Max(v.x * *Pointer<Float4>(constants + OFFSET(Constants, unscaleSByte)), Float4(-1.0f));
			if(componentCount >= 2) v.y = Max(v.y * *Pointer<Float4>(constants + OFFSET(Constants, unscaleSByte)), Float4(-1.0f));
			if(componentCount >= 3) v.z = Max(v.z * *Pointer<Float4>(constants + OFFSET(Constants, unscaleSByte)), Float4(-1.0f));
			if(componentCount >= 4) v.w = Max(v.w * *Pointer<Float4>(constants + OFFSET(Constants, unscaleSByte)), Float4(-1.0f));
			break;
		case VK_FORMAT_R8_SINT:
		case VK_FORMAT_R8G8_SINT:
		case VK_FORMAT_R8G8B8A8_SINT:
		case VK_FORMAT_A8B8G8R8_SINT_PACK32:
			v.x = As<Float4>(Int4(*Pointer<SByte4>(source0)));
			v.y = As<Float4>(Int4(*Pointer<SByte4>(source1)));
			v.z = As<Float4>(Int4(*Pointer<SByte4>(source2)));
			v.w = As<Float4>(Int4(*Pointer<SByte4>(source3)));

			transpose4xN(v.x, v.y, v.z, v.w, componentCount);
			break;
		case VK_FORMAT_R16_SNORM:
		case VK_FORMAT_R16G16_SNORM:
		case VK_FORMAT_R16G16B16A16_SNORM:
			v.x = Float4(*Pointer<Short4>(source0));
			v.y = Float4(*Pointer<Short4>(source1));
			v.z = Float4(*Pointer<Short4>(source2));
			v.w = Float4(*Pointer<Short4>(source3));

			transpose4xN(v.x, v.y, v.z, v.w, componentCount);

			if(componentCount >= 1) v.x = Max(v.x * *Pointer<Float4>(constants + OFFSET(Constants, unscaleShort)), Float4(-1.0f));
			if(componentCount >= 2) v.y = Max(v.y * *Pointer<Float4>(constants + OFFSET(Constants, unscaleShort)), Float4(-1.0f));
			if(componentCount >= 3) v.z = Max(v.z * *Pointer<Float4>(constants + OFFSET(Constants, unscaleShort)), Float4(-1.0f));
			if(componentCount >= 4) v.w = Max(v.w * *Pointer<Float4>(constants + OFFSET(Constants, unscaleShort)), Float4(-1.0f));
			break;
		case VK_FORMAT_R16_SINT:
		case VK_FORMAT_R16G16_SINT:
		case VK_FORMAT_R16G16B16A16_SINT:
			v.x = As<Float4>(Int4(*Pointer<Short4>(source0)));
			v.y = As<Float4>(Int4(*Pointer<Short4>(source1)));
			v.z = As<Float4>(Int4(*Pointer<Short4>(source2)));
			v.w = As<Float4>(Int4(*Pointer<Short4>(source3)));

			transpose4xN(v.x, v.y, v.z, v.w, componentCount);
			break;
		case VK_FORMAT_R16_UNORM:
		case VK_FORMAT_R16G16_UNORM:
		case VK_FORMAT_R16G16B16A16_UNORM:
			v.x = Float4(*Pointer<UShort4>(source0));
			v.y = Float4(*Pointer<UShort4>(source1));
			v.z = Float4(*Pointer<UShort4>(source2));
			v.w = Float4(*Pointer<UShort4>(source3));

			transpose4xN(v.x, v.y, v.z, v.w, componentCount);

			if(componentCount >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUShort));
			if(componentCount >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUShort));
			if(componentCount >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUShort));
			if(componentCount >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUShort));
			break;
		case VK_FORMAT_R16_UINT:
		case VK_FORMAT_R16G16_UINT:
		case VK_FORMAT_R16G16B16A16_UINT:
			v.x = As<Float4>(Int4(*Pointer<UShort4>(source0)));
			v.y = As<Float4>(Int4(*Pointer<UShort4>(source1)));
			v.z = As<Float4>(Int4(*Pointer<UShort4>(source2)));
			v.w = As<Float4>(Int4(*Pointer<UShort4>(source3)));

			transpose4xN(v.x, v.y, v.z, v.w, componentCount);
			break;
		case VK_FORMAT_R32_SINT:
		case VK_FORMAT_R32G32_SINT:
		case VK_FORMAT_R32G32B32_SINT:
		case VK_FORMAT_R32G32B32A32_SINT:
			v.x = *Pointer<Float4>(source0);
			v.y = *Pointer<Float4>(source1);
			v.z = *Pointer<Float4>(source2);
			v.w = *Pointer<Float4>(source3);

			transpose4xN(v.x, v.y, v.z, v.w, componentCount);
			break;
		case VK_FORMAT_R32_UINT:
		case VK_FORMAT_R32G32_UINT:
		case VK_FORMAT_R32G32B32_UINT:
		case VK_FORMAT_R32G32B32A32_UINT:
			v.x = *Pointer<Float4>(source0);
			v.y = *Pointer<Float4>(source1);
			v.z = *Pointer<Float4>(source2);
			v.w = *Pointer<Float4>(source3);

			transpose4xN(v.x, v.y, v.z, v.w, componentCount);
			break;
		case VK_FORMAT_R16_SFLOAT:
		case VK_FORMAT_R16G16_SFLOAT:
		case VK_FORMAT_R16G16B16A16_SFLOAT:
		{
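			// Half floats are widened through the 64K-entry half2float lookup table in Constants,
			// indexed by the raw 16-bit value (4 bytes per float entry).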
			if(componentCount >= 1)
			{
				UShort x0 = *Pointer<UShort>(source0 + 0);
				UShort x1 = *Pointer<UShort>(source1 + 0);
				UShort x2 = *Pointer<UShort>(source2 + 0);
				UShort x3 = *Pointer<UShort>(source3 + 0);

				v.x.x = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(x0) * 4);
				v.x.y = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(x1) * 4);
				v.x.z = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(x2) * 4);
				v.x.w = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(x3) * 4);
			}

			if(componentCount >= 2)
			{
				UShort y0 = *Pointer<UShort>(source0 + 2);
				UShort y1 = *Pointer<UShort>(source1 + 2);
				UShort y2 = *Pointer<UShort>(source2 + 2);
				UShort y3 = *Pointer<UShort>(source3 + 2);

				v.y.x = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(y0) * 4);
				v.y.y = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(y1) * 4);
				v.y.z = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(y2) * 4);
				v.y.w = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(y3) * 4);
			}

			if(componentCount >= 3)
			{
				UShort z0 = *Pointer<UShort>(source0 + 4);
				UShort z1 = *Pointer<UShort>(source1 + 4);
				UShort z2 = *Pointer<UShort>(source2 + 4);
				UShort z3 = *Pointer<UShort>(source3 + 4);

				v.z.x = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(z0) * 4);
				v.z.y = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(z1) * 4);
				v.z.z = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(z2) * 4);
				v.z.w = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(z3) * 4);
			}

			if(componentCount >= 4)
			{
				UShort w0 = *Pointer<UShort>(source0 + 6);
				UShort w1 = *Pointer<UShort>(source1 + 6);
				UShort w2 = *Pointer<UShort>(source2 + 6);
				UShort w3 = *Pointer<UShort>(source3 + 6);

				v.w.x = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(w0) * 4);
				v.w.y = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(w1) * 4);
				v.w.z = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(w2) * 4);
				v.w.w = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(w3) * 4);
			}
		}
		break;
		case VK_FORMAT_A2R10G10B10_SNORM_PACK32:
			bgra = true;
			// [[fallthrough]]
		case VK_FORMAT_A2B10G10R10_SNORM_PACK32:
		{
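			// Shifting left and then arithmetically right sign-extends each 10-bit component
			// (and the 2-bit alpha) before scaling to the [-1, 1] SNORM range.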
			Int4 src;
			src = Insert(src, *Pointer<Int>(source0), 0);
			src = Insert(src, *Pointer<Int>(source1), 1);
			src = Insert(src, *Pointer<Int>(source2), 2);
			src = Insert(src, *Pointer<Int>(source3), 3);
			v.x = Float4((src << 22) >> 22);
			v.y = Float4((src << 12) >> 22);
			v.z = Float4((src << 2) >> 22);
			v.w = Float4(src >> 30);

			v.x = Max(v.x * Float4(1.0f / 0x1FF), Float4(-1.0f));
			v.y = Max(v.y * Float4(1.0f / 0x1FF), Float4(-1.0f));
			v.z = Max(v.z * Float4(1.0f / 0x1FF), Float4(-1.0f));
			v.w = Max(v.w, Float4(-1.0f));
		}
		break;
		case VK_FORMAT_A2R10G10B10_SINT_PACK32:
			bgra = true;
			// [[fallthrough]]
		case VK_FORMAT_A2B10G10R10_SINT_PACK32:
		{
			Int4 src;
			src = Insert(src, *Pointer<Int>(source0), 0);
			src = Insert(src, *Pointer<Int>(source1), 1);
			src = Insert(src, *Pointer<Int>(source2), 2);
			src = Insert(src, *Pointer<Int>(source3), 3);
			v.x = As<Float4>((src << 22) >> 22);
			v.y = As<Float4>((src << 12) >> 22);
			v.z = As<Float4>((src << 2) >> 22);
			v.w = As<Float4>(src >> 30);
		}
		break;
		case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
			bgra = true;
			// [[fallthrough]]
		case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
		{
			Int4 src;
			src = Insert(src, *Pointer<Int>(source0), 0);
			src = Insert(src, *Pointer<Int>(source1), 1);
			src = Insert(src, *Pointer<Int>(source2), 2);
			src = Insert(src, *Pointer<Int>(source3), 3);

			v.x = Float4(src & Int4(0x3FF));
			v.y = Float4((src >> 10) & Int4(0x3FF));
			v.z = Float4((src >> 20) & Int4(0x3FF));
			v.w = Float4((src >> 30) & Int4(0x3));

			v.x *= Float4(1.0f / 0x3FF);
			v.y *= Float4(1.0f / 0x3FF);
			v.z *= Float4(1.0f / 0x3FF);
			v.w *= Float4(1.0f / 0x3);
		}
		break;
		case VK_FORMAT_A2R10G10B10_UINT_PACK32:
			bgra = true;
			// [[fallthrough]]
		case VK_FORMAT_A2B10G10R10_UINT_PACK32:
		{
			Int4 src;
			src = Insert(src, *Pointer<Int>(source0), 0);
			src = Insert(src, *Pointer<Int>(source1), 1);
			src = Insert(src, *Pointer<Int>(source2), 2);
			src = Insert(src, *Pointer<Int>(source3), 3);

			v.x = As<Float4>(src & Int4(0x3FF));
			v.y = As<Float4>((src >> 10) & Int4(0x3FF));
			v.z = As<Float4>((src >> 20) & Int4(0x3FF));
			v.w = As<Float4>((src >> 30) & Int4(0x3));
		}
		break;
		default:
			UNSUPPORTED("stream.format %d", int(stream.format));
	}

	if(bgra)
	{
		// Swap the red and blue components.
		Float4 t = v.x;
		v.x = v.z;
		v.z = t;
	}

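	// Components not supplied by the stream default to (0, 0, 0, 1), with an integer 1 for
	// non-float attribute types.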
	if(componentCount < 1) v.x = Float4(0.0f);
	if(componentCount < 2) v.y = Float4(0.0f);
	if(componentCount < 3) v.z = Float4(0.0f);
	if(componentCount < 4) v.w = isNativeFloatAttrib ? As<Float4>(Float4(1.0f)) : As<Float4>(Int4(1));

	return v;
}

void VertexRoutine::writeCache(Pointer<Byte> &vertexCache, Pointer<UInt> &tagCache, Pointer<UInt> &batch)
{
	UInt index0 = batch[0];
	UInt index1 = batch[1];
	UInt index2 = batch[2];
	UInt index3 = batch[3];

	UInt cacheIndex0 = index0 & VertexCache::TAG_MASK;
	UInt cacheIndex1 = index1 & VertexCache::TAG_MASK;
	UInt cacheIndex2 = index2 & VertexCache::TAG_MASK;
	UInt cacheIndex3 = index3 & VertexCache::TAG_MASK;

	// We processed a SIMD group of vertices, with the first one being the one that missed the cache tag check.
	// Write them out in reverse order here and below to ensure the first one is now guaranteed to be in the cache.
	tagCache[cacheIndex3] = index3;
	tagCache[cacheIndex2] = index2;
	tagCache[cacheIndex1] = index1;
	tagCache[cacheIndex0] = index0;

	auto it = spirvShader->outputBuiltins.find(spv::BuiltInPosition);
	if(it != spirvShader->outputBuiltins.end())
	{
		assert(it->second.SizeInComponents == 4);
		auto &position = routine.getVariable(it->second.Id);

		Vector4f pos;
		pos.x = position[it->second.FirstComponent + 0];
		pos.y = position[it->second.FirstComponent + 1];
		pos.z = position[it->second.FirstComponent + 2];
		pos.w = position[it->second.FirstComponent + 3];

		// Projection and viewport transform.
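		// Replace a zero w with ±1.0 (sign preserved, via bitwise OR) so the reciprocal below cannot divide by zero.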
		Float4 w = As<Float4>(As<Int4>(pos.w) | (As<Int4>(CmpEQ(pos.w, Float4(0.0f))) & As<Int4>(Float4(1.0f))));
		Float4 rhw = Float4(1.0f) / w;

		Vector4f proj;
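		// Viewport transform into rounded, clamped fixed-point subpixel coordinates (the X0xF/Y0xF
		// offsets and WxF/HxF scales appear to be prescaled by the subpixel resolution).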
		proj.x = As<Float4>(RoundIntClamped(*Pointer<Float4>(data + OFFSET(DrawData, X0xF)) + pos.x * rhw * *Pointer<Float4>(data + OFFSET(DrawData, WxF))));
		proj.y = As<Float4>(RoundIntClamped(*Pointer<Float4>(data + OFFSET(DrawData, Y0xF)) + pos.y * rhw * *Pointer<Float4>(data + OFFSET(DrawData, HxF))));
		proj.z = pos.z * rhw;
		proj.w = rhw;

		transpose4x4(pos.x, pos.y, pos.z, pos.w);

		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, position), 16) = pos.w;
		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, position), 16) = pos.z;
		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, position), 16) = pos.y;
		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, position), 16) = pos.x;

		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, clipFlags)) = (clipFlags >> 24) & 0xFF;
		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, clipFlags)) = (clipFlags >> 16) & 0xFF;
		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, clipFlags)) = (clipFlags >> 8) & 0xFF;
		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, clipFlags)) = (clipFlags >> 0) & 0xFF;

		transpose4x4(proj.x, proj.y, proj.z, proj.w);

		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, projected), 16) = proj.w;
		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, projected), 16) = proj.z;
		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, projected), 16) = proj.y;
		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, projected), 16) = proj.x;
	}

	it = spirvShader->outputBuiltins.find(spv::BuiltInPointSize);
	if(it != spirvShader->outputBuiltins.end())
	{
		ASSERT(it->second.SizeInComponents == 1);
		auto psize = routine.getVariable(it->second.Id)[it->second.FirstComponent];

		*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, pointSize)) = Extract(psize, 3);
		*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, pointSize)) = Extract(psize, 2);
		*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, pointSize)) = Extract(psize, 1);
		*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, pointSize)) = Extract(psize, 0);
	}

	it = spirvShader->outputBuiltins.find(spv::BuiltInClipDistance);
	if(it != spirvShader->outputBuiltins.end())
	{
		auto count = spirvShader->getNumOutputClipDistances();
		for(unsigned int i = 0; i < count; i++)
		{
			auto dist = routine.getVariable(it->second.Id)[it->second.FirstComponent + i];
			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, clipDistance[i])) = Extract(dist, 3);
			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, clipDistance[i])) = Extract(dist, 2);
			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, clipDistance[i])) = Extract(dist, 1);
			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, clipDistance[i])) = Extract(dist, 0);
		}
	}

	it = spirvShader->outputBuiltins.find(spv::BuiltInCullDistance);
	if(it != spirvShader->outputBuiltins.end())
	{
		auto count = spirvShader->getNumOutputCullDistances();
		for(unsigned int i = 0; i < count; i++)
		{
			auto dist = routine.getVariable(it->second.Id)[it->second.FirstComponent + i];
			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, cullDistance[i])) = Extract(dist, 3);
			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, cullDistance[i])) = Extract(dist, 2);
			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, cullDistance[i])) = Extract(dist, 1);
			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, cullDistance[i])) = Extract(dist, 0);
		}
	}

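	// Expand each bit of the 4-bit cull mask into a full 0 / -1 integer mask for its vertex.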
	*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, cullMask)) = -((cullMask >> 3) & 1);
	*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, cullMask)) = -((cullMask >> 2) & 1);
	*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, cullMask)) = -((cullMask >> 1) & 1);
	*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, cullMask)) = -((cullMask >> 0) & 1);

	for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i += 4)
	{
		if(spirvShader->outputs[i + 0].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
		   spirvShader->outputs[i + 1].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
		   spirvShader->outputs[i + 2].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
		   spirvShader->outputs[i + 3].Type != SpirvShader::ATTRIBTYPE_UNUSED)
		{
			Vector4f v;
			v.x = routine.outputs[i + 0];
			v.y = routine.outputs[i + 1];
			v.z = routine.outputs[i + 2];
			v.w = routine.outputs[i + 3];

			transpose4x4(v.x, v.y, v.z, v.w);

			*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, v[i]), 16) = v.w;
			*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, v[i]), 16) = v.z;
			*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, v[i]), 16) = v.y;
			*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, v[i]), 16) = v.x;
		}
	}
}

void VertexRoutine::writeVertex(const Pointer<Byte> &vertex, Pointer<Byte> &cacheEntry)
{
	*Pointer<Int4>(vertex + OFFSET(Vertex, position)) = *Pointer<Int4>(cacheEntry + OFFSET(Vertex, position));
	*Pointer<Int>(vertex + OFFSET(Vertex, pointSize)) = *Pointer<Int>(cacheEntry + OFFSET(Vertex, pointSize));

	*Pointer<Int>(vertex + OFFSET(Vertex, clipFlags)) = *Pointer<Int>(cacheEntry + OFFSET(Vertex, clipFlags));
	*Pointer<Int>(vertex + OFFSET(Vertex, cullMask)) = *Pointer<Int>(cacheEntry + OFFSET(Vertex, cullMask));
	*Pointer<Int4>(vertex + OFFSET(Vertex, projected)) = *Pointer<Int4>(cacheEntry + OFFSET(Vertex, projected));

	for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i++)
	{
		if(spirvShader->outputs[i].Type != SpirvShader::ATTRIBTYPE_UNUSED)
		{
			*Pointer<Int>(vertex + OFFSET(Vertex, v[i]), 4) = *Pointer<Int>(cacheEntry + OFFSET(Vertex, v[i]), 4);
		}
	}
	for(unsigned int i = 0; i < spirvShader->getNumOutputClipDistances(); i++)
	{
		*Pointer<Float>(vertex + OFFSET(Vertex, clipDistance[i]), 4) = *Pointer<Float>(cacheEntry + OFFSET(Vertex, clipDistance[i]), 4);
	}
	for(unsigned int i = 0; i < spirvShader->getNumOutputCullDistances(); i++)
	{
		*Pointer<Float>(vertex + OFFSET(Vertex, cullDistance[i]), 4) = *Pointer<Float>(cacheEntry + OFFSET(Vertex, cullDistance[i]), 4);
	}
}

}  // namespace sw