1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "VertexRoutine.hpp"
16 
17 #include "VertexShader.hpp"
18 #include "Constants.hpp"
19 #include "Device/Vertex.hpp"
20 #include "Device/Renderer.hpp"
21 #include "System/Half.hpp"
22 #include "Vulkan/VkDebug.hpp"
23 
24 namespace sw
25 {
26 	extern bool halfIntegerCoordinates;     // Pixel centers are not at integer coordinates
27 	extern bool symmetricNormalizedDepth;   // [-1, 1] instead of [0, 1]
28 
VertexRoutine(const VertexProcessor::State & state,const VertexShader * shader)29 	VertexRoutine::VertexRoutine(const VertexProcessor::State &state, const VertexShader *shader)
30 		: v(shader && shader->indirectAddressableInput),
31 		  o(shader && shader->indirectAddressableOutput),
32 		  state(state)
33 	{
34 	}
35 
~VertexRoutine()36 	VertexRoutine::~VertexRoutine()
37 	{
38 	}
39 
generate()40 	void VertexRoutine::generate()
41 	{
42 		const bool textureSampling = state.textureSampling;
43 
44 		Pointer<Byte> cache = task + OFFSET(VertexTask,vertexCache);
45 		Pointer<Byte> vertexCache = cache + OFFSET(VertexCache,vertex);
46 		Pointer<Byte> tagCache = cache + OFFSET(VertexCache,tag);
47 
48 		UInt vertexCount = *Pointer<UInt>(task + OFFSET(VertexTask,vertexCount));
49 		UInt primitiveNumber = *Pointer<UInt>(task + OFFSET(VertexTask, primitiveStart));
50 		UInt indexInPrimitive = 0;
51 
52 		constants = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,constants));
53 
54 		Do
55 		{
56 			UInt index = *Pointer<UInt>(batch);
57 			UInt tagIndex = index & 0x0000003C;
58 			UInt indexQ = !textureSampling ? UInt(index & 0xFFFFFFFC) : index;   // FIXME: TEXLDL hack to have independent LODs, hurts performance.
59 
60 			If(*Pointer<UInt>(tagCache + tagIndex) != indexQ)
61 			{
62 				*Pointer<UInt>(tagCache + tagIndex) = indexQ;
63 
64 				readInput(indexQ);
65 				program(indexQ);
66 				postTransform();
67 				computeClipFlags();
68 
69 				Pointer<Byte> cacheLine0 = vertexCache + tagIndex * UInt((int)sizeof(Vertex));
70 				writeCache(cacheLine0);
71 			}
72 
73 			UInt cacheIndex = index & 0x0000003F;
74 			Pointer<Byte> cacheLine = vertexCache + cacheIndex * UInt((int)sizeof(Vertex));
75 			writeVertex(vertex, cacheLine);
76 
77 			if(state.transformFeedbackEnabled != 0)
78 			{
79 				transformFeedback(vertex, primitiveNumber, indexInPrimitive);
80 
81 				indexInPrimitive++;
82 				If(indexInPrimitive == 3)
83 				{
84 					primitiveNumber++;
85 					indexInPrimitive = 0;
86 				}
87 			}
88 
89 			vertex += sizeof(Vertex);
90 			batch += sizeof(unsigned int);
91 			vertexCount--;
92 		}
93 		Until(vertexCount == 0)
94 
95 		Return();
96 	}
97 
readInput(UInt & index)98 	void VertexRoutine::readInput(UInt &index)
99 	{
100 		for(int i = 0; i < MAX_VERTEX_INPUTS; i++)
101 		{
102 			Pointer<Byte> input = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,input) + sizeof(void*) * i);
103 			UInt stride = *Pointer<UInt>(data + OFFSET(DrawData,stride) + sizeof(unsigned int) * i);
104 
105 			v[i] = readStream(input, stride, state.input[i], index);
106 		}
107 	}
108 
computeClipFlags()109 	void VertexRoutine::computeClipFlags()
110 	{
111 		int pos = state.positionRegister;
112 
113 		Int4 maxX = CmpLT(o[pos].w, o[pos].x);
114 		Int4 maxY = CmpLT(o[pos].w, o[pos].y);
115 		Int4 maxZ = CmpLT(o[pos].w, o[pos].z);
116 		Int4 minX = CmpNLE(-o[pos].w, o[pos].x);
117 		Int4 minY = CmpNLE(-o[pos].w, o[pos].y);
118 		Int4 minZ = CmpNLE(Float4(0.0f), o[pos].z);
119 
120 		clipFlags = *Pointer<Int>(constants + OFFSET(Constants,maxX) + SignMask(maxX) * 4);   // FIXME: Array indexing
121 		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,maxY) + SignMask(maxY) * 4);
122 		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,maxZ) + SignMask(maxZ) * 4);
123 		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,minX) + SignMask(minX) * 4);
124 		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,minY) + SignMask(minY) * 4);
125 		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,minZ) + SignMask(minZ) * 4);
126 
127 		Int4 finiteX = CmpLE(Abs(o[pos].x), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
128 		Int4 finiteY = CmpLE(Abs(o[pos].y), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
129 		Int4 finiteZ = CmpLE(Abs(o[pos].z), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
130 
131 		Int4 finiteXYZ = finiteX & finiteY & finiteZ;
132 		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,fini) + SignMask(finiteXYZ) * 4);
133 	}
134 
readStream(Pointer<Byte> & buffer,UInt & stride,const Stream & stream,const UInt & index)135 	Vector4f VertexRoutine::readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, const UInt &index)
136 	{
137 		const bool textureSampling = state.textureSampling;
138 
139 		Vector4f v;
140 
141 		Pointer<Byte> source0 = buffer + index * stride;
142 		Pointer<Byte> source1 = source0 + (!textureSampling ? stride : 0);
143 		Pointer<Byte> source2 = source1 + (!textureSampling ? stride : 0);
144 		Pointer<Byte> source3 = source2 + (!textureSampling ? stride : 0);
145 
146 		bool isNativeFloatAttrib = (stream.attribType == SpirvShader::ATTRIBTYPE_FLOAT) || stream.normalized;
147 
148 		switch(stream.type)
149 		{
150 		case STREAMTYPE_FLOAT:
151 			{
152 				if(stream.count == 0)
153 				{
154 					// Null stream, all default components
155 				}
156 				else
157 				{
158 					if(stream.count == 1)
159 					{
160 						v.x.x = *Pointer<Float>(source0);
161 						v.x.y = *Pointer<Float>(source1);
162 						v.x.z = *Pointer<Float>(source2);
163 						v.x.w = *Pointer<Float>(source3);
164 					}
165 					else
166 					{
167 						v.x = *Pointer<Float4>(source0);
168 						v.y = *Pointer<Float4>(source1);
169 						v.z = *Pointer<Float4>(source2);
170 						v.w = *Pointer<Float4>(source3);
171 
172 						transpose4xN(v.x, v.y, v.z, v.w, stream.count);
173 					}
174 
175 					switch(stream.attribType)
176 					{
177 					case SpirvShader::ATTRIBTYPE_INT:
178 						if(stream.count >= 1) v.x = As<Float4>(Int4(v.x));
179 						if(stream.count >= 2) v.x = As<Float4>(Int4(v.y));
180 						if(stream.count >= 3) v.x = As<Float4>(Int4(v.z));
181 						if(stream.count >= 4) v.x = As<Float4>(Int4(v.w));
182 						break;
183 					case SpirvShader::ATTRIBTYPE_UINT:
184 						if(stream.count >= 1) v.x = As<Float4>(UInt4(v.x));
185 						if(stream.count >= 2) v.x = As<Float4>(UInt4(v.y));
186 						if(stream.count >= 3) v.x = As<Float4>(UInt4(v.z));
187 						if(stream.count >= 4) v.x = As<Float4>(UInt4(v.w));
188 						break;
189 					default:
190 						break;
191 					}
192 				}
193 			}
194 			break;
195 		case STREAMTYPE_BYTE:
196 			if(isNativeFloatAttrib) // Stream: UByte, Shader attrib: Float
197 			{
198 				v.x = Float4(*Pointer<Byte4>(source0));
199 				v.y = Float4(*Pointer<Byte4>(source1));
200 				v.z = Float4(*Pointer<Byte4>(source2));
201 				v.w = Float4(*Pointer<Byte4>(source3));
202 
203 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
204 
205 				if(stream.normalized)
206 				{
207 					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
208 					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
209 					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
210 					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
211 				}
212 			}
213 			else // Stream: UByte, Shader attrib: Int / UInt
214 			{
215 				v.x = As<Float4>(Int4(*Pointer<Byte4>(source0)));
216 				v.y = As<Float4>(Int4(*Pointer<Byte4>(source1)));
217 				v.z = As<Float4>(Int4(*Pointer<Byte4>(source2)));
218 				v.w = As<Float4>(Int4(*Pointer<Byte4>(source3)));
219 
220 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
221 			}
222 			break;
223 		case STREAMTYPE_SBYTE:
224 			if(isNativeFloatAttrib) // Stream: SByte, Shader attrib: Float
225 			{
226 				v.x = Float4(*Pointer<SByte4>(source0));
227 				v.y = Float4(*Pointer<SByte4>(source1));
228 				v.z = Float4(*Pointer<SByte4>(source2));
229 				v.w = Float4(*Pointer<SByte4>(source3));
230 
231 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
232 
233 				if(stream.normalized)
234 				{
235 					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
236 					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
237 					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
238 					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
239 				}
240 			}
241 			else // Stream: SByte, Shader attrib: Int / UInt
242 			{
243 				v.x = As<Float4>(Int4(*Pointer<SByte4>(source0)));
244 				v.y = As<Float4>(Int4(*Pointer<SByte4>(source1)));
245 				v.z = As<Float4>(Int4(*Pointer<SByte4>(source2)));
246 				v.w = As<Float4>(Int4(*Pointer<SByte4>(source3)));
247 
248 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
249 			}
250 			break;
251 		case STREAMTYPE_COLOR:
252 			{
253 				v.x = Float4(*Pointer<Byte4>(source0)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
254 				v.y = Float4(*Pointer<Byte4>(source1)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
255 				v.z = Float4(*Pointer<Byte4>(source2)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
256 				v.w = Float4(*Pointer<Byte4>(source3)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
257 
258 				transpose4x4(v.x, v.y, v.z, v.w);
259 
260 				// Swap red and blue
261 				Float4 t = v.x;
262 				v.x = v.z;
263 				v.z = t;
264 			}
265 			break;
266 		case STREAMTYPE_SHORT:
267 			if(isNativeFloatAttrib) // Stream: Int, Shader attrib: Float
268 			{
269 				v.x = Float4(*Pointer<Short4>(source0));
270 				v.y = Float4(*Pointer<Short4>(source1));
271 				v.z = Float4(*Pointer<Short4>(source2));
272 				v.w = Float4(*Pointer<Short4>(source3));
273 
274 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
275 
276 				if(stream.normalized)
277 				{
278 					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
279 					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
280 					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
281 					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
282 				}
283 			}
284 			else // Stream: Short, Shader attrib: Int/UInt, no type conversion
285 			{
286 				v.x = As<Float4>(Int4(*Pointer<Short4>(source0)));
287 				v.y = As<Float4>(Int4(*Pointer<Short4>(source1)));
288 				v.z = As<Float4>(Int4(*Pointer<Short4>(source2)));
289 				v.w = As<Float4>(Int4(*Pointer<Short4>(source3)));
290 
291 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
292 			}
293 			break;
294 		case STREAMTYPE_USHORT:
295 			if(isNativeFloatAttrib) // Stream: Int, Shader attrib: Float
296 			{
297 				v.x = Float4(*Pointer<UShort4>(source0));
298 				v.y = Float4(*Pointer<UShort4>(source1));
299 				v.z = Float4(*Pointer<UShort4>(source2));
300 				v.w = Float4(*Pointer<UShort4>(source3));
301 
302 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
303 
304 				if(stream.normalized)
305 				{
306 					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
307 					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
308 					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
309 					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
310 				}
311 			}
312 			else // Stream: UShort, Shader attrib: Int/UInt, no type conversion
313 			{
314 				v.x = As<Float4>(Int4(*Pointer<UShort4>(source0)));
315 				v.y = As<Float4>(Int4(*Pointer<UShort4>(source1)));
316 				v.z = As<Float4>(Int4(*Pointer<UShort4>(source2)));
317 				v.w = As<Float4>(Int4(*Pointer<UShort4>(source3)));
318 
319 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
320 			}
321 			break;
322 		case STREAMTYPE_INT:
323 			if(isNativeFloatAttrib) // Stream: Int, Shader attrib: Float
324 			{
325 				v.x = Float4(*Pointer<Int4>(source0));
326 				v.y = Float4(*Pointer<Int4>(source1));
327 				v.z = Float4(*Pointer<Int4>(source2));
328 				v.w = Float4(*Pointer<Int4>(source3));
329 
330 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
331 
332 				if(stream.normalized)
333 				{
334 					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
335 					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
336 					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
337 					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
338 				}
339 			}
340 			else // Stream: Int, Shader attrib: Int/UInt, no type conversion
341 			{
342 				v.x = *Pointer<Float4>(source0);
343 				v.y = *Pointer<Float4>(source1);
344 				v.z = *Pointer<Float4>(source2);
345 				v.w = *Pointer<Float4>(source3);
346 
347 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
348 			}
349 			break;
350 		case STREAMTYPE_UINT:
351 			if(isNativeFloatAttrib) // Stream: UInt, Shader attrib: Float
352 			{
353 				v.x = Float4(*Pointer<UInt4>(source0));
354 				v.y = Float4(*Pointer<UInt4>(source1));
355 				v.z = Float4(*Pointer<UInt4>(source2));
356 				v.w = Float4(*Pointer<UInt4>(source3));
357 
358 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
359 
360 				if(stream.normalized)
361 				{
362 					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
363 					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
364 					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
365 					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
366 				}
367 			}
368 			else // Stream: UInt, Shader attrib: Int/UInt, no type conversion
369 			{
370 				v.x = *Pointer<Float4>(source0);
371 				v.y = *Pointer<Float4>(source1);
372 				v.z = *Pointer<Float4>(source2);
373 				v.w = *Pointer<Float4>(source3);
374 
375 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
376 			}
377 			break;
378 		case STREAMTYPE_UDEC3:
379 			{
380 				// FIXME: Vectorize
381 				{
382 					Int x, y, z;
383 
384 					x = y = z = *Pointer<Int>(source0);
385 
386 					v.x.x = Float(x & 0x000003FF);
387 					v.x.y = Float(y & 0x000FFC00);
388 					v.x.z = Float(z & 0x3FF00000);
389 				}
390 
391 				{
392 					Int x, y, z;
393 
394 					x = y = z = *Pointer<Int>(source1);
395 
396 					v.y.x = Float(x & 0x000003FF);
397 					v.y.y = Float(y & 0x000FFC00);
398 					v.y.z = Float(z & 0x3FF00000);
399 				}
400 
401 				{
402 					Int x, y, z;
403 
404 					x = y = z = *Pointer<Int>(source2);
405 
406 					v.z.x = Float(x & 0x000003FF);
407 					v.z.y = Float(y & 0x000FFC00);
408 					v.z.z = Float(z & 0x3FF00000);
409 				}
410 
411 				{
412 					Int x, y, z;
413 
414 					x = y = z = *Pointer<Int>(source3);
415 
416 					v.w.x = Float(x & 0x000003FF);
417 					v.w.y = Float(y & 0x000FFC00);
418 					v.w.z = Float(z & 0x3FF00000);
419 				}
420 
421 				transpose4x3(v.x, v.y, v.z, v.w);
422 
423 				v.y *= Float4(1.0f / 0x00000400);
424 				v.z *= Float4(1.0f / 0x00100000);
425 			}
426 			break;
427 		case STREAMTYPE_DEC3N:
428 			{
429 				// FIXME: Vectorize
430 				{
431 					Int x, y, z;
432 
433 					x = y = z = *Pointer<Int>(source0);
434 
435 					v.x.x = Float((x << 22) & 0xFFC00000);
436 					v.x.y = Float((y << 12) & 0xFFC00000);
437 					v.x.z = Float((z << 2)  & 0xFFC00000);
438 				}
439 
440 				{
441 					Int x, y, z;
442 
443 					x = y = z = *Pointer<Int>(source1);
444 
445 					v.y.x = Float((x << 22) & 0xFFC00000);
446 					v.y.y = Float((y << 12) & 0xFFC00000);
447 					v.y.z = Float((z << 2)  & 0xFFC00000);
448 				}
449 
450 				{
451 					Int x, y, z;
452 
453 					x = y = z = *Pointer<Int>(source2);
454 
455 					v.z.x = Float((x << 22) & 0xFFC00000);
456 					v.z.y = Float((y << 12) & 0xFFC00000);
457 					v.z.z = Float((z << 2)  & 0xFFC00000);
458 				}
459 
460 				{
461 					Int x, y, z;
462 
463 					x = y = z = *Pointer<Int>(source3);
464 
465 					v.w.x = Float((x << 22) & 0xFFC00000);
466 					v.w.y = Float((y << 12) & 0xFFC00000);
467 					v.w.z = Float((z << 2)  & 0xFFC00000);
468 				}
469 
470 				transpose4x3(v.x, v.y, v.z, v.w);
471 
472 				v.x *= Float4(1.0f / 0x00400000 / 511.0f);
473 				v.y *= Float4(1.0f / 0x00400000 / 511.0f);
474 				v.z *= Float4(1.0f / 0x00400000 / 511.0f);
475 			}
476 			break;
477 		case STREAMTYPE_FIXED:
478 			{
479 				v.x = Float4(*Pointer<Int4>(source0)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
480 				v.y = Float4(*Pointer<Int4>(source1)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
481 				v.z = Float4(*Pointer<Int4>(source2)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
482 				v.w = Float4(*Pointer<Int4>(source3)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
483 
484 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
485 			}
486 			break;
487 		case STREAMTYPE_HALF:
488 			{
489 				if(stream.count >= 1)
490 				{
491 					UShort x0 = *Pointer<UShort>(source0 + 0);
492 					UShort x1 = *Pointer<UShort>(source1 + 0);
493 					UShort x2 = *Pointer<UShort>(source2 + 0);
494 					UShort x3 = *Pointer<UShort>(source3 + 0);
495 
496 					v.x.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x0) * 4);
497 					v.x.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x1) * 4);
498 					v.x.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x2) * 4);
499 					v.x.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x3) * 4);
500 				}
501 
502 				if(stream.count >= 2)
503 				{
504 					UShort y0 = *Pointer<UShort>(source0 + 2);
505 					UShort y1 = *Pointer<UShort>(source1 + 2);
506 					UShort y2 = *Pointer<UShort>(source2 + 2);
507 					UShort y3 = *Pointer<UShort>(source3 + 2);
508 
509 					v.y.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y0) * 4);
510 					v.y.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y1) * 4);
511 					v.y.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y2) * 4);
512 					v.y.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y3) * 4);
513 				}
514 
515 				if(stream.count >= 3)
516 				{
517 					UShort z0 = *Pointer<UShort>(source0 + 4);
518 					UShort z1 = *Pointer<UShort>(source1 + 4);
519 					UShort z2 = *Pointer<UShort>(source2 + 4);
520 					UShort z3 = *Pointer<UShort>(source3 + 4);
521 
522 					v.z.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z0) * 4);
523 					v.z.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z1) * 4);
524 					v.z.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z2) * 4);
525 					v.z.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z3) * 4);
526 				}
527 
528 				if(stream.count >= 4)
529 				{
530 					UShort w0 = *Pointer<UShort>(source0 + 6);
531 					UShort w1 = *Pointer<UShort>(source1 + 6);
532 					UShort w2 = *Pointer<UShort>(source2 + 6);
533 					UShort w3 = *Pointer<UShort>(source3 + 6);
534 
535 					v.w.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w0) * 4);
536 					v.w.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w1) * 4);
537 					v.w.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w2) * 4);
538 					v.w.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w3) * 4);
539 				}
540 			}
541 			break;
542 		case STREAMTYPE_INDICES:
543 			{
544 				v.x.x = *Pointer<Float>(source0);
545 				v.x.y = *Pointer<Float>(source1);
546 				v.x.z = *Pointer<Float>(source2);
547 				v.x.w = *Pointer<Float>(source3);
548 			}
549 			break;
550 		case STREAMTYPE_2_10_10_10_INT:
551 			{
552 				Int4 src;
553 				src = Insert(src, *Pointer<Int>(source0), 0);
554 				src = Insert(src, *Pointer<Int>(source1), 1);
555 				src = Insert(src, *Pointer<Int>(source2), 2);
556 				src = Insert(src, *Pointer<Int>(source3), 3);
557 
558 				v.x = Float4((src << 22) >> 22);
559 				v.y = Float4((src << 12) >> 22);
560 				v.z = Float4((src << 02) >> 22);
561 				v.w = Float4(src >> 30);
562 
563 				if(stream.normalized)
564 				{
565 					v.x = Max(v.x * Float4(1.0f / 0x1FF), Float4(-1.0f));
566 					v.y = Max(v.y * Float4(1.0f / 0x1FF), Float4(-1.0f));
567 					v.z = Max(v.z * Float4(1.0f / 0x1FF), Float4(-1.0f));
568 					v.w = Max(v.w, Float4(-1.0f));
569 				}
570 			}
571 			break;
572 		case STREAMTYPE_2_10_10_10_UINT:
573 			{
574 				Int4 src;
575 				src = Insert(src, *Pointer<Int>(source0), 0);
576 				src = Insert(src, *Pointer<Int>(source1), 1);
577 				src = Insert(src, *Pointer<Int>(source2), 2);
578 				src = Insert(src, *Pointer<Int>(source3), 3);
579 
580 				v.x = Float4(src & Int4(0x3FF));
581 				v.y = Float4((src >> 10) & Int4(0x3FF));
582 				v.z = Float4((src >> 20) & Int4(0x3FF));
583 				v.w = Float4((src >> 30) & Int4(0x3));
584 
585 				if(stream.normalized)
586 				{
587 					v.x *= Float4(1.0f / 0x3FF);
588 					v.y *= Float4(1.0f / 0x3FF);
589 					v.z *= Float4(1.0f / 0x3FF);
590 					v.w *= Float4(1.0f / 0x3);
591 				}
592 			}
593 			break;
594 		default:
595 			ASSERT(false);
596 		}
597 
598 		if(stream.count < 1) v.x = Float4(0.0f);
599 		if(stream.count < 2) v.y = Float4(0.0f);
600 		if(stream.count < 3) v.z = Float4(0.0f);
601 		if(stream.count < 4) v.w = isNativeFloatAttrib ? As<Float4>(Float4(1.0f)) : As<Float4>(Int4(0));
602 
603 		return v;
604 	}
605 
postTransform()606 	void VertexRoutine::postTransform()
607 	{
608 		int pos = state.positionRegister;
609 
610 		if(!halfIntegerCoordinates)
611 		{
612 			o[pos].x = o[pos].x + *Pointer<Float4>(data + OFFSET(DrawData,halfPixelX)) * o[pos].w;
613 			o[pos].y = o[pos].y + *Pointer<Float4>(data + OFFSET(DrawData,halfPixelY)) * o[pos].w;
614 		}
615 	}
616 
writeCache(Pointer<Byte> & cacheLine)617 	void VertexRoutine::writeCache(Pointer<Byte> &cacheLine)
618 	{
619 		Vector4f v;
620 
621 		for(int i = 0; i < MAX_VERTEX_OUTPUTS; i++)
622 		{
623 			if(state.output[i].write)
624 			{
625 				v.x = o[i].x;
626 				v.y = o[i].y;
627 				v.z = o[i].z;
628 				v.w = o[i].w;
629 
630 				if(state.output[i].xClamp)
631 				{
632 					v.x = Max(v.x, Float4(0.0f));
633 					v.x = Min(v.x, Float4(1.0f));
634 				}
635 
636 				if(state.output[i].yClamp)
637 				{
638 					v.y = Max(v.y, Float4(0.0f));
639 					v.y = Min(v.y, Float4(1.0f));
640 				}
641 
642 				if(state.output[i].zClamp)
643 				{
644 					v.z = Max(v.z, Float4(0.0f));
645 					v.z = Min(v.z, Float4(1.0f));
646 				}
647 
648 				if(state.output[i].wClamp)
649 				{
650 					v.w = Max(v.w, Float4(0.0f));
651 					v.w = Min(v.w, Float4(1.0f));
652 				}
653 
654 				if(state.output[i].write == 0x01)
655 				{
656 					*Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 0) = v.x.x;
657 					*Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 1) = v.x.y;
658 					*Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 2) = v.x.z;
659 					*Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 3) = v.x.w;
660 				}
661 				else
662 				{
663 					if(state.output[i].write == 0x03)
664 					{
665 						transpose2x4(v.x, v.y, v.z, v.w);
666 					}
667 					else
668 					{
669 						transpose4x4(v.x, v.y, v.z, v.w);
670 					}
671 
672 					*Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 0, 16) = v.x;
673 					*Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 1, 16) = v.y;
674 					*Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 2, 16) = v.z;
675 					*Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 3, 16) = v.w;
676 				}
677 			}
678 		}
679 
680 		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 0) = (clipFlags >> 0)  & 0x0000000FF;
681 		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 1) = (clipFlags >> 8)  & 0x0000000FF;
682 		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 2) = (clipFlags >> 16) & 0x0000000FF;
683 		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 3) = (clipFlags >> 24) & 0x0000000FF;
684 
685 		// Viewport transform
686 		int pos = state.positionRegister;
687 
688 		v.x = o[pos].x;
689 		v.y = o[pos].y;
690 		v.z = o[pos].z;
691 		v.w = o[pos].w;
692 
693 		Float4 w = As<Float4>(As<Int4>(v.w) | (As<Int4>(CmpEQ(v.w, Float4(0.0f))) & As<Int4>(Float4(1.0f))));
694 		Float4 rhw = Float4(1.0f) / w;
695 
696 		v.x = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData,X0x16)) + v.x * rhw * *Pointer<Float4>(data + OFFSET(DrawData,Wx16))));
697 		v.y = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData,Y0x16)) + v.y * rhw * *Pointer<Float4>(data + OFFSET(DrawData,Hx16))));
698 		v.z = v.z * rhw;
699 		v.w = rhw;
700 
701 		transpose4x4(v.x, v.y, v.z, v.w);
702 
703 		*Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 0, 16) = v.x;
704 		*Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 1, 16) = v.y;
705 		*Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 2, 16) = v.z;
706 		*Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 3, 16) = v.w;
707 	}
708 
writeVertex(const Pointer<Byte> & vertex,Pointer<Byte> & cache)709 	void VertexRoutine::writeVertex(const Pointer<Byte> &vertex, Pointer<Byte> &cache)
710 	{
711 		for(int i = 0; i < MAX_VERTEX_OUTPUTS; i++)
712 		{
713 			if(state.output[i].write)
714 			{
715 				*Pointer<Int4>(vertex + OFFSET(Vertex,v[i]), 16) = *Pointer<Int4>(cache + OFFSET(Vertex,v[i]), 16);
716 			}
717 		}
718 
719 		*Pointer<Int4>(vertex + OFFSET(Vertex,X)) = *Pointer<Int4>(cache + OFFSET(Vertex,X));
720 		*Pointer<Int>(vertex + OFFSET(Vertex,clipFlags)) = *Pointer<Int>(cache + OFFSET(Vertex,clipFlags));
721 	}
722 
transformFeedback(const Pointer<Byte> & vertex,const UInt & primitiveNumber,const UInt & indexInPrimitive)723 	void VertexRoutine::transformFeedback(const Pointer<Byte> &vertex, const UInt &primitiveNumber, const UInt &indexInPrimitive)
724 	{
725 		If(indexInPrimitive < state.verticesPerPrimitive)
726 		{
727 			UInt tOffset = primitiveNumber * state.verticesPerPrimitive + indexInPrimitive;
728 
729 			for(int i = 0; i < MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS; i++)
730 			{
731 				if(state.transformFeedbackEnabled & (1ULL << i))
732 				{
733 					UInt reg = *Pointer<UInt>(data + OFFSET(DrawData, vs.reg[i]));
734 					UInt row = *Pointer<UInt>(data + OFFSET(DrawData, vs.row[i]));
735 					UInt col = *Pointer<UInt>(data + OFFSET(DrawData, vs.col[i]));
736 					UInt str = *Pointer<UInt>(data + OFFSET(DrawData, vs.str[i]));
737 
738 					Pointer<Byte> t = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData, vs.t[i])) + (tOffset * str * sizeof(float));
739 					Pointer<Byte> v = vertex + OFFSET(Vertex, v) + reg * sizeof(float);
740 
741 					For(UInt r = 0, r < row, r++)
742 					{
743 						UInt rOffsetX = r * col * sizeof(float);
744 						UInt rOffset4 = r * sizeof(float4);
745 
746 						For(UInt c = 0, c < col, c++)
747 						{
748 							UInt cOffset = c * sizeof(float);
749 							*Pointer<Float>(t + rOffsetX + cOffset) = *Pointer<Float>(v + rOffset4 + cOffset);
750 						}
751 					}
752 				}
753 			}
754 		}
755 	}
756 }
757