1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "VertexRoutine.hpp"
16 
17 #include "VertexShader.hpp"
18 #include "Constants.hpp"
19 #include "Renderer/Vertex.hpp"
20 #include "Renderer/Renderer.hpp"
21 #include "Common/Half.hpp"
22 #include "Common/Debug.hpp"
23 
24 namespace sw
25 {
26 	extern bool halfIntegerCoordinates;     // Pixel centers are not at integer coordinates
27 	extern bool symmetricNormalizedDepth;   // [-1, 1] instead of [0, 1]
28 
VertexRoutine(const VertexProcessor::State & state,const VertexShader * shader)29 	VertexRoutine::VertexRoutine(const VertexProcessor::State &state, const VertexShader *shader)
30 		: v(shader && shader->indirectAddressableInput),
31 		  o(shader && shader->indirectAddressableOutput),
32 		  state(state)
33 	{
34 	}
35 
~VertexRoutine()36 	VertexRoutine::~VertexRoutine()
37 	{
38 	}
39 
generate()40 	void VertexRoutine::generate()
41 	{
42 		const bool textureSampling = state.textureSampling;
43 
44 		Pointer<Byte> cache = task + OFFSET(VertexTask,vertexCache);
45 		Pointer<Byte> vertexCache = cache + OFFSET(VertexCache,vertex);
46 		Pointer<Byte> tagCache = cache + OFFSET(VertexCache,tag);
47 
48 		UInt vertexCount = *Pointer<UInt>(task + OFFSET(VertexTask,vertexCount));
49 		UInt primitiveNumber = *Pointer<UInt>(task + OFFSET(VertexTask, primitiveStart));
50 		UInt indexInPrimitive = 0;
51 
52 		constants = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,constants));
53 
54 		Do
55 		{
56 			UInt index = *Pointer<UInt>(batch);
57 			UInt tagIndex = index & 0x0000003C;
58 			UInt indexQ = !textureSampling ? UInt(index & 0xFFFFFFFC) : index;   // FIXME: TEXLDL hack to have independent LODs, hurts performance.
59 
60 			If(*Pointer<UInt>(tagCache + tagIndex) != indexQ)
61 			{
62 				*Pointer<UInt>(tagCache + tagIndex) = indexQ;
63 
64 				readInput(indexQ);
65 				pipeline(indexQ);
66 				postTransform();
67 				computeClipFlags();
68 
69 				Pointer<Byte> cacheLine0 = vertexCache + tagIndex * UInt((int)sizeof(Vertex));
70 				writeCache(cacheLine0);
71 			}
72 
73 			UInt cacheIndex = index & 0x0000003F;
74 			Pointer<Byte> cacheLine = vertexCache + cacheIndex * UInt((int)sizeof(Vertex));
75 			writeVertex(vertex, cacheLine);
76 
77 			if(state.transformFeedbackEnabled != 0)
78 			{
79 				transformFeedback(vertex, primitiveNumber, indexInPrimitive);
80 
81 				indexInPrimitive++;
82 				If(indexInPrimitive == 3)
83 				{
84 					primitiveNumber++;
85 					indexInPrimitive = 0;
86 				}
87 			}
88 
89 			vertex += sizeof(Vertex);
90 			batch += sizeof(unsigned int);
91 			vertexCount--;
92 		}
93 		Until(vertexCount == 0)
94 
95 		Return();
96 	}
97 
readInput(UInt & index)98 	void VertexRoutine::readInput(UInt &index)
99 	{
100 		for(int i = 0; i < MAX_VERTEX_INPUTS; i++)
101 		{
102 			Pointer<Byte> input = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,input) + sizeof(void*) * i);
103 			UInt stride = *Pointer<UInt>(data + OFFSET(DrawData,stride) + sizeof(unsigned int) * i);
104 
105 			v[i] = readStream(input, stride, state.input[i], index);
106 		}
107 	}
108 
computeClipFlags()109 	void VertexRoutine::computeClipFlags()
110 	{
111 		int pos = state.positionRegister;
112 
113 		Int4 maxX = CmpLT(o[pos].w, o[pos].x);
114 		Int4 maxY = CmpLT(o[pos].w, o[pos].y);
115 		Int4 maxZ = CmpLT(o[pos].w, o[pos].z);
116 		Int4 minX = CmpNLE(-o[pos].w, o[pos].x);
117 		Int4 minY = CmpNLE(-o[pos].w, o[pos].y);
118 		Int4 minZ = symmetricNormalizedDepth ? CmpNLE(-o[pos].w, o[pos].z) : CmpNLE(Float4(0.0f), o[pos].z);
119 
120 		clipFlags = *Pointer<Int>(constants + OFFSET(Constants,maxX) + SignMask(maxX) * 4);   // FIXME: Array indexing
121 		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,maxY) + SignMask(maxY) * 4);
122 		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,maxZ) + SignMask(maxZ) * 4);
123 		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,minX) + SignMask(minX) * 4);
124 		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,minY) + SignMask(minY) * 4);
125 		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,minZ) + SignMask(minZ) * 4);
126 
127 		Int4 finiteX = CmpLE(Abs(o[pos].x), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
128 		Int4 finiteY = CmpLE(Abs(o[pos].y), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
129 		Int4 finiteZ = CmpLE(Abs(o[pos].z), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
130 
131 		Int4 finiteXYZ = finiteX & finiteY & finiteZ;
132 		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,fini) + SignMask(finiteXYZ) * 4);
133 
134 		if(state.preTransformed)
135 		{
136 			clipFlags &= 0xFBFBFBFB;   // Don't clip against far clip plane
137 		}
138 	}
139 
readStream(Pointer<Byte> & buffer,UInt & stride,const Stream & stream,const UInt & index)140 	Vector4f VertexRoutine::readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, const UInt &index)
141 	{
142 		const bool textureSampling = state.textureSampling;
143 
144 		Vector4f v;
145 
146 		Pointer<Byte> source0 = buffer + index * stride;
147 		Pointer<Byte> source1 = source0 + (!textureSampling ? stride : 0);
148 		Pointer<Byte> source2 = source1 + (!textureSampling ? stride : 0);
149 		Pointer<Byte> source3 = source2 + (!textureSampling ? stride : 0);
150 
151 		bool isNativeFloatAttrib = (stream.attribType == VertexShader::ATTRIBTYPE_FLOAT) || stream.normalized;
152 
153 		switch(stream.type)
154 		{
155 		case STREAMTYPE_FLOAT:
156 			{
157 				if(stream.count == 0)
158 				{
159 					// Null stream, all default components
160 				}
161 				else
162 				{
163 					if(stream.count == 1)
164 					{
165 						v.x.x = *Pointer<Float>(source0);
166 						v.x.y = *Pointer<Float>(source1);
167 						v.x.z = *Pointer<Float>(source2);
168 						v.x.w = *Pointer<Float>(source3);
169 					}
170 					else
171 					{
172 						v.x = *Pointer<Float4>(source0);
173 						v.y = *Pointer<Float4>(source1);
174 						v.z = *Pointer<Float4>(source2);
175 						v.w = *Pointer<Float4>(source3);
176 
177 						transpose4xN(v.x, v.y, v.z, v.w, stream.count);
178 					}
179 
180 					switch(stream.attribType)
181 					{
182 					case VertexShader::ATTRIBTYPE_INT:
183 						if(stream.count >= 1) v.x = As<Float4>(Int4(v.x));
184 						if(stream.count >= 2) v.x = As<Float4>(Int4(v.y));
185 						if(stream.count >= 3) v.x = As<Float4>(Int4(v.z));
186 						if(stream.count >= 4) v.x = As<Float4>(Int4(v.w));
187 						break;
188 					case VertexShader::ATTRIBTYPE_UINT:
189 						if(stream.count >= 1) v.x = As<Float4>(UInt4(v.x));
190 						if(stream.count >= 2) v.x = As<Float4>(UInt4(v.y));
191 						if(stream.count >= 3) v.x = As<Float4>(UInt4(v.z));
192 						if(stream.count >= 4) v.x = As<Float4>(UInt4(v.w));
193 						break;
194 					default:
195 						break;
196 					}
197 				}
198 			}
199 			break;
200 		case STREAMTYPE_BYTE:
201 			if(isNativeFloatAttrib) // Stream: UByte, Shader attrib: Float
202 			{
203 				v.x = Float4(*Pointer<Byte4>(source0));
204 				v.y = Float4(*Pointer<Byte4>(source1));
205 				v.z = Float4(*Pointer<Byte4>(source2));
206 				v.w = Float4(*Pointer<Byte4>(source3));
207 
208 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
209 
210 				if(stream.normalized)
211 				{
212 					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
213 					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
214 					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
215 					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
216 				}
217 			}
218 			else // Stream: UByte, Shader attrib: Int / UInt
219 			{
220 				v.x = As<Float4>(Int4(*Pointer<Byte4>(source0)));
221 				v.y = As<Float4>(Int4(*Pointer<Byte4>(source1)));
222 				v.z = As<Float4>(Int4(*Pointer<Byte4>(source2)));
223 				v.w = As<Float4>(Int4(*Pointer<Byte4>(source3)));
224 
225 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
226 			}
227 			break;
228 		case STREAMTYPE_SBYTE:
229 			if(isNativeFloatAttrib) // Stream: SByte, Shader attrib: Float
230 			{
231 				v.x = Float4(*Pointer<SByte4>(source0));
232 				v.y = Float4(*Pointer<SByte4>(source1));
233 				v.z = Float4(*Pointer<SByte4>(source2));
234 				v.w = Float4(*Pointer<SByte4>(source3));
235 
236 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
237 
238 				if(stream.normalized)
239 				{
240 					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
241 					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
242 					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
243 					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
244 				}
245 			}
246 			else // Stream: SByte, Shader attrib: Int / UInt
247 			{
248 				v.x = As<Float4>(Int4(*Pointer<SByte4>(source0)));
249 				v.y = As<Float4>(Int4(*Pointer<SByte4>(source1)));
250 				v.z = As<Float4>(Int4(*Pointer<SByte4>(source2)));
251 				v.w = As<Float4>(Int4(*Pointer<SByte4>(source3)));
252 
253 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
254 			}
255 			break;
256 		case STREAMTYPE_COLOR:
257 			{
258 				v.x = Float4(*Pointer<Byte4>(source0)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
259 				v.y = Float4(*Pointer<Byte4>(source1)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
260 				v.z = Float4(*Pointer<Byte4>(source2)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
261 				v.w = Float4(*Pointer<Byte4>(source3)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
262 
263 				transpose4x4(v.x, v.y, v.z, v.w);
264 
265 				// Swap red and blue
266 				Float4 t = v.x;
267 				v.x = v.z;
268 				v.z = t;
269 			}
270 			break;
271 		case STREAMTYPE_SHORT:
272 			if(isNativeFloatAttrib) // Stream: Int, Shader attrib: Float
273 			{
274 				v.x = Float4(*Pointer<Short4>(source0));
275 				v.y = Float4(*Pointer<Short4>(source1));
276 				v.z = Float4(*Pointer<Short4>(source2));
277 				v.w = Float4(*Pointer<Short4>(source3));
278 
279 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
280 
281 				if(stream.normalized)
282 				{
283 					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
284 					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
285 					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
286 					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
287 				}
288 			}
289 			else // Stream: Short, Shader attrib: Int/UInt, no type conversion
290 			{
291 				v.x = As<Float4>(Int4(*Pointer<Short4>(source0)));
292 				v.y = As<Float4>(Int4(*Pointer<Short4>(source1)));
293 				v.z = As<Float4>(Int4(*Pointer<Short4>(source2)));
294 				v.w = As<Float4>(Int4(*Pointer<Short4>(source3)));
295 
296 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
297 			}
298 			break;
299 		case STREAMTYPE_USHORT:
300 			if(isNativeFloatAttrib) // Stream: Int, Shader attrib: Float
301 			{
302 				v.x = Float4(*Pointer<UShort4>(source0));
303 				v.y = Float4(*Pointer<UShort4>(source1));
304 				v.z = Float4(*Pointer<UShort4>(source2));
305 				v.w = Float4(*Pointer<UShort4>(source3));
306 
307 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
308 
309 				if(stream.normalized)
310 				{
311 					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
312 					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
313 					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
314 					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
315 				}
316 			}
317 			else // Stream: UShort, Shader attrib: Int/UInt, no type conversion
318 			{
319 				v.x = As<Float4>(Int4(*Pointer<UShort4>(source0)));
320 				v.y = As<Float4>(Int4(*Pointer<UShort4>(source1)));
321 				v.z = As<Float4>(Int4(*Pointer<UShort4>(source2)));
322 				v.w = As<Float4>(Int4(*Pointer<UShort4>(source3)));
323 
324 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
325 			}
326 			break;
327 		case STREAMTYPE_INT:
328 			if(isNativeFloatAttrib) // Stream: Int, Shader attrib: Float
329 			{
330 				v.x = Float4(*Pointer<Int4>(source0));
331 				v.y = Float4(*Pointer<Int4>(source1));
332 				v.z = Float4(*Pointer<Int4>(source2));
333 				v.w = Float4(*Pointer<Int4>(source3));
334 
335 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
336 
337 				if(stream.normalized)
338 				{
339 					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
340 					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
341 					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
342 					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
343 				}
344 			}
345 			else // Stream: Int, Shader attrib: Int/UInt, no type conversion
346 			{
347 				v.x = *Pointer<Float4>(source0);
348 				v.y = *Pointer<Float4>(source1);
349 				v.z = *Pointer<Float4>(source2);
350 				v.w = *Pointer<Float4>(source3);
351 
352 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
353 			}
354 			break;
355 		case STREAMTYPE_UINT:
356 			if(isNativeFloatAttrib) // Stream: UInt, Shader attrib: Float
357 			{
358 				v.x = Float4(*Pointer<UInt4>(source0));
359 				v.y = Float4(*Pointer<UInt4>(source1));
360 				v.z = Float4(*Pointer<UInt4>(source2));
361 				v.w = Float4(*Pointer<UInt4>(source3));
362 
363 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
364 
365 				if(stream.normalized)
366 				{
367 					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
368 					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
369 					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
370 					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
371 				}
372 			}
373 			else // Stream: UInt, Shader attrib: Int/UInt, no type conversion
374 			{
375 				v.x = *Pointer<Float4>(source0);
376 				v.y = *Pointer<Float4>(source1);
377 				v.z = *Pointer<Float4>(source2);
378 				v.w = *Pointer<Float4>(source3);
379 
380 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
381 			}
382 			break;
383 		case STREAMTYPE_UDEC3:
384 			{
385 				// FIXME: Vectorize
386 				{
387 					Int x, y, z;
388 
389 					x = y = z = *Pointer<Int>(source0);
390 
391 					v.x.x = Float(x & 0x000003FF);
392 					v.x.y = Float(y & 0x000FFC00);
393 					v.x.z = Float(z & 0x3FF00000);
394 				}
395 
396 				{
397 					Int x, y, z;
398 
399 					x = y = z = *Pointer<Int>(source1);
400 
401 					v.y.x = Float(x & 0x000003FF);
402 					v.y.y = Float(y & 0x000FFC00);
403 					v.y.z = Float(z & 0x3FF00000);
404 				}
405 
406 				{
407 					Int x, y, z;
408 
409 					x = y = z = *Pointer<Int>(source2);
410 
411 					v.z.x = Float(x & 0x000003FF);
412 					v.z.y = Float(y & 0x000FFC00);
413 					v.z.z = Float(z & 0x3FF00000);
414 				}
415 
416 				{
417 					Int x, y, z;
418 
419 					x = y = z = *Pointer<Int>(source3);
420 
421 					v.w.x = Float(x & 0x000003FF);
422 					v.w.y = Float(y & 0x000FFC00);
423 					v.w.z = Float(z & 0x3FF00000);
424 				}
425 
426 				transpose4x3(v.x, v.y, v.z, v.w);
427 
428 				v.y *= Float4(1.0f / 0x00000400);
429 				v.z *= Float4(1.0f / 0x00100000);
430 			}
431 			break;
432 		case STREAMTYPE_DEC3N:
433 			{
434 				// FIXME: Vectorize
435 				{
436 					Int x, y, z;
437 
438 					x = y = z = *Pointer<Int>(source0);
439 
440 					v.x.x = Float((x << 22) & 0xFFC00000);
441 					v.x.y = Float((y << 12) & 0xFFC00000);
442 					v.x.z = Float((z << 2)  & 0xFFC00000);
443 				}
444 
445 				{
446 					Int x, y, z;
447 
448 					x = y = z = *Pointer<Int>(source1);
449 
450 					v.y.x = Float((x << 22) & 0xFFC00000);
451 					v.y.y = Float((y << 12) & 0xFFC00000);
452 					v.y.z = Float((z << 2)  & 0xFFC00000);
453 				}
454 
455 				{
456 					Int x, y, z;
457 
458 					x = y = z = *Pointer<Int>(source2);
459 
460 					v.z.x = Float((x << 22) & 0xFFC00000);
461 					v.z.y = Float((y << 12) & 0xFFC00000);
462 					v.z.z = Float((z << 2)  & 0xFFC00000);
463 				}
464 
465 				{
466 					Int x, y, z;
467 
468 					x = y = z = *Pointer<Int>(source3);
469 
470 					v.w.x = Float((x << 22) & 0xFFC00000);
471 					v.w.y = Float((y << 12) & 0xFFC00000);
472 					v.w.z = Float((z << 2)  & 0xFFC00000);
473 				}
474 
475 				transpose4x3(v.x, v.y, v.z, v.w);
476 
477 				v.x *= Float4(1.0f / 0x00400000 / 511.0f);
478 				v.y *= Float4(1.0f / 0x00400000 / 511.0f);
479 				v.z *= Float4(1.0f / 0x00400000 / 511.0f);
480 			}
481 			break;
482 		case STREAMTYPE_FIXED:
483 			{
484 				v.x = Float4(*Pointer<Int4>(source0)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
485 				v.y = Float4(*Pointer<Int4>(source1)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
486 				v.z = Float4(*Pointer<Int4>(source2)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
487 				v.w = Float4(*Pointer<Int4>(source3)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
488 
489 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
490 			}
491 			break;
492 		case STREAMTYPE_HALF:
493 			{
494 				if(stream.count >= 1)
495 				{
496 					UShort x0 = *Pointer<UShort>(source0 + 0);
497 					UShort x1 = *Pointer<UShort>(source1 + 0);
498 					UShort x2 = *Pointer<UShort>(source2 + 0);
499 					UShort x3 = *Pointer<UShort>(source3 + 0);
500 
501 					v.x.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x0) * 4);
502 					v.x.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x1) * 4);
503 					v.x.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x2) * 4);
504 					v.x.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x3) * 4);
505 				}
506 
507 				if(stream.count >= 2)
508 				{
509 					UShort y0 = *Pointer<UShort>(source0 + 2);
510 					UShort y1 = *Pointer<UShort>(source1 + 2);
511 					UShort y2 = *Pointer<UShort>(source2 + 2);
512 					UShort y3 = *Pointer<UShort>(source3 + 2);
513 
514 					v.y.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y0) * 4);
515 					v.y.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y1) * 4);
516 					v.y.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y2) * 4);
517 					v.y.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y3) * 4);
518 				}
519 
520 				if(stream.count >= 3)
521 				{
522 					UShort z0 = *Pointer<UShort>(source0 + 4);
523 					UShort z1 = *Pointer<UShort>(source1 + 4);
524 					UShort z2 = *Pointer<UShort>(source2 + 4);
525 					UShort z3 = *Pointer<UShort>(source3 + 4);
526 
527 					v.z.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z0) * 4);
528 					v.z.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z1) * 4);
529 					v.z.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z2) * 4);
530 					v.z.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z3) * 4);
531 				}
532 
533 				if(stream.count >= 4)
534 				{
535 					UShort w0 = *Pointer<UShort>(source0 + 6);
536 					UShort w1 = *Pointer<UShort>(source1 + 6);
537 					UShort w2 = *Pointer<UShort>(source2 + 6);
538 					UShort w3 = *Pointer<UShort>(source3 + 6);
539 
540 					v.w.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w0) * 4);
541 					v.w.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w1) * 4);
542 					v.w.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w2) * 4);
543 					v.w.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w3) * 4);
544 				}
545 			}
546 			break;
547 		case STREAMTYPE_INDICES:
548 			{
549 				v.x.x = *Pointer<Float>(source0);
550 				v.x.y = *Pointer<Float>(source1);
551 				v.x.z = *Pointer<Float>(source2);
552 				v.x.w = *Pointer<Float>(source3);
553 			}
554 			break;
555 		case STREAMTYPE_2_10_10_10_INT:
556 			{
557 				Int4 src;
558 				src = Insert(src, *Pointer<Int>(source0), 0);
559 				src = Insert(src, *Pointer<Int>(source1), 1);
560 				src = Insert(src, *Pointer<Int>(source2), 2);
561 				src = Insert(src, *Pointer<Int>(source3), 3);
562 
563 				v.x = Float4((src << 22) >> 22);
564 				v.y = Float4((src << 12) >> 22);
565 				v.z = Float4((src << 02) >> 22);
566 				v.w = Float4(src >> 30);
567 
568 				if(stream.normalized)
569 				{
570 					v.x = Max(v.x * Float4(1.0f / 0x1FF), Float4(-1.0f));
571 					v.y = Max(v.y * Float4(1.0f / 0x1FF), Float4(-1.0f));
572 					v.z = Max(v.z * Float4(1.0f / 0x1FF), Float4(-1.0f));
573 					v.w = Max(v.w, Float4(-1.0f));
574 				}
575 			}
576 			break;
577 		case STREAMTYPE_2_10_10_10_UINT:
578 			{
579 				Int4 src;
580 				src = Insert(src, *Pointer<Int>(source0), 0);
581 				src = Insert(src, *Pointer<Int>(source1), 1);
582 				src = Insert(src, *Pointer<Int>(source2), 2);
583 				src = Insert(src, *Pointer<Int>(source3), 3);
584 
585 				v.x = Float4(src & Int4(0x3FF));
586 				v.y = Float4((src >> 10) & Int4(0x3FF));
587 				v.z = Float4((src >> 20) & Int4(0x3FF));
588 				v.w = Float4((src >> 30) & Int4(0x3));
589 
590 				if(stream.normalized)
591 				{
592 					v.x *= Float4(1.0f / 0x3FF);
593 					v.y *= Float4(1.0f / 0x3FF);
594 					v.z *= Float4(1.0f / 0x3FF);
595 					v.w *= Float4(1.0f / 0x3);
596 				}
597 			}
598 			break;
599 		default:
600 			ASSERT(false);
601 		}
602 
603 		if(stream.count < 1) v.x = Float4(0.0f);
604 		if(stream.count < 2) v.y = Float4(0.0f);
605 		if(stream.count < 3) v.z = Float4(0.0f);
606 		if(stream.count < 4) v.w = isNativeFloatAttrib ? As<Float4>(Float4(1.0f)) : As<Float4>(Int4(0));
607 
608 		return v;
609 	}
610 
postTransform()611 	void VertexRoutine::postTransform()
612 	{
613 		int pos = state.positionRegister;
614 
615 		// Backtransform
616 		if(state.preTransformed)
617 		{
618 			Float4 rhw = Float4(1.0f) / o[pos].w;
619 
620 			Float4 W = *Pointer<Float4>(data + OFFSET(DrawData,Wx16)) * Float4(1.0f / 16.0f);
621 			Float4 H = *Pointer<Float4>(data + OFFSET(DrawData,Hx16)) * Float4(1.0f / 16.0f);
622 			Float4 L = *Pointer<Float4>(data + OFFSET(DrawData,X0x16)) * Float4(1.0f / 16.0f);
623 			Float4 T = *Pointer<Float4>(data + OFFSET(DrawData,Y0x16)) * Float4(1.0f / 16.0f);
624 
625 			o[pos].x = (o[pos].x - L) / W * rhw;
626 			o[pos].y = (o[pos].y - T) / H * rhw;
627 			o[pos].z = o[pos].z * rhw;
628 			o[pos].w = rhw;
629 		}
630 
631 		if(!halfIntegerCoordinates && !state.preTransformed)
632 		{
633 			o[pos].x = o[pos].x + *Pointer<Float4>(data + OFFSET(DrawData,halfPixelX)) * o[pos].w;
634 			o[pos].y = o[pos].y + *Pointer<Float4>(data + OFFSET(DrawData,halfPixelY)) * o[pos].w;
635 		}
636 
637 		if(state.superSampling)
638 		{
639 			o[pos].x = o[pos].x + *Pointer<Float4>(data + OFFSET(DrawData,XXXX)) * o[pos].w;
640 			o[pos].y = o[pos].y + *Pointer<Float4>(data + OFFSET(DrawData,YYYY)) * o[pos].w;
641 		}
642 	}
643 
writeCache(Pointer<Byte> & cacheLine)644 	void VertexRoutine::writeCache(Pointer<Byte> &cacheLine)
645 	{
646 		Vector4f v;
647 
648 		for(int i = 0; i < MAX_VERTEX_OUTPUTS; i++)
649 		{
650 			if(state.output[i].write)
651 			{
652 				v.x = o[i].x;
653 				v.y = o[i].y;
654 				v.z = o[i].z;
655 				v.w = o[i].w;
656 
657 				if(state.output[i].xClamp)
658 				{
659 					v.x = Max(v.x, Float4(0.0f));
660 					v.x = Min(v.x, Float4(1.0f));
661 				}
662 
663 				if(state.output[i].yClamp)
664 				{
665 					v.y = Max(v.y, Float4(0.0f));
666 					v.y = Min(v.y, Float4(1.0f));
667 				}
668 
669 				if(state.output[i].zClamp)
670 				{
671 					v.z = Max(v.z, Float4(0.0f));
672 					v.z = Min(v.z, Float4(1.0f));
673 				}
674 
675 				if(state.output[i].wClamp)
676 				{
677 					v.w = Max(v.w, Float4(0.0f));
678 					v.w = Min(v.w, Float4(1.0f));
679 				}
680 
681 				if(state.output[i].write == 0x01)
682 				{
683 					*Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 0) = v.x.x;
684 					*Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 1) = v.x.y;
685 					*Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 2) = v.x.z;
686 					*Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 3) = v.x.w;
687 				}
688 				else
689 				{
690 					if(state.output[i].write == 0x03)
691 					{
692 						transpose2x4(v.x, v.y, v.z, v.w);
693 					}
694 					else
695 					{
696 						transpose4x4(v.x, v.y, v.z, v.w);
697 					}
698 
699 					*Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 0, 16) = v.x;
700 					*Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 1, 16) = v.y;
701 					*Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 2, 16) = v.z;
702 					*Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 3, 16) = v.w;
703 				}
704 			}
705 		}
706 
707 		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 0) = (clipFlags >> 0)  & 0x0000000FF;
708 		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 1) = (clipFlags >> 8)  & 0x0000000FF;
709 		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 2) = (clipFlags >> 16) & 0x0000000FF;
710 		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 3) = (clipFlags >> 24) & 0x0000000FF;
711 
712 		// Viewport transform
713 		int pos = state.positionRegister;
714 
715 		v.x = o[pos].x;
716 		v.y = o[pos].y;
717 		v.z = o[pos].z;
718 		v.w = o[pos].w;
719 
720 		if(symmetricNormalizedDepth)
721 		{
722 			v.z = (v.z + v.w) * Float4(0.5f);   // [-1, 1] -> [0, 1]
723 		}
724 
725 		Float4 w = As<Float4>(As<Int4>(v.w) | (As<Int4>(CmpEQ(v.w, Float4(0.0f))) & As<Int4>(Float4(1.0f))));
726 		Float4 rhw = Float4(1.0f) / w;
727 
728 		v.x = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData,X0x16)) + v.x * rhw * *Pointer<Float4>(data + OFFSET(DrawData,Wx16))));
729 		v.y = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData,Y0x16)) + v.y * rhw * *Pointer<Float4>(data + OFFSET(DrawData,Hx16))));
730 		v.z = v.z * rhw;
731 		v.w = rhw;
732 
733 		transpose4x4(v.x, v.y, v.z, v.w);
734 
735 		*Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 0, 16) = v.x;
736 		*Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 1, 16) = v.y;
737 		*Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 2, 16) = v.z;
738 		*Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 3, 16) = v.w;
739 	}
740 
writeVertex(const Pointer<Byte> & vertex,Pointer<Byte> & cache)741 	void VertexRoutine::writeVertex(const Pointer<Byte> &vertex, Pointer<Byte> &cache)
742 	{
743 		for(int i = 0; i < MAX_VERTEX_OUTPUTS; i++)
744 		{
745 			if(state.output[i].write)
746 			{
747 				*Pointer<Int4>(vertex + OFFSET(Vertex,v[i]), 16) = *Pointer<Int4>(cache + OFFSET(Vertex,v[i]), 16);
748 			}
749 		}
750 
751 		*Pointer<Int4>(vertex + OFFSET(Vertex,X)) = *Pointer<Int4>(cache + OFFSET(Vertex,X));
752 		*Pointer<Int>(vertex + OFFSET(Vertex,clipFlags)) = *Pointer<Int>(cache + OFFSET(Vertex,clipFlags));
753 	}
754 
transformFeedback(const Pointer<Byte> & vertex,const UInt & primitiveNumber,const UInt & indexInPrimitive)755 	void VertexRoutine::transformFeedback(const Pointer<Byte> &vertex, const UInt &primitiveNumber, const UInt &indexInPrimitive)
756 	{
757 		If(indexInPrimitive < state.verticesPerPrimitive)
758 		{
759 			UInt tOffset = primitiveNumber * state.verticesPerPrimitive + indexInPrimitive;
760 
761 			for(int i = 0; i < MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS; i++)
762 			{
763 				if(state.transformFeedbackEnabled & (1ULL << i))
764 				{
765 					UInt reg = *Pointer<UInt>(data + OFFSET(DrawData, vs.reg[i]));
766 					UInt row = *Pointer<UInt>(data + OFFSET(DrawData, vs.row[i]));
767 					UInt col = *Pointer<UInt>(data + OFFSET(DrawData, vs.col[i]));
768 					UInt str = *Pointer<UInt>(data + OFFSET(DrawData, vs.str[i]));
769 
770 					Pointer<Byte> t = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData, vs.t[i])) + (tOffset * str * sizeof(float));
771 					Pointer<Byte> v = vertex + OFFSET(Vertex, v) + reg * sizeof(float);
772 
773 					For(UInt r = 0, r < row, r++)
774 					{
775 						UInt rOffsetX = r * col * sizeof(float);
776 						UInt rOffset4 = r * sizeof(float4);
777 
778 						For(UInt c = 0, c < col, c++)
779 						{
780 							UInt cOffset = c * sizeof(float);
781 							*Pointer<Float>(t + rOffsetX + cOffset) = *Pointer<Float>(v + rOffset4 + cOffset);
782 						}
783 					}
784 				}
785 			}
786 		}
787 	}
788 }
789