1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "VertexRoutine.hpp"
16 
17 #include "VertexShader.hpp"
18 #include "Vertex.hpp"
19 #include "Half.hpp"
20 #include "Renderer.hpp"
21 #include "Constants.hpp"
22 #include "Debug.hpp"
23 
24 namespace sw
25 {
26 	extern bool halfIntegerCoordinates;     // Pixel centers are not at integer coordinates
27 	extern bool symmetricNormalizedDepth;   // [-1, 1] instead of [0, 1]
28 
VertexRoutine(const VertexProcessor::State & state,const VertexShader * shader)29 	VertexRoutine::VertexRoutine(const VertexProcessor::State &state, const VertexShader *shader)
30 		: v(shader && shader->dynamicallyIndexedInput),
31 		  o(shader && shader->dynamicallyIndexedOutput),
32 		  state(state)
33 	{
34 	}
35 
~VertexRoutine()36 	VertexRoutine::~VertexRoutine()
37 	{
38 	}
39 
generate()40 	void VertexRoutine::generate()
41 	{
42 		const bool textureSampling = state.textureSampling;
43 
44 		Pointer<Byte> cache = task + OFFSET(VertexTask,vertexCache);
45 		Pointer<Byte> vertexCache = cache + OFFSET(VertexCache,vertex);
46 		Pointer<Byte> tagCache = cache + OFFSET(VertexCache,tag);
47 
48 		UInt vertexCount = *Pointer<UInt>(task + OFFSET(VertexTask,vertexCount));
49 		UInt primitiveNumber = *Pointer<UInt>(task + OFFSET(VertexTask, primitiveStart));
50 		UInt indexInPrimitive = 0;
51 
52 		constants = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,constants));
53 
54 		Do
55 		{
56 			UInt index = *Pointer<UInt>(batch);
57 			UInt tagIndex = index & 0x0000003C;
58 			UInt indexQ = !textureSampling ? UInt(index & 0xFFFFFFFC) : index;   // FIXME: TEXLDL hack to have independent LODs, hurts performance.
59 
60 			If(*Pointer<UInt>(tagCache + tagIndex) != indexQ)
61 			{
62 				*Pointer<UInt>(tagCache + tagIndex) = indexQ;
63 
64 				readInput(indexQ);
65 				pipeline();
66 				postTransform();
67 				computeClipFlags();
68 
69 				Pointer<Byte> cacheLine0 = vertexCache + tagIndex * UInt((int)sizeof(Vertex));
70 				writeCache(cacheLine0);
71 			}
72 
73 			UInt cacheIndex = index & 0x0000003F;
74 			Pointer<Byte> cacheLine = vertexCache + cacheIndex * UInt((int)sizeof(Vertex));
75 			writeVertex(vertex, cacheLine);
76 
77 			if(state.transformFeedbackEnabled != 0)
78 			{
79 				transformFeedback(vertex, primitiveNumber, indexInPrimitive);
80 
81 				indexInPrimitive++;
82 				If(indexInPrimitive == 3)
83 				{
84 					primitiveNumber++;
85 					indexInPrimitive = 0;
86 				}
87 			}
88 
89 			vertex += sizeof(Vertex);
90 			batch += sizeof(unsigned int);
91 			vertexCount--;
92 		}
93 		Until(vertexCount == 0)
94 
95 		Return();
96 	}
97 
readInput(UInt & index)98 	void VertexRoutine::readInput(UInt &index)
99 	{
100 		for(int i = 0; i < MAX_VERTEX_INPUTS; i++)
101 		{
102 			Pointer<Byte> input = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,input) + sizeof(void*) * i);
103 			UInt stride = *Pointer<UInt>(data + OFFSET(DrawData,stride) + sizeof(unsigned int) * i);
104 
105 			v[i] = readStream(input, stride, state.input[i], index);
106 		}
107 	}
108 
computeClipFlags()109 	void VertexRoutine::computeClipFlags()
110 	{
111 		int pos = state.positionRegister;
112 
113 		Int4 maxX = CmpLT(o[pos].w, o[pos].x);
114 		Int4 maxY = CmpLT(o[pos].w, o[pos].y);
115 		Int4 maxZ = CmpLT(o[pos].w, o[pos].z);
116 		Int4 minX = CmpNLE(-o[pos].w, o[pos].x);
117 		Int4 minY = CmpNLE(-o[pos].w, o[pos].y);
118 		Int4 minZ = symmetricNormalizedDepth ? CmpNLE(-o[pos].w, o[pos].z) : CmpNLE(Float4(0.0f), o[pos].z);
119 
120 		clipFlags = *Pointer<Int>(constants + OFFSET(Constants,maxX) + SignMask(maxX) * 4);   // FIXME: Array indexing
121 		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,maxY) + SignMask(maxY) * 4);
122 		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,maxZ) + SignMask(maxZ) * 4);
123 		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,minX) + SignMask(minX) * 4);
124 		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,minY) + SignMask(minY) * 4);
125 		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,minZ) + SignMask(minZ) * 4);
126 
127 		Int4 finiteX = CmpLE(Abs(o[pos].x), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
128 		Int4 finiteY = CmpLE(Abs(o[pos].y), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
129 		Int4 finiteZ = CmpLE(Abs(o[pos].z), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
130 
131 		Int4 finiteXYZ = finiteX & finiteY & finiteZ;
132 		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,fini) + SignMask(finiteXYZ) * 4);
133 
134 		if(state.preTransformed)
135 		{
136 			clipFlags &= 0xFBFBFBFB;   // Don't clip against far clip plane
137 		}
138 	}
139 
readStream(Pointer<Byte> & buffer,UInt & stride,const Stream & stream,const UInt & index)140 	Vector4f VertexRoutine::readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, const UInt &index)
141 	{
142 		const bool textureSampling = state.textureSampling;
143 
144 		Vector4f v;
145 
146 		Pointer<Byte> source0 = buffer + index * stride;
147 		Pointer<Byte> source1 = source0 + (!textureSampling ? stride : 0);
148 		Pointer<Byte> source2 = source1 + (!textureSampling ? stride : 0);
149 		Pointer<Byte> source3 = source2 + (!textureSampling ? stride : 0);
150 
151 		switch(stream.type)
152 		{
153 		case STREAMTYPE_FLOAT:
154 			{
155 				if(stream.count == 0)
156 				{
157 					// Null stream, all default components
158 				}
159 				else if(stream.count == 1)
160 				{
161 					v.x.x = *Pointer<Float>(source0);
162 					v.x.y = *Pointer<Float>(source1);
163 					v.x.z = *Pointer<Float>(source2);
164 					v.x.w = *Pointer<Float>(source3);
165 				}
166 				else
167 				{
168 					v.x = *Pointer<Float4>(source0);
169 					v.y = *Pointer<Float4>(source1);
170 					v.z = *Pointer<Float4>(source2);
171 					v.w = *Pointer<Float4>(source3);
172 
173 					transpose4xN(v.x, v.y, v.z, v.w, stream.count);
174 				}
175 			}
176 			break;
177 		case STREAMTYPE_BYTE:
178 			{
179 				v.x = Float4(*Pointer<Byte4>(source0));
180 				v.y = Float4(*Pointer<Byte4>(source1));
181 				v.z = Float4(*Pointer<Byte4>(source2));
182 				v.w = Float4(*Pointer<Byte4>(source3));
183 
184 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
185 
186 				if(stream.normalized)
187 				{
188 					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
189 					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
190 					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
191 					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
192 				}
193 			}
194 			break;
195 		case STREAMTYPE_SBYTE:
196 			{
197 				v.x = Float4(*Pointer<SByte4>(source0));
198 				v.y = Float4(*Pointer<SByte4>(source1));
199 				v.z = Float4(*Pointer<SByte4>(source2));
200 				v.w = Float4(*Pointer<SByte4>(source3));
201 
202 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
203 
204 				if(stream.normalized)
205 				{
206 					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
207 					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
208 					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
209 					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
210 				}
211 			}
212 			break;
213 		case STREAMTYPE_COLOR:
214 			{
215 				v.x = Float4(*Pointer<Byte4>(source0)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
216 				v.y = Float4(*Pointer<Byte4>(source1)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
217 				v.z = Float4(*Pointer<Byte4>(source2)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
218 				v.w = Float4(*Pointer<Byte4>(source3)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
219 
220 				transpose4x4(v.x, v.y, v.z, v.w);
221 
222 				// Swap red and blue
223 				Float4 t = v.x;
224 				v.x = v.z;
225 				v.z = t;
226 			}
227 			break;
228 		case STREAMTYPE_SHORT:
229 			{
230 				v.x = Float4(*Pointer<Short4>(source0));
231 				v.y = Float4(*Pointer<Short4>(source1));
232 				v.z = Float4(*Pointer<Short4>(source2));
233 				v.w = Float4(*Pointer<Short4>(source3));
234 
235 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
236 
237 				if(stream.normalized)
238 				{
239 					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
240 					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
241 					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
242 					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
243 				}
244 			}
245 			break;
246 		case STREAMTYPE_USHORT:
247 			{
248 				v.x = Float4(*Pointer<UShort4>(source0));
249 				v.y = Float4(*Pointer<UShort4>(source1));
250 				v.z = Float4(*Pointer<UShort4>(source2));
251 				v.w = Float4(*Pointer<UShort4>(source3));
252 
253 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
254 
255 				if(stream.normalized)
256 				{
257 					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
258 					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
259 					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
260 					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
261 				}
262 			}
263 			break;
264 		case STREAMTYPE_INT:
265 			{
266 				if(stream.normalized)
267 				{
268 					v.x = Float4(*Pointer<Int4>(source0));
269 					v.y = Float4(*Pointer<Int4>(source1));
270 					v.z = Float4(*Pointer<Int4>(source2));
271 					v.w = Float4(*Pointer<Int4>(source3));
272 
273 					transpose4xN(v.x, v.y, v.z, v.w, stream.count);
274 
275 					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
276 					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
277 					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
278 					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
279 				}
280 				else
281 				{
282 					v.x = As<Float4>(*Pointer<Int4>(source0));
283 					v.y = As<Float4>(*Pointer<Int4>(source1));
284 					v.z = As<Float4>(*Pointer<Int4>(source2));
285 					v.w = As<Float4>(*Pointer<Int4>(source3));
286 
287 					transpose4xN(v.x, v.y, v.z, v.w, stream.count);
288 				}
289 			}
290 			break;
291 		case STREAMTYPE_UINT:
292 			{
293 				if(stream.normalized)
294 				{
295 					v.x = Float4(*Pointer<UInt4>(source0));
296 					v.y = Float4(*Pointer<UInt4>(source1));
297 					v.z = Float4(*Pointer<UInt4>(source2));
298 					v.w = Float4(*Pointer<UInt4>(source3));
299 
300 					transpose4xN(v.x, v.y, v.z, v.w, stream.count);
301 
302 					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
303 					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
304 					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
305 					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
306 				}
307 				else
308 				{
309 					v.x = As<Float4>(*Pointer<UInt4>(source0));
310 					v.y = As<Float4>(*Pointer<UInt4>(source1));
311 					v.z = As<Float4>(*Pointer<UInt4>(source2));
312 					v.w = As<Float4>(*Pointer<UInt4>(source3));
313 
314 					transpose4xN(v.x, v.y, v.z, v.w, stream.count);
315 				}
316 			}
317 			break;
318 		case STREAMTYPE_UDEC3:
319 			{
320 				// FIXME: Vectorize
321 				{
322 					Int x, y, z;
323 
324 					x = y = z = *Pointer<Int>(source0);
325 
326 					v.x.x = Float(x & 0x000003FF);
327 					v.x.y = Float(y & 0x000FFC00);
328 					v.x.z = Float(z & 0x3FF00000);
329 				}
330 
331 				{
332 					Int x, y, z;
333 
334 					x = y = z = *Pointer<Int>(source1);
335 
336 					v.y.x = Float(x & 0x000003FF);
337 					v.y.y = Float(y & 0x000FFC00);
338 					v.y.z = Float(z & 0x3FF00000);
339 				}
340 
341 				{
342 					Int x, y, z;
343 
344 					x = y = z = *Pointer<Int>(source2);
345 
346 					v.z.x = Float(x & 0x000003FF);
347 					v.z.y = Float(y & 0x000FFC00);
348 					v.z.z = Float(z & 0x3FF00000);
349 				}
350 
351 				{
352 					Int x, y, z;
353 
354 					x = y = z = *Pointer<Int>(source3);
355 
356 					v.w.x = Float(x & 0x000003FF);
357 					v.w.y = Float(y & 0x000FFC00);
358 					v.w.z = Float(z & 0x3FF00000);
359 				}
360 
361 				transpose4x3(v.x, v.y, v.z, v.w);
362 
363 				v.y *= Float4(1.0f / 0x00000400);
364 				v.z *= Float4(1.0f / 0x00100000);
365 			}
366 			break;
367 		case STREAMTYPE_DEC3N:
368 			{
369 				// FIXME: Vectorize
370 				{
371 					Int x, y, z;
372 
373 					x = y = z = *Pointer<Int>(source0);
374 
375 					v.x.x = Float((x << 22) & 0xFFC00000);
376 					v.x.y = Float((y << 12) & 0xFFC00000);
377 					v.x.z = Float((z << 2)  & 0xFFC00000);
378 				}
379 
380 				{
381 					Int x, y, z;
382 
383 					x = y = z = *Pointer<Int>(source1);
384 
385 					v.y.x = Float((x << 22) & 0xFFC00000);
386 					v.y.y = Float((y << 12) & 0xFFC00000);
387 					v.y.z = Float((z << 2)  & 0xFFC00000);
388 				}
389 
390 				{
391 					Int x, y, z;
392 
393 					x = y = z = *Pointer<Int>(source2);
394 
395 					v.z.x = Float((x << 22) & 0xFFC00000);
396 					v.z.y = Float((y << 12) & 0xFFC00000);
397 					v.z.z = Float((z << 2)  & 0xFFC00000);
398 				}
399 
400 				{
401 					Int x, y, z;
402 
403 					x = y = z = *Pointer<Int>(source3);
404 
405 					v.w.x = Float((x << 22) & 0xFFC00000);
406 					v.w.y = Float((y << 12) & 0xFFC00000);
407 					v.w.z = Float((z << 2)  & 0xFFC00000);
408 				}
409 
410 				transpose4x3(v.x, v.y, v.z, v.w);
411 
412 				v.x *= Float4(1.0f / 0x00400000 / 511.0f);
413 				v.y *= Float4(1.0f / 0x00400000 / 511.0f);
414 				v.z *= Float4(1.0f / 0x00400000 / 511.0f);
415 			}
416 			break;
417 		case STREAMTYPE_FIXED:
418 			{
419 				v.x = Float4(*Pointer<Int4>(source0)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
420 				v.y = Float4(*Pointer<Int4>(source1)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
421 				v.z = Float4(*Pointer<Int4>(source2)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
422 				v.w = Float4(*Pointer<Int4>(source3)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
423 
424 				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
425 			}
426 			break;
427 		case STREAMTYPE_HALF:
428 			{
429 				if(stream.count >= 1)
430 				{
431 					UShort x0 = *Pointer<UShort>(source0 + 0);
432 					UShort x1 = *Pointer<UShort>(source1 + 0);
433 					UShort x2 = *Pointer<UShort>(source2 + 0);
434 					UShort x3 = *Pointer<UShort>(source3 + 0);
435 
436 					v.x.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x0) * 4);
437 					v.x.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x1) * 4);
438 					v.x.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x2) * 4);
439 					v.x.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x3) * 4);
440 				}
441 
442 				if(stream.count >= 2)
443 				{
444 					UShort y0 = *Pointer<UShort>(source0 + 2);
445 					UShort y1 = *Pointer<UShort>(source1 + 2);
446 					UShort y2 = *Pointer<UShort>(source2 + 2);
447 					UShort y3 = *Pointer<UShort>(source3 + 2);
448 
449 					v.y.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y0) * 4);
450 					v.y.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y1) * 4);
451 					v.y.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y2) * 4);
452 					v.y.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y3) * 4);
453 				}
454 
455 				if(stream.count >= 3)
456 				{
457 					UShort z0 = *Pointer<UShort>(source0 + 4);
458 					UShort z1 = *Pointer<UShort>(source1 + 4);
459 					UShort z2 = *Pointer<UShort>(source2 + 4);
460 					UShort z3 = *Pointer<UShort>(source3 + 4);
461 
462 					v.z.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z0) * 4);
463 					v.z.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z1) * 4);
464 					v.z.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z2) * 4);
465 					v.z.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z3) * 4);
466 				}
467 
468 				if(stream.count >= 4)
469 				{
470 					UShort w0 = *Pointer<UShort>(source0 + 6);
471 					UShort w1 = *Pointer<UShort>(source1 + 6);
472 					UShort w2 = *Pointer<UShort>(source2 + 6);
473 					UShort w3 = *Pointer<UShort>(source3 + 6);
474 
475 					v.w.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w0) * 4);
476 					v.w.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w1) * 4);
477 					v.w.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w2) * 4);
478 					v.w.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w3) * 4);
479 				}
480 			}
481 			break;
482 		case STREAMTYPE_INDICES:
483 			{
484 				v.x.x = *Pointer<Float>(source0);
485 				v.x.y = *Pointer<Float>(source1);
486 				v.x.z = *Pointer<Float>(source2);
487 				v.x.w = *Pointer<Float>(source3);
488 			}
489 			break;
490 		case STREAMTYPE_2_10_10_10_INT:
491 			{
492 				Int4 src;
493 				src = Insert(src, *Pointer<Int>(source0), 0);
494 				src = Insert(src, *Pointer<Int>(source1), 1);
495 				src = Insert(src, *Pointer<Int>(source2), 2);
496 				src = Insert(src, *Pointer<Int>(source3), 3);
497 
498 				v.x = Float4((src << 22) >> 22);
499 				v.y = Float4((src << 12) >> 22);
500 				v.z = Float4((src << 02) >> 22);
501 				v.w = Float4(src >> 30);
502 
503 				if(stream.normalized)
504 				{
505 					v.x = Max(v.x * Float4(1.0f / 0x1FF), Float4(-1.0f));
506 					v.y = Max(v.y * Float4(1.0f / 0x1FF), Float4(-1.0f));
507 					v.z = Max(v.z * Float4(1.0f / 0x1FF), Float4(-1.0f));
508 					v.w = Max(v.w, Float4(-1.0f));
509 				}
510 			}
511 			break;
512 		case STREAMTYPE_2_10_10_10_UINT:
513 			{
514 				Int4 src;
515 				src = Insert(src, *Pointer<Int>(source0), 0);
516 				src = Insert(src, *Pointer<Int>(source1), 1);
517 				src = Insert(src, *Pointer<Int>(source2), 2);
518 				src = Insert(src, *Pointer<Int>(source3), 3);
519 
520 				v.x = Float4(src & Int4(0x3FF));
521 				v.y = Float4((src >> 10) & Int4(0x3FF));
522 				v.z = Float4((src >> 20) & Int4(0x3FF));
523 				v.w = Float4((src >> 30) & Int4(0x3));
524 
525 				if(stream.normalized)
526 				{
527 					v.x *= Float4(1.0f / 0x3FF);
528 					v.y *= Float4(1.0f / 0x3FF);
529 					v.z *= Float4(1.0f / 0x3FF);
530 					v.w *= Float4(1.0f / 0x3);
531 				}
532 			}
533 			break;
534 		default:
535 			ASSERT(false);
536 		}
537 
538 		if(stream.count < 1) v.x = Float4(0.0f);
539 		if(stream.count < 2) v.y = Float4(0.0f);
540 		if(stream.count < 3) v.z = Float4(0.0f);
541 		if(stream.count < 4) v.w = Float4(1.0f);
542 
543 		return v;
544 	}
545 
postTransform()546 	void VertexRoutine::postTransform()
547 	{
548 		int pos = state.positionRegister;
549 
550 		// Backtransform
551 		if(state.preTransformed)
552 		{
553 			Float4 rhw = Float4(1.0f) / o[pos].w;
554 
555 			Float4 W = *Pointer<Float4>(data + OFFSET(DrawData,Wx16)) * Float4(1.0f / 16.0f);
556 			Float4 H = *Pointer<Float4>(data + OFFSET(DrawData,Hx16)) * Float4(1.0f / 16.0f);
557 			Float4 L = *Pointer<Float4>(data + OFFSET(DrawData,X0x16)) * Float4(1.0f / 16.0f);
558 			Float4 T = *Pointer<Float4>(data + OFFSET(DrawData,Y0x16)) * Float4(1.0f / 16.0f);
559 
560 			o[pos].x = (o[pos].x - L) / W * rhw;
561 			o[pos].y = (o[pos].y - T) / H * rhw;
562 			o[pos].z = o[pos].z * rhw;
563 			o[pos].w = rhw;
564 		}
565 
566 		if(!halfIntegerCoordinates && !state.preTransformed)
567 		{
568 			o[pos].x = o[pos].x + *Pointer<Float4>(data + OFFSET(DrawData,halfPixelX)) * o[pos].w;
569 			o[pos].y = o[pos].y + *Pointer<Float4>(data + OFFSET(DrawData,halfPixelY)) * o[pos].w;
570 		}
571 
572 		if(state.superSampling)
573 		{
574 			o[pos].x = o[pos].x + *Pointer<Float4>(data + OFFSET(DrawData,XXXX)) * o[pos].w;
575 			o[pos].y = o[pos].y + *Pointer<Float4>(data + OFFSET(DrawData,YYYY)) * o[pos].w;
576 		}
577 	}
578 
writeCache(Pointer<Byte> & cacheLine)579 	void VertexRoutine::writeCache(Pointer<Byte> &cacheLine)
580 	{
581 		Vector4f v;
582 
583 		for(int i = 0; i < MAX_VERTEX_OUTPUTS; i++)
584 		{
585 			if(state.output[i].write)
586 			{
587 				v.x = o[i].x;
588 				v.y = o[i].y;
589 				v.z = o[i].z;
590 				v.w = o[i].w;
591 
592 				if(state.output[i].xClamp)
593 				{
594 					v.x = Max(v.x, Float4(0.0f));
595 					v.x = Min(v.x, Float4(1.0f));
596 				}
597 
598 				if(state.output[i].yClamp)
599 				{
600 					v.y = Max(v.y, Float4(0.0f));
601 					v.y = Min(v.y, Float4(1.0f));
602 				}
603 
604 				if(state.output[i].zClamp)
605 				{
606 					v.z = Max(v.z, Float4(0.0f));
607 					v.z = Min(v.z, Float4(1.0f));
608 				}
609 
610 				if(state.output[i].wClamp)
611 				{
612 					v.w = Max(v.w, Float4(0.0f));
613 					v.w = Min(v.w, Float4(1.0f));
614 				}
615 
616 				if(state.output[i].write == 0x01)
617 				{
618 					*Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 0) = v.x.x;
619 					*Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 1) = v.x.y;
620 					*Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 2) = v.x.z;
621 					*Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 3) = v.x.w;
622 				}
623 				else
624 				{
625 					if(state.output[i].write == 0x02)
626 					{
627 						transpose2x4(v.x, v.y, v.z, v.w);
628 					}
629 					else
630 					{
631 						transpose4x4(v.x, v.y, v.z, v.w);
632 					}
633 
634 					*Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 0, 16) = v.x;
635 					*Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 1, 16) = v.y;
636 					*Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 2, 16) = v.z;
637 					*Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 3, 16) = v.w;
638 				}
639 			}
640 		}
641 
642 		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 0) = (clipFlags >> 0)  & 0x0000000FF;
643 		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 1) = (clipFlags >> 8)  & 0x0000000FF;
644 		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 2) = (clipFlags >> 16) & 0x0000000FF;
645 		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 3) = (clipFlags >> 24) & 0x0000000FF;
646 
647 		// Viewport transform
648 		int pos = state.positionRegister;
649 
650 		v.x = o[pos].x;
651 		v.y = o[pos].y;
652 		v.z = o[pos].z;
653 		v.w = o[pos].w;
654 
655 		if(symmetricNormalizedDepth)
656 		{
657 			v.z = (v.z + v.w) * Float4(0.5f);   // [-1, 1] -> [0, 1]
658 		}
659 
660 		Float4 w = As<Float4>(As<Int4>(v.w) | (As<Int4>(CmpEQ(v.w, Float4(0.0f))) & As<Int4>(Float4(1.0f))));
661 		Float4 rhw = Float4(1.0f) / w;
662 
663 		v.x = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData,X0x16)) + v.x * rhw * *Pointer<Float4>(data + OFFSET(DrawData,Wx16))));
664 		v.y = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData,Y0x16)) + v.y * rhw * *Pointer<Float4>(data + OFFSET(DrawData,Hx16))));
665 		v.z = v.z * rhw;
666 		v.w = rhw;
667 
668 		transpose4x4(v.x, v.y, v.z, v.w);
669 
670 		*Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 0, 16) = v.x;
671 		*Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 1, 16) = v.y;
672 		*Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 2, 16) = v.z;
673 		*Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 3, 16) = v.w;
674 	}
675 
writeVertex(const Pointer<Byte> & vertex,Pointer<Byte> & cache)676 	void VertexRoutine::writeVertex(const Pointer<Byte> &vertex, Pointer<Byte> &cache)
677 	{
678 		for(int i = 0; i < MAX_VERTEX_OUTPUTS; i++)
679 		{
680 			if(state.output[i].write)
681 			{
682 				*Pointer<Int4>(vertex + OFFSET(Vertex,v[i]), 16) = *Pointer<Int4>(cache + OFFSET(Vertex,v[i]), 16);
683 			}
684 		}
685 
686 		*Pointer<Int4>(vertex + OFFSET(Vertex,X)) = *Pointer<Int4>(cache + OFFSET(Vertex,X));
687 		*Pointer<Int>(vertex + OFFSET(Vertex,clipFlags)) = *Pointer<Int>(cache + OFFSET(Vertex,clipFlags));
688 	}
689 
transformFeedback(const Pointer<Byte> & vertex,const UInt & primitiveNumber,const UInt & indexInPrimitive)690 	void VertexRoutine::transformFeedback(const Pointer<Byte> &vertex, const UInt &primitiveNumber, const UInt &indexInPrimitive)
691 	{
692 		If(indexInPrimitive < state.verticesPerPrimitive)
693 		{
694 			UInt tOffset = primitiveNumber * state.verticesPerPrimitive + indexInPrimitive;
695 
696 			for(int i = 0; i < MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS; i++)
697 			{
698 				if(state.transformFeedbackEnabled & (1ULL << i))
699 				{
700 					UInt reg = *Pointer<UInt>(data + OFFSET(DrawData, vs.reg[i]));
701 					UInt row = *Pointer<UInt>(data + OFFSET(DrawData, vs.row[i]));
702 					UInt col = *Pointer<UInt>(data + OFFSET(DrawData, vs.col[i]));
703 					UInt str = *Pointer<UInt>(data + OFFSET(DrawData, vs.str[i]));
704 
705 					Pointer<Byte> t = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData, vs.t[i])) + (tOffset * str * sizeof(float));
706 					Pointer<Byte> v = vertex + OFFSET(Vertex, v) + reg * sizeof(float);
707 
708 					For(UInt r = 0, r < row, r++)
709 					{
710 						UInt rOffsetX = r * col * sizeof(float);
711 						UInt rOffset4 = r * sizeof(float4);
712 
713 						For(UInt c = 0, c < col, c++)
714 						{
715 							UInt cOffset = c * sizeof(float);
716 							*Pointer<Float>(t + rOffsetX + cOffset) = *Pointer<Float>(v + rOffset4 + cOffset);
717 						}
718 					}
719 				}
720 			}
721 		}
722 	}
723 }
724