1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file rasterizer.cpp
24 *
25 * @brief Implementation for the rasterizer.
26 *
27 ******************************************************************************/
28 
29 #include <vector>
30 #include <algorithm>
31 
32 #include "rasterizer.h"
33 #include "backends/gen_rasterizer.hpp"
34 #include "rdtsc_core.h"
35 #include "backend.h"
36 #include "utils.h"
37 #include "frontend.h"
38 #include "tilemgr.h"
39 #include "memory/tilingtraits.h"
40 #include "rasterizer_impl.h"
41 
// Table of triangle rasterizer entry points. GetRasterizerFunc() indexes it as
// [sample count][center pattern][conservative][input coverage][valid edge state][rasterize scissor edges].
PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPUT_COVERAGE_COUNT][STATE_VALID_TRI_EDGE_COUNT][2];
43 
RasterizeLine(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t macroTile,void * pData)44 void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
45 {
46     SWR_CONTEXT *pContext = pDC->pContext;
47     const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pData);
48 #if KNOB_ENABLE_TOSS_POINTS
49     if (KNOB_TOSS_BIN_TRIS)
50     {
51         return;
52     }
53 #endif
54 
55     // bloat line to two tris and call the triangle rasterizer twice
56     AR_BEGIN(BERasterizeLine, pDC->drawId);
57 
58     const API_STATE &state = GetApiState(pDC);
59     const SWR_RASTSTATE &rastState = state.rastState;
60 
61     // macrotile dimensioning
62     uint32_t macroX, macroY;
63     MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
64     int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED;
65     int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
66     int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
67     int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
68 
69     const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];
70 
71     // create a copy of the triangle buffer to write our adjusted vertices to
72     OSALIGNSIMD(float) newTriBuffer[4 * 4];
73     TRIANGLE_WORK_DESC newWorkDesc = workDesc;
74     newWorkDesc.pTriBuffer = &newTriBuffer[0];
75 
76     // create a copy of the attrib buffer to write our adjusted attribs to
77     OSALIGNSIMD(float) newAttribBuffer[4 * 3 * SWR_VTX_NUM_SLOTS];
78     newWorkDesc.pAttribs = &newAttribBuffer[0];
79 
80     const __m128 vBloat0 = _mm_set_ps(0.5f, -0.5f, -0.5f, 0.5f);
81     const __m128 vBloat1 = _mm_set_ps(0.5f, 0.5f, 0.5f, -0.5f);
82 
83     __m128 vX, vY, vZ, vRecipW;
84 
85     vX = _mm_load_ps(workDesc.pTriBuffer);
86     vY = _mm_load_ps(workDesc.pTriBuffer + 4);
87     vZ = _mm_load_ps(workDesc.pTriBuffer + 8);
88     vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);
89 
90     // triangle 0
91     // v0,v1 -> v0,v0,v1
92     __m128 vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 1, 0, 0));
93     __m128 vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 1, 0, 0));
94     __m128 vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 1, 0, 0));
95     __m128 vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 1, 0, 0));
96 
97     __m128 vLineWidth = _mm_set1_ps(pDC->pState->state.rastState.lineWidth);
98     __m128 vAdjust = _mm_mul_ps(vLineWidth, vBloat0);
99     if (workDesc.triFlags.yMajor)
100     {
101         vXa = _mm_add_ps(vAdjust, vXa);
102     }
103     else
104     {
105         vYa = _mm_add_ps(vAdjust, vYa);
106     }
107 
108     // Store triangle description for rasterizer
109     _mm_store_ps((float*)&newTriBuffer[0], vXa);
110     _mm_store_ps((float*)&newTriBuffer[4], vYa);
111     _mm_store_ps((float*)&newTriBuffer[8], vZa);
112     _mm_store_ps((float*)&newTriBuffer[12], vRecipWa);
113 
114     // binner bins 3 edges for lines as v0, v1, v1
115     // tri0 needs v0, v0, v1
116     for (uint32_t a = 0; a < workDesc.numAttribs; ++a)
117     {
118         __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 0]);
119         __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 4]);
120 
121         _mm_store_ps((float*)&newAttribBuffer[a * 12 + 0], vAttrib0);
122         _mm_store_ps((float*)&newAttribBuffer[a * 12 + 4], vAttrib0);
123         _mm_store_ps((float*)&newAttribBuffer[a * 12 + 8], vAttrib1);
124     }
125 
126     // Store user clip distances for triangle 0
127     float newClipBuffer[3 * 8];
128     uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask);
129     if (numClipDist)
130     {
131         newWorkDesc.pUserClipBuffer = newClipBuffer;
132 
133         float* pOldBuffer = workDesc.pUserClipBuffer;
134         float* pNewBuffer = newClipBuffer;
135         for (uint32_t i = 0; i < numClipDist; ++i)
136         {
137             // read barycentric coeffs from binner
138             float a = *(pOldBuffer++);
139             float b = *(pOldBuffer++);
140 
141             // reconstruct original clip distance at vertices
142             float c0 = a + b;
143             float c1 = b;
144 
145             // construct triangle barycentrics
146             *(pNewBuffer++) = c0 - c1;
147             *(pNewBuffer++) = c0 - c1;
148             *(pNewBuffer++) = c1;
149         }
150     }
151 
152     // setup triangle rasterizer function
153     PFN_WORK_FUNC pfnTriRast;
154     // conservative rast not supported for points/lines
155     pfnTriRast = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, false,
156         SWR_INPUT_COVERAGE_NONE, EdgeValToEdgeState(ALL_EDGES_VALID), (pDC->pState->state.scissorsTileAligned == false));
157 
158     // make sure this macrotile intersects the triangle
159     __m128i vXai = fpToFixedPoint(vXa);
160     __m128i vYai = fpToFixedPoint(vYa);
161     OSALIGNSIMD(SWR_RECT) bboxA;
162     calcBoundingBoxInt(vXai, vYai, bboxA);
163 
164     if (!(bboxA.xmin > macroBoxRight ||
165         bboxA.xmin > scissorInFixedPoint.xmax ||
166         bboxA.xmax - 1 < macroBoxLeft ||
167         bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
168         bboxA.ymin > macroBoxBottom ||
169         bboxA.ymin > scissorInFixedPoint.ymax ||
170         bboxA.ymax - 1 < macroBoxTop ||
171         bboxA.ymax - 1 < scissorInFixedPoint.ymin)) {
172         // rasterize triangle
173         pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
174     }
175 
176     // triangle 1
177     // v0,v1 -> v1,v1,v0
178     vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 0, 1, 1));
179     vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 0, 1, 1));
180     vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 0, 1, 1));
181     vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 0, 1, 1));
182 
183     vAdjust = _mm_mul_ps(vLineWidth, vBloat1);
184     if (workDesc.triFlags.yMajor)
185     {
186         vXa = _mm_add_ps(vAdjust, vXa);
187     }
188     else
189     {
190         vYa = _mm_add_ps(vAdjust, vYa);
191     }
192 
193     // Store triangle description for rasterizer
194     _mm_store_ps((float*)&newTriBuffer[0], vXa);
195     _mm_store_ps((float*)&newTriBuffer[4], vYa);
196     _mm_store_ps((float*)&newTriBuffer[8], vZa);
197     _mm_store_ps((float*)&newTriBuffer[12], vRecipWa);
198 
199     // binner bins 3 edges for lines as v0, v1, v1
200     // tri1 needs v1, v1, v0
201     for (uint32_t a = 0; a < workDesc.numAttribs; ++a)
202     {
203         __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 0]);
204         __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 4]);
205 
206         _mm_store_ps((float*)&newAttribBuffer[a * 12 + 0], vAttrib1);
207         _mm_store_ps((float*)&newAttribBuffer[a * 12 + 4], vAttrib1);
208         _mm_store_ps((float*)&newAttribBuffer[a * 12 + 8], vAttrib0);
209     }
210 
211     // store user clip distance for triangle 1
212     if (numClipDist)
213     {
214         float* pOldBuffer = workDesc.pUserClipBuffer;
215         float* pNewBuffer = newClipBuffer;
216         for (uint32_t i = 0; i < numClipDist; ++i)
217         {
218             // read barycentric coeffs from binner
219             float a = *(pOldBuffer++);
220             float b = *(pOldBuffer++);
221 
222             // reconstruct original clip distance at vertices
223             float c0 = a + b;
224             float c1 = b;
225 
226             // construct triangle barycentrics
227             *(pNewBuffer++) = c1 - c0;
228             *(pNewBuffer++) = c1 - c0;
229             *(pNewBuffer++) = c0;
230         }
231     }
232 
233     vXai = fpToFixedPoint(vXa);
234     vYai = fpToFixedPoint(vYa);
235     calcBoundingBoxInt(vXai, vYai, bboxA);
236 
237     if (!(bboxA.xmin > macroBoxRight ||
238         bboxA.xmin > scissorInFixedPoint.xmax ||
239         bboxA.xmax - 1 < macroBoxLeft ||
240         bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
241         bboxA.ymin > macroBoxBottom ||
242         bboxA.ymin > scissorInFixedPoint.ymax ||
243         bboxA.ymax - 1 < macroBoxTop ||
244         bboxA.ymax - 1 < scissorInFixedPoint.ymin)) {
245         // rasterize triangle
246         pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
247     }
248 
249     AR_END(BERasterizeLine, 1);
250 }
251 
RasterizeSimplePoint(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t macroTile,void * pData)252 void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData)
253 {
254     SWR_CONTEXT *pContext = pDC->pContext;
255 
256 #if KNOB_ENABLE_TOSS_POINTS
257     if (KNOB_TOSS_BIN_TRIS)
258     {
259         return;
260     }
261 #endif
262 
263     const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData;
264     const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
265 
266     // map x,y relative offsets from start of raster tile to bit position in
267     // coverage mask for the point
268     static const uint32_t coverageMap[8][8] = {
269         { 0, 1, 4, 5, 8, 9, 12, 13 },
270         { 2, 3, 6, 7, 10, 11, 14, 15 },
271         { 16, 17, 20, 21, 24, 25, 28, 29 },
272         { 18, 19, 22, 23, 26, 27, 30, 31 },
273         { 32, 33, 36, 37, 40, 41, 44, 45 },
274         { 34, 35, 38, 39, 42, 43, 46, 47 },
275         { 48, 49, 52, 53, 56, 57, 60, 61 },
276         { 50, 51, 54, 55, 58, 59, 62, 63 }
277     };
278 
279     OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
280 
281     // pull point information from triangle buffer
282     // @todo use structs for readability
283     uint32_t tileAlignedX = *(uint32_t*)workDesc.pTriBuffer;
284     uint32_t tileAlignedY = *(uint32_t*)(workDesc.pTriBuffer + 1);
285     float z = *(workDesc.pTriBuffer + 2);
286 
287     // construct triangle descriptor for point
288     // no interpolation, set up i,j for constant interpolation of z and attribs
289     // @todo implement an optimized backend that doesn't require triangle information
290 
291     // compute coverage mask from x,y packed into the coverageMask flag
292     // mask indices by the maximum valid index for x/y of coveragemap.
293     uint32_t tX = workDesc.triFlags.coverageMask & 0x7;
294     uint32_t tY = (workDesc.triFlags.coverageMask >> 4) & 0x7;
295     // todo: multisample points?
296     triDesc.coverageMask[0] = 1ULL << coverageMap[tY][tX];
297 
298     // no persp divide needed for points
299     triDesc.pAttribs = triDesc.pPerspAttribs = workDesc.pAttribs;
300     triDesc.triFlags = workDesc.triFlags;
301     triDesc.recipDet = 1.0f;
302     triDesc.OneOverW[0] = triDesc.OneOverW[1] = triDesc.OneOverW[2] = 1.0f;
303     triDesc.I[0] = triDesc.I[1] = triDesc.I[2] = 0.0f;
304     triDesc.J[0] = triDesc.J[1] = triDesc.J[2] = 0.0f;
305     triDesc.Z[0] = triDesc.Z[1] = triDesc.Z[2] = z;
306 
307     RenderOutputBuffers renderBuffers;
308     GetRenderHotTiles(pDC, macroTile, tileAlignedX >> KNOB_TILE_X_DIM_SHIFT , tileAlignedY >> KNOB_TILE_Y_DIM_SHIFT,
309         renderBuffers, triDesc.triFlags.renderTargetArrayIndex);
310 
311     AR_BEGIN(BEPixelBackend, pDC->drawId);
312     backendFuncs.pfnBackend(pDC, workerId, tileAlignedX, tileAlignedY, triDesc, renderBuffers);
313     AR_END(BEPixelBackend, 0);
314 }
315 
RasterizeTriPoint(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t macroTile,void * pData)316 void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData)
317 {
318     const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData;
319     const SWR_RASTSTATE& rastState = pDC->pState->state.rastState;
320     const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
321 
322     bool isPointSpriteTexCoordEnabled = backendState.pointSpriteTexCoordMask != 0;
323 
324     // load point vertex
325     float x = *workDesc.pTriBuffer;
326     float y = *(workDesc.pTriBuffer + 1);
327     float z = *(workDesc.pTriBuffer + 2);
328 
329     // create a copy of the triangle buffer to write our adjusted vertices to
330     OSALIGNSIMD(float) newTriBuffer[4 * 4];
331     TRIANGLE_WORK_DESC newWorkDesc = workDesc;
332     newWorkDesc.pTriBuffer = &newTriBuffer[0];
333 
334     // create a copy of the attrib buffer to write our adjusted attribs to
335     OSALIGNSIMD(float) newAttribBuffer[4 * 3 * SWR_VTX_NUM_SLOTS];
336     newWorkDesc.pAttribs = &newAttribBuffer[0];
337 
338     newWorkDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
339     newWorkDesc.numAttribs = workDesc.numAttribs;
340     newWorkDesc.triFlags = workDesc.triFlags;
341 
342     // construct two tris by bloating point by point size
343     float halfPointSize = workDesc.triFlags.pointSize * 0.5f;
344     float lowerX = x - halfPointSize;
345     float upperX = x + halfPointSize;
346     float lowerY = y - halfPointSize;
347     float upperY = y + halfPointSize;
348 
349     // tri 0
350     float *pBuf = &newTriBuffer[0];
351     *pBuf++ = lowerX;
352     *pBuf++ = lowerX;
353     *pBuf++ = upperX;
354     pBuf++;
355     *pBuf++ = lowerY;
356     *pBuf++ = upperY;
357     *pBuf++ = upperY;
358     pBuf++;
359     _mm_store_ps(pBuf, _mm_set1_ps(z));
360     _mm_store_ps(pBuf += 4, _mm_set1_ps(1.0f));
361 
362     // setup triangle rasterizer function
363     PFN_WORK_FUNC pfnTriRast;
364     // conservative rast not supported for points/lines
365     pfnTriRast = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, false,
366         SWR_INPUT_COVERAGE_NONE, EdgeValToEdgeState(ALL_EDGES_VALID), (pDC->pState->state.scissorsTileAligned == false));
367 
368     // overwrite texcoords for point sprites
369     if (isPointSpriteTexCoordEnabled)
370     {
371         // copy original attribs
372         memcpy(&newAttribBuffer[0], workDesc.pAttribs, 4 * 3 * workDesc.numAttribs * sizeof(float));
373         newWorkDesc.pAttribs = &newAttribBuffer[0];
374 
375         // overwrite texcoord for point sprites
376         uint32_t texCoordMask = backendState.pointSpriteTexCoordMask;
377         DWORD texCoordAttrib = 0;
378 
379         while (_BitScanForward(&texCoordAttrib, texCoordMask))
380         {
381             texCoordMask &= ~(1 << texCoordAttrib);
382             __m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib;
383             if (rastState.pointSpriteTopOrigin)
384             {
385                 pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0);
386                 pTexAttrib[1] = _mm_set_ps(1, 0, 1, 0);
387                 pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1);
388             }
389             else
390             {
391                 pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0);
392                 pTexAttrib[1] = _mm_set_ps(1, 0, 0, 0);
393                 pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1);
394             }
395         }
396     }
397     else
398     {
399         // no texcoord overwrite, can reuse the attrib buffer from frontend
400         newWorkDesc.pAttribs = workDesc.pAttribs;
401     }
402 
403     pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
404 
405     // tri 1
406     pBuf = &newTriBuffer[0];
407     *pBuf++ = lowerX;
408     *pBuf++ = upperX;
409     *pBuf++ = upperX;
410     pBuf++;
411     *pBuf++ = lowerY;
412     *pBuf++ = upperY;
413     *pBuf++ = lowerY;
414     // z, w unchanged
415 
416     if (isPointSpriteTexCoordEnabled)
417     {
418         uint32_t texCoordMask = backendState.pointSpriteTexCoordMask;
419         DWORD texCoordAttrib = 0;
420 
421         while (_BitScanForward(&texCoordAttrib, texCoordMask))
422         {
423             texCoordMask &= ~(1 << texCoordAttrib);
424             __m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib;
425             if (rastState.pointSpriteTopOrigin)
426             {
427                 pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0);
428                 pTexAttrib[1] = _mm_set_ps(1, 0, 1, 1);
429                 pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1);
430 
431             }
432             else
433             {
434                 pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0);
435                 pTexAttrib[1] = _mm_set_ps(1, 0, 0, 1);
436                 pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1);
437             }
438         }
439     }
440 
441     pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
442 }
443 
//////////////////////////////////////////////////////////////////////////
/// @brief Entry point for rasterizer setup; simply forwards to
///        InitRasterizerFuncs().
/// NOTE(review): InitRasterizerFuncs() is not defined in this file - presumably it
/// comes from backends/gen_rasterizer.hpp and fills gRasterizerFuncs; confirm.
void InitRasterizerFunctions()
{
    InitRasterizerFuncs();
}
448 
449 // Selector for correct templated RasterizeTriangle function
GetRasterizerFunc(SWR_MULTISAMPLE_COUNT numSamples,bool IsCenter,bool IsConservative,SWR_INPUT_COVERAGE InputCoverage,uint32_t EdgeEnable,bool RasterizeScissorEdges)450 PFN_WORK_FUNC GetRasterizerFunc(
451     SWR_MULTISAMPLE_COUNT numSamples,
452     bool IsCenter,
453     bool IsConservative,
454     SWR_INPUT_COVERAGE InputCoverage,
455     uint32_t EdgeEnable,
456     bool RasterizeScissorEdges
457 )
458 {
459     SWR_ASSERT(numSamples >= 0 && numSamples < SWR_MULTISAMPLE_TYPE_COUNT);
460     SWR_ASSERT(InputCoverage >= 0 && InputCoverage < SWR_INPUT_COVERAGE_COUNT);
461     SWR_ASSERT(EdgeEnable < STATE_VALID_TRI_EDGE_COUNT);
462 
463     PFN_WORK_FUNC func = gRasterizerFuncs[numSamples][IsCenter][IsConservative][InputCoverage][EdgeEnable][RasterizeScissorEdges];
464     SWR_ASSERT(func);
465 
466     return func;
467 }
468