1 
2 /* FF is big and ugly so feel free to write lines as long as you like.
3  * Aieeeeeeeee !
4  *
5  * Let me make that clearer:
6  * Aieeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee ! !! !!!
7  */
8 
9 #include "device9.h"
10 #include "basetexture9.h"
11 #include "vertexdeclaration9.h"
12 #include "vertexshader9.h"
13 #include "pixelshader9.h"
14 #include "nine_ff.h"
15 #include "nine_defines.h"
16 #include "nine_helpers.h"
17 #include "nine_pipe.h"
18 #include "nine_dump.h"
19 
20 #include "pipe/p_context.h"
21 #include "tgsi/tgsi_ureg.h"
22 #include "tgsi/tgsi_dump.h"
23 #include "util/u_box.h"
24 #include "util/u_hash_table.h"
25 #include "util/u_upload_mgr.h"
26 
/* Debug channel selected for this file.
 * NOTE(review): presumably consumed by the DBG() helpers from the nine
 * debug headers - confirm against nine_debug.h. */
#define DBG_CHANNEL DBG_FF

/* Sizes of the fixed-function constant ranges.
 * The VS constant layout documented further down in this file uses
 * indices 0..195, i.e. 196 slots.  The PS layout is not visible in this
 * part of the file. */
#define NINE_FF_NUM_VS_CONST 196
#define NINE_FF_NUM_PS_CONST 24
31 
/* Four-component float vector (x, y, z, w), stored in that order. */
struct fvec4
{
    float x;
    float y;
    float z;
    float w;
};
36 
/* Key selecting one variant of the generated fixed-function vertex shader.
 *
 * nine_ff_build_vs() reads these fields to decide which inputs, outputs and
 * instructions to emit.  The named bit-fields share storage with value32[]
 * and value64[], which nine_ff_vs_key_hash() and nine_ff_vs_key_comp() use
 * to hash and compare whole keys.  The declared widths therefore have to
 * keep adding up to 6 x 32 bits; adjust the pad fields when adding flags.
 */
struct nine_ff_vs_key
{
    union {
        struct {
            /* --- word 0: 28 bits of state + 4 bits of padding --- */
            uint32_t position_t : 1; /* position input is declared as POSITIONT instead of POSITION */
            uint32_t lighting   : 1;
            uint32_t darkness   : 1; /* lighting enabled but no active lights */
            uint32_t localviewer : 1;
            uint32_t vertexpointsize : 1; /* a PSIZE vertex input is declared and clamped */
            uint32_t pointscale : 1; /* point size is scaled using the view-space position */
            uint32_t vertexblend : 3; /* number of world matrices blended (loop bound in nine_ff_build_vs) */
            uint32_t vertexblend_indexed : 1; /* matrices are addressed through the BLENDINDICES input */
            uint32_t vertextween : 1; /* POSITION1/NORMAL1 inputs are declared and interpolated with CONST[30].x */
            uint32_t mtl_diffuse : 2; /* 0 = material, 1 = color1, 2 = color2 */
            uint32_t mtl_ambient : 2;
            uint32_t mtl_specular : 2;
            uint32_t mtl_emissive : 2;
            uint32_t fog_mode : 2;
            uint32_t fog_range : 1;
            uint32_t color0in_one : 1; /* COLOR0 input is not declared; the immediate 1.0 is used instead */
            uint32_t color1in_zero : 1; /* COLOR1 input is not declared; the immediate 0.0 is used instead */
            uint32_t has_normal : 1; /* allows the NORMAL input to be declared when it is needed */
            uint32_t fog : 1; /* a FOG output is declared */
            uint32_t normalizenormals : 1; /* transformed normal is re-normalized */
            uint32_t ucp : 1;
            uint32_t pad1 : 4;
            /* --- word 1 --- */
            uint32_t tc_dim_input: 16; /* 8 * 2 bits, each storing (input dimension - 1) */
            uint32_t pad2 : 16;
            /* --- word 2 --- */
            uint32_t tc_dim_output: 24; /* 8 * 3 bits */
            uint32_t pad3 : 8;
            /* --- word 3 --- */
            uint32_t tc_gen : 24; /* 8 * 3 bits */
            uint32_t pad4 : 8;
            /* --- word 4 --- */
            uint32_t tc_idx : 24; /* 8 * 3 bits, texcoord input index per stage */
            uint32_t pad5 : 8;
            /* --- word 5: bitmask tested with (1 << NINE_DECLUSAGE_*) --- */
            uint32_t passthrough;
        };
        /* Whole-key views: value64 is used for comparison, value32 for hashing. */
        uint64_t value64[3]; /* don't forget to resize VertexShader9.ff_key */
        uint32_t value32[6];
    };
};
77 
/* Texture stage state, as packed into each ts[] entry of the key below:
 *
 * COLOROP       D3DTOP 5 bit
 * ALPHAOP       D3DTOP 5 bit
 * COLORARG0     D3DTA  3 bit
 * COLORARG1     D3DTA  3 bit
 * COLORARG2     D3DTA  3 bit
 * ALPHAARG0     D3DTA  3 bit
 * ALPHAARG1     D3DTA  3 bit
 * ALPHAARG2     D3DTA  3 bit
 * RESULTARG     D3DTA  1 bit (CURRENT:0 or TEMP:1)
 * TEXTARGET            2 bit (1D/2D/3D/CUBE)
 * (padding)            1 bit
 * ===========================
 *                     32 bit per stage
 *
 * Note: TEXCOORDINDEX is not stored in the per-stage word.
 */
/* Key selecting one variant of the fixed-function pixel shader.
 *
 * The named fields share storage with value32[] and value64[], which
 * nine_ff_ps_key_hash() and nine_ff_ps_key_comp() use to hash and compare
 * whole keys.  Layout: 8 stage words + 1 flag word + 12 bytes of argument
 * bytes and padding = 12 x 32 bits.
 */
struct nine_ff_ps_key
{
    union {
        struct {
            struct {
                uint32_t colorop   : 5;
                uint32_t alphaop   : 5;
                uint32_t colorarg0 : 3;
                uint32_t colorarg1 : 3;
                uint32_t colorarg2 : 3;
                uint32_t alphaarg0 : 3;
                uint32_t alphaarg1 : 3;
                uint32_t alphaarg2 : 3;
                uint32_t resultarg : 1; /* CURRENT:0 or TEMP:1 */
                uint32_t textarget : 2; /* 1D/2D/3D/CUBE */
                uint32_t pad       : 1;
                /* that's 32 bit exactly */
            } ts[8];
            uint32_t projected : 16;
            uint32_t fog : 1; /* for vFog coming from VS */
            uint32_t fog_mode : 2;
            uint32_t fog_source : 1; /* 0: Z, 1: W */
            uint32_t specular : 1;
            uint32_t pad1 : 11; /* 9 32-bit words with this */
            uint8_t colorarg_b4[3];
            uint8_t colorarg_b5[3];
            uint8_t alphaarg_b4[3]; /* 11 32-bit words plus a byte */
            uint8_t pad2[3]; /* rounds the key up to 12 32-bit words */
        };
        /* Whole-key views: value64 is used for comparison, value32 for hashing. */
        uint64_t value64[6]; /* don't forget to resize PixelShader9.ff_key */
        uint32_t value32[12];
    };
};
126 
nine_ff_vs_key_hash(void * key)127 static unsigned nine_ff_vs_key_hash(void *key)
128 {
129     struct nine_ff_vs_key *vs = key;
130     unsigned i;
131     uint32_t hash = vs->value32[0];
132     for (i = 1; i < ARRAY_SIZE(vs->value32); ++i)
133         hash ^= vs->value32[i];
134     return hash;
135 }
nine_ff_vs_key_comp(void * key1,void * key2)136 static int nine_ff_vs_key_comp(void *key1, void *key2)
137 {
138     struct nine_ff_vs_key *a = (struct nine_ff_vs_key *)key1;
139     struct nine_ff_vs_key *b = (struct nine_ff_vs_key *)key2;
140 
141     return memcmp(a->value64, b->value64, sizeof(a->value64));
142 }
nine_ff_ps_key_hash(void * key)143 static unsigned nine_ff_ps_key_hash(void *key)
144 {
145     struct nine_ff_ps_key *ps = key;
146     unsigned i;
147     uint32_t hash = ps->value32[0];
148     for (i = 1; i < ARRAY_SIZE(ps->value32); ++i)
149         hash ^= ps->value32[i];
150     return hash;
151 }
nine_ff_ps_key_comp(void * key1,void * key2)152 static int nine_ff_ps_key_comp(void *key1, void *key2)
153 {
154     struct nine_ff_ps_key *a = (struct nine_ff_ps_key *)key1;
155     struct nine_ff_ps_key *b = (struct nine_ff_ps_key *)key2;
156 
157     return memcmp(a->value64, b->value64, sizeof(a->value64));
158 }
nine_ff_fvf_key_hash(void * key)159 static unsigned nine_ff_fvf_key_hash(void *key)
160 {
161     return *(DWORD *)key;
162 }
nine_ff_fvf_key_comp(void * key1,void * key2)163 static int nine_ff_fvf_key_comp(void *key1, void *key2)
164 {
165     return *(DWORD *)key1 != *(DWORD *)key2;
166 }
167 
/* Forward declarations of the file-local pruning helpers for the
 * fixed-function vertex and pixel shaders (defined elsewhere in this file). */
static void nine_ff_prune_vs(struct NineDevice9 *device);
static void nine_ff_prune_ps(struct NineDevice9 *device);
170 
/* Dump the TGSI tokens of a ureg program.
 *
 * The dump happens when the NINE_FF_DUMP debug option is set or when
 * override is true; otherwise nothing is done.  The token list obtained
 * from the program is freed again after dumping.
 */
static void nine_ureg_tgsi_dump(struct ureg_program *ureg, boolean override)
{
    const struct tgsi_token *tokens;

    if (!debug_get_bool_option("NINE_FF_DUMP", FALSE) && !override)
        return;

    tokens = ureg_get_tokens(ureg, NULL);
    tgsi_dump(tokens, 0);
    ureg_free_tokens(tokens);
}
179 
/* Single-component selectors for a destination register: r is a
 * struct ureg_dst, which is converted with ureg_src() before one
 * component is selected with ureg_scalar(). */
#define _X(r) ureg_scalar(ureg_src(r), TGSI_SWIZZLE_X)
#define _Y(r) ureg_scalar(ureg_src(r), TGSI_SWIZZLE_Y)
#define _Z(r) ureg_scalar(ureg_src(r), TGSI_SWIZZLE_Z)
#define _W(r) ureg_scalar(ureg_src(r), TGSI_SWIZZLE_W)

/* Same selectors for an operand that already is a struct ureg_src. */
#define _XXXX(r) ureg_scalar(r, TGSI_SWIZZLE_X)
#define _YYYY(r) ureg_scalar(r, TGSI_SWIZZLE_Y)
#define _ZZZZ(r) ureg_scalar(r, TGSI_SWIZZLE_Z)
#define _WWWW(r) ureg_scalar(r, TGSI_SWIZZLE_W)

/* Identity: use the source operand unchanged. */
#define _XYZW(r) (r)

/* The macros below expand to code that uses a variable named `ureg`
 * (and `AL` for LIGHT_CONST), which must be in scope where they are used. */

/* AL should contain base address of lights table. */
/* Constant slot i, addressed indirectly through AL.x. */
#define LIGHT_CONST(i)                                                \
    ureg_src_indirect(ureg_DECL_constant(ureg, i), _X(AL))

/* Material constant i, stored at constant slot 19 + i. */
#define MATERIAL_CONST(i) \
    ureg_DECL_constant(ureg, 19 + (i))

/* Declares (and yields) constant slot n. */
#define _CONST(n) ureg_DECL_constant(ureg, n)
200 
201 /* VS FF constants layout:
202  *
203  * CONST[ 0.. 3] D3DTS_WORLD * D3DTS_VIEW * D3DTS_PROJECTION
204  * CONST[ 4.. 7] D3DTS_WORLD * D3DTS_VIEW
205  * CONST[ 8..11] D3DTS_PROJECTION
206  * CONST[12..15] D3DTS_VIEW^(-1)
207  * CONST[16..18] Normal matrix
208  *
209  * CONST[19].xyz  MATERIAL.Emissive + Material.Ambient * RS.Ambient
210  * CONST[20]      MATERIAL.Diffuse
211  * CONST[21]      MATERIAL.Ambient
212  * CONST[22]      MATERIAL.Specular
213  * CONST[23].x___ MATERIAL.Power
214  * CONST[24]      MATERIAL.Emissive
215  * CONST[25]      RS.Ambient
216  *
217  * CONST[26].x___ RS.PointSizeMin
218  * CONST[26]._y__ RS.PointSizeMax
219  * CONST[26].__z_ RS.PointSize
220  * CONST[26].___w RS.PointScaleA
221  * CONST[27].x___ RS.PointScaleB
222  * CONST[27]._y__ RS.PointScaleC
223  *
224  * CONST[28].x___ RS.FogEnd
225  * CONST[28]._y__ 1.0f / (RS.FogEnd - RS.FogStart)
226  * CONST[28].__z_ RS.FogDensity
 *
228  * CONST[30].x___ TWEENFACTOR
229  *
230  * CONST[32].x___ LIGHT[0].Type
231  * CONST[32]._yzw LIGHT[0].Attenuation0,1,2
232  * CONST[33]      LIGHT[0].Diffuse
233  * CONST[34]      LIGHT[0].Specular
234  * CONST[35]      LIGHT[0].Ambient
235  * CONST[36].xyz_ LIGHT[0].Position
236  * CONST[36].___w LIGHT[0].Range
237  * CONST[37].xyz_ LIGHT[0].Direction
238  * CONST[37].___w LIGHT[0].Falloff
239  * CONST[38].x___ cos(LIGHT[0].Theta / 2)
240  * CONST[38]._y__ cos(LIGHT[0].Phi / 2)
241  * CONST[38].__z_ 1.0f / (cos(LIGHT[0].Theta / 2) - cos(Light[0].Phi / 2))
242  * CONST[39].xyz_ LIGHT[0].HalfVector (for directional lights)
243  * CONST[39].___w 1 if this is the last active light, 0 if not
244  * CONST[40]      LIGHT[1]
245  * CONST[48]      LIGHT[2]
246  * CONST[56]      LIGHT[3]
247  * CONST[64]      LIGHT[4]
248  * CONST[72]      LIGHT[5]
249  * CONST[80]      LIGHT[6]
250  * CONST[88]      LIGHT[7]
251  * NOTE: no lighting code is generated if there are no active lights
252  *
253  * CONST[100].x___ Viewport 2/width
254  * CONST[100]._y__ Viewport 2/height
255  * CONST[100].__z_ Viewport 1/(zmax - zmin)
256  * CONST[100].___w Viewport width
257  * CONST[101].x___ Viewport x0
258  * CONST[101]._y__ Viewport y0
259  * CONST[101].__z_ Viewport z0
260  *
261  * CONST[128..131] D3DTS_TEXTURE0
262  * CONST[132..135] D3DTS_TEXTURE1
263  * CONST[136..139] D3DTS_TEXTURE2
264  * CONST[140..143] D3DTS_TEXTURE3
265  * CONST[144..147] D3DTS_TEXTURE4
266  * CONST[148..151] D3DTS_TEXTURE5
267  * CONST[152..155] D3DTS_TEXTURE6
268  * CONST[156..159] D3DTS_TEXTURE7
269  *
270  * CONST[160] D3DTS_WORLDMATRIX[0] * D3DTS_VIEW
271  * CONST[164] D3DTS_WORLDMATRIX[1] * D3DTS_VIEW
272  * ...
273  * CONST[192] D3DTS_WORLDMATRIX[8] * D3DTS_VIEW
274  */
/* Working state of nine_ff_build_vs() while it generates one vertex shader. */
struct vs_build_ctx
{
    struct ureg_program *ureg; /* program being built; set by nine_ff_build_vs() */
    const struct nine_ff_vs_key *key; /* variant to generate */

    /* NINE_DECLUSAGE value recorded for each declared VS input, indexed by
     * input number; num_inputs entries are valid (see build_vs_add_input). */
    uint16_t input[PIPE_MAX_ATTRIBS];
    unsigned num_inputs;

    /* Source operands for the vertex attributes.  aVtx and aNrm start out as
     * the raw inputs and may later be replaced by transformed temporaries. */
    struct ureg_src aVtx;
    struct ureg_src aNrm;
    struct ureg_src aCol[2];
    struct ureg_src aTex[8];
    struct ureg_src aPsz;
    struct ureg_src aInd; /* blend indices */
    struct ureg_src aWgt; /* blend weights */

    struct ureg_src aVtx1; /* tweening */
    struct ureg_src aNrm1;

    /* Material ambient/diffuse/specular/emissive sources: either a material
     * constant or one of aCol[], depending on the key's mtl_* fields. */
    struct ureg_src mtlA;
    struct ureg_src mtlD;
    struct ureg_src mtlS;
    struct ureg_src mtlE;
};
299 
300 static inline unsigned
get_texcoord_sn(struct pipe_screen * screen)301 get_texcoord_sn(struct pipe_screen *screen)
302 {
303     if (screen->get_param(screen, PIPE_CAP_TGSI_TEXCOORD))
304         return TGSI_SEMANTIC_TEXCOORD;
305     return TGSI_SEMANTIC_GENERIC;
306 }
307 
308 static inline struct ureg_src
build_vs_add_input(struct vs_build_ctx * vs,uint16_t ndecl)309 build_vs_add_input(struct vs_build_ctx *vs, uint16_t ndecl)
310 {
311     const unsigned i = vs->num_inputs++;
312     assert(i < PIPE_MAX_ATTRIBS);
313     vs->input[i] = ndecl;
314     return ureg_DECL_vs_input(vs->ureg, i);
315 }
316 
317 /* NOTE: dst may alias src */
318 static inline void
ureg_normalize3(struct ureg_program * ureg,struct ureg_dst dst,struct ureg_src src)319 ureg_normalize3(struct ureg_program *ureg,
320                 struct ureg_dst dst, struct ureg_src src)
321 {
322     struct ureg_dst tmp = ureg_DECL_temporary(ureg);
323     struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
324 
325     ureg_DP3(ureg, tmp_x, src, src);
326     ureg_RSQ(ureg, tmp_x, _X(tmp));
327     ureg_MUL(ureg, dst, src, _X(tmp));
328     ureg_release_temporary(ureg, tmp);
329 }
330 
331 static void *
nine_ff_build_vs(struct NineDevice9 * device,struct vs_build_ctx * vs)332 nine_ff_build_vs(struct NineDevice9 *device, struct vs_build_ctx *vs)
333 {
334     const struct nine_ff_vs_key *key = vs->key;
335     struct ureg_program *ureg = ureg_create(PIPE_SHADER_VERTEX);
336     struct ureg_dst oPos, oCol[2], oPsz, oFog;
337     struct ureg_dst AR;
338     unsigned i, c;
339     unsigned label[32], l = 0;
340     boolean need_aNrm = key->lighting || key->passthrough & (1 << NINE_DECLUSAGE_NORMAL);
341     boolean has_aNrm = need_aNrm && key->has_normal;
342     boolean need_aVtx = key->lighting || key->fog_mode || key->pointscale || key->ucp;
343     const unsigned texcoord_sn = get_texcoord_sn(device->screen);
344 
345     vs->ureg = ureg;
346 
347     /* Check which inputs we should transform. */
348     for (i = 0; i < 8 * 3; i += 3) {
349         switch ((key->tc_gen >> i) & 0x7) {
350         case NINED3DTSS_TCI_CAMERASPACENORMAL:
351             need_aNrm = TRUE;
352             break;
353         case NINED3DTSS_TCI_CAMERASPACEPOSITION:
354             need_aVtx = TRUE;
355             break;
356         case NINED3DTSS_TCI_CAMERASPACEREFLECTIONVECTOR:
357             need_aVtx = need_aNrm = TRUE;
358             break;
359         case NINED3DTSS_TCI_SPHEREMAP:
360             need_aVtx = need_aNrm = TRUE;
361             break;
362         default:
363             break;
364         }
365     }
366 
367     /* Declare and record used inputs (needed for linkage with vertex format):
368      * (texture coordinates handled later)
369      */
370     vs->aVtx = build_vs_add_input(vs,
371         key->position_t ? NINE_DECLUSAGE_POSITIONT : NINE_DECLUSAGE_POSITION);
372 
373     vs->aNrm = ureg_imm1f(ureg, 0.0f);
374     if (has_aNrm)
375         vs->aNrm = build_vs_add_input(vs, NINE_DECLUSAGE_NORMAL);
376 
377     vs->aCol[0] = ureg_imm1f(ureg, 1.0f);
378     vs->aCol[1] = ureg_imm1f(ureg, 0.0f);
379 
380     if (key->lighting || key->darkness) {
381         const unsigned mask = key->mtl_diffuse | key->mtl_specular |
382                               key->mtl_ambient | key->mtl_emissive;
383         if ((mask & 0x1) && !key->color0in_one)
384             vs->aCol[0] = build_vs_add_input(vs, NINE_DECLUSAGE_i(COLOR, 0));
385         if ((mask & 0x2) && !key->color1in_zero)
386             vs->aCol[1] = build_vs_add_input(vs, NINE_DECLUSAGE_i(COLOR, 1));
387 
388         vs->mtlD = MATERIAL_CONST(1);
389         vs->mtlA = MATERIAL_CONST(2);
390         vs->mtlS = MATERIAL_CONST(3);
391         vs->mtlE = MATERIAL_CONST(5);
392         if (key->mtl_diffuse  == 1) vs->mtlD = vs->aCol[0]; else
393         if (key->mtl_diffuse  == 2) vs->mtlD = vs->aCol[1];
394         if (key->mtl_ambient  == 1) vs->mtlA = vs->aCol[0]; else
395         if (key->mtl_ambient  == 2) vs->mtlA = vs->aCol[1];
396         if (key->mtl_specular == 1) vs->mtlS = vs->aCol[0]; else
397         if (key->mtl_specular == 2) vs->mtlS = vs->aCol[1];
398         if (key->mtl_emissive == 1) vs->mtlE = vs->aCol[0]; else
399         if (key->mtl_emissive == 2) vs->mtlE = vs->aCol[1];
400     } else {
401         if (!key->color0in_one) vs->aCol[0] = build_vs_add_input(vs, NINE_DECLUSAGE_i(COLOR, 0));
402         if (!key->color1in_zero) vs->aCol[1] = build_vs_add_input(vs, NINE_DECLUSAGE_i(COLOR, 1));
403     }
404 
405     if (key->vertexpointsize)
406         vs->aPsz = build_vs_add_input(vs, NINE_DECLUSAGE_PSIZE);
407 
408     if (key->vertexblend_indexed || key->passthrough & (1 << NINE_DECLUSAGE_BLENDINDICES))
409         vs->aInd = build_vs_add_input(vs, NINE_DECLUSAGE_BLENDINDICES);
410     if (key->vertexblend || key->passthrough & (1 << NINE_DECLUSAGE_BLENDWEIGHT))
411         vs->aWgt = build_vs_add_input(vs, NINE_DECLUSAGE_BLENDWEIGHT);
412     if (key->vertextween) {
413         vs->aVtx1 = build_vs_add_input(vs, NINE_DECLUSAGE_i(POSITION,1));
414         vs->aNrm1 = build_vs_add_input(vs, NINE_DECLUSAGE_i(NORMAL,1));
415     }
416 
417     /* Declare outputs:
418      */
419     oPos = ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0); /* HPOS */
420     oCol[0] = ureg_saturate(ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 0));
421     oCol[1] = ureg_saturate(ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 1));
422     if (key->fog || key->passthrough & (1 << NINE_DECLUSAGE_FOG)) {
423         oFog = ureg_DECL_output(ureg, TGSI_SEMANTIC_FOG, 0);
424         oFog = ureg_writemask(oFog, TGSI_WRITEMASK_X);
425     }
426 
427     if (key->vertexpointsize || key->pointscale) {
428         oPsz = ureg_DECL_output_masked(ureg, TGSI_SEMANTIC_PSIZE, 0,
429                                        TGSI_WRITEMASK_X, 0, 1);
430         oPsz = ureg_writemask(oPsz, TGSI_WRITEMASK_X);
431     }
432 
433     if (key->lighting || key->vertexblend)
434         AR = ureg_DECL_address(ureg);
435 
436     /* === Vertex transformation / vertex blending:
437      */
438 
439     if (key->position_t) {
440         if (device->driver_caps.window_space_position_support) {
441             ureg_MOV(ureg, oPos, vs->aVtx);
442         } else {
443             struct ureg_dst tmp = ureg_DECL_temporary(ureg);
444             /* vs->aVtx contains the coordinates buffer wise.
445             * later in the pipeline, clipping, viewport and division
446             * by w (rhw = 1/w) are going to be applied, so do the reverse
447             * of these transformations (except clipping) to have the good
448             * position at the end.*/
449             ureg_MOV(ureg, tmp, vs->aVtx);
450             /* X from [X_min, X_min + width] to [-1, 1], same for Y. Z to [0, 1] */
451             ureg_ADD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(tmp), ureg_negate(_CONST(101)));
452             ureg_MUL(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(tmp), _CONST(100));
453             ureg_ADD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XY), ureg_src(tmp), ureg_imm1f(ureg, -1.0f));
454             /* Y needs to be reversed */
455             ureg_MOV(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_Y), ureg_negate(ureg_src(tmp)));
456             /* inverse rhw */
457             ureg_RCP(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_W), _W(tmp));
458             /* multiply X, Y, Z by w */
459             ureg_MUL(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(tmp), _W(tmp));
460             ureg_MOV(ureg, oPos, ureg_src(tmp));
461             ureg_release_temporary(ureg, tmp);
462         }
463     } else if (key->vertexblend) {
464         struct ureg_dst tmp = ureg_DECL_temporary(ureg);
465         struct ureg_dst tmp2 = ureg_DECL_temporary(ureg);
466         struct ureg_dst aVtx_dst = ureg_DECL_temporary(ureg);
467         struct ureg_dst aNrm_dst = ureg_DECL_temporary(ureg);
468         struct ureg_dst sum_blendweights = ureg_DECL_temporary(ureg);
469         struct ureg_src cWM[4];
470 
471         for (i = 160; i <= 195; ++i)
472             ureg_DECL_constant(ureg, i);
473 
474         /* translate world matrix index to constant file index */
475         if (key->vertexblend_indexed) {
476             ureg_MAD(ureg, tmp, vs->aInd, ureg_imm1f(ureg, 4.0f), ureg_imm1f(ureg, 160.0f));
477             ureg_ARL(ureg, AR, ureg_src(tmp));
478         }
479 
480         ureg_MOV(ureg, aVtx_dst, ureg_imm4f(ureg, 0.0f, 0.0f, 0.0f, 0.0f));
481         ureg_MOV(ureg, aNrm_dst, ureg_imm4f(ureg, 0.0f, 0.0f, 0.0f, 0.0f));
482         ureg_MOV(ureg, sum_blendweights, ureg_imm4f(ureg, 1.0f, 1.0f, 1.0f, 1.0f));
483 
484         for (i = 0; i < key->vertexblend; ++i) {
485             for (c = 0; c < 4; ++c) {
486                 cWM[c] = ureg_src_dimension(ureg_src_register(TGSI_FILE_CONSTANT, (160 + i * 4) * !key->vertexblend_indexed + c), 0);
487                 if (key->vertexblend_indexed)
488                     cWM[c] = ureg_src_indirect(cWM[c], ureg_scalar(ureg_src(AR), i));
489             }
490 
491             /* multiply by WORLD(index) */
492             ureg_MUL(ureg, tmp, _XXXX(vs->aVtx), cWM[0]);
493             ureg_MAD(ureg, tmp, _YYYY(vs->aVtx), cWM[1], ureg_src(tmp));
494             ureg_MAD(ureg, tmp, _ZZZZ(vs->aVtx), cWM[2], ureg_src(tmp));
495             ureg_MAD(ureg, tmp, _WWWW(vs->aVtx), cWM[3], ureg_src(tmp));
496 
497             if (has_aNrm) {
498                 /* Note: the spec says the transpose of the inverse of the
499                  * WorldView matrices should be used, but all tests show
500                  * otherwise.
501                  * Only case unknown: D3DVBF_0WEIGHTS */
502                 ureg_MUL(ureg, tmp2, _XXXX(vs->aNrm), cWM[0]);
503                 ureg_MAD(ureg, tmp2, _YYYY(vs->aNrm), cWM[1], ureg_src(tmp2));
504                 ureg_MAD(ureg, tmp2, _ZZZZ(vs->aNrm), cWM[2], ureg_src(tmp2));
505             }
506 
507             if (i < (key->vertexblend - 1)) {
508                 /* accumulate weighted position value */
509                 ureg_MAD(ureg, aVtx_dst, ureg_src(tmp), ureg_scalar(vs->aWgt, i), ureg_src(aVtx_dst));
510                 if (has_aNrm)
511                     ureg_MAD(ureg, aNrm_dst, ureg_src(tmp2), ureg_scalar(vs->aWgt, i), ureg_src(aNrm_dst));
512                 /* subtract weighted position value for last value */
513                 ureg_ADD(ureg, sum_blendweights, ureg_src(sum_blendweights), ureg_negate(ureg_scalar(vs->aWgt, i)));
514             }
515         }
516 
517         /* the last weighted position is always 1 - sum_of_previous_weights */
518         ureg_MAD(ureg, aVtx_dst, ureg_src(tmp), ureg_scalar(ureg_src(sum_blendweights), key->vertexblend - 1), ureg_src(aVtx_dst));
519         if (has_aNrm)
520             ureg_MAD(ureg, aNrm_dst, ureg_src(tmp2), ureg_scalar(ureg_src(sum_blendweights), key->vertexblend - 1), ureg_src(aNrm_dst));
521 
522         /* multiply by VIEW_PROJ */
523         ureg_MUL(ureg, tmp, _X(aVtx_dst), _CONST(8));
524         ureg_MAD(ureg, tmp, _Y(aVtx_dst), _CONST(9),  ureg_src(tmp));
525         ureg_MAD(ureg, tmp, _Z(aVtx_dst), _CONST(10), ureg_src(tmp));
526         ureg_MAD(ureg, oPos, _W(aVtx_dst), _CONST(11), ureg_src(tmp));
527 
528         if (need_aVtx)
529             vs->aVtx = ureg_src(aVtx_dst);
530 
531         ureg_release_temporary(ureg, tmp);
532         ureg_release_temporary(ureg, tmp2);
533         ureg_release_temporary(ureg, sum_blendweights);
534         if (!need_aVtx)
535             ureg_release_temporary(ureg, aVtx_dst);
536 
537         if (has_aNrm) {
538             if (key->normalizenormals)
539                ureg_normalize3(ureg, aNrm_dst, ureg_src(aNrm_dst));
540             vs->aNrm = ureg_src(aNrm_dst);
541         } else
542             ureg_release_temporary(ureg, aNrm_dst);
543     } else {
544         struct ureg_dst tmp = ureg_DECL_temporary(ureg);
545 
546         if (key->vertextween) {
547             struct ureg_dst aVtx_dst = ureg_DECL_temporary(ureg);
548             ureg_LRP(ureg, aVtx_dst, _XXXX(_CONST(30)), vs->aVtx1, vs->aVtx);
549             vs->aVtx = ureg_src(aVtx_dst);
550             if (has_aNrm) {
551                 struct ureg_dst aNrm_dst = ureg_DECL_temporary(ureg);
552                 ureg_LRP(ureg, aNrm_dst, _XXXX(_CONST(30)), vs->aNrm1, vs->aNrm);
553                 vs->aNrm = ureg_src(aNrm_dst);
554             }
555         }
556 
557         /* position = vertex * WORLD_VIEW_PROJ */
558         ureg_MUL(ureg, tmp, _XXXX(vs->aVtx), _CONST(0));
559         ureg_MAD(ureg, tmp, _YYYY(vs->aVtx), _CONST(1), ureg_src(tmp));
560         ureg_MAD(ureg, tmp, _ZZZZ(vs->aVtx), _CONST(2), ureg_src(tmp));
561         ureg_MAD(ureg, oPos, _WWWW(vs->aVtx), _CONST(3), ureg_src(tmp));
562         ureg_release_temporary(ureg, tmp);
563 
564         if (need_aVtx) {
565             struct ureg_dst aVtx_dst = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
566             ureg_MUL(ureg, aVtx_dst, _XXXX(vs->aVtx), _CONST(4));
567             ureg_MAD(ureg, aVtx_dst, _YYYY(vs->aVtx), _CONST(5), ureg_src(aVtx_dst));
568             ureg_MAD(ureg, aVtx_dst, _ZZZZ(vs->aVtx), _CONST(6), ureg_src(aVtx_dst));
569             ureg_MAD(ureg, aVtx_dst, _WWWW(vs->aVtx), _CONST(7), ureg_src(aVtx_dst));
570             vs->aVtx = ureg_src(aVtx_dst);
571         }
572         if (has_aNrm) {
573             struct ureg_dst aNrm_dst = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
574             ureg_MUL(ureg, aNrm_dst, _XXXX(vs->aNrm), _CONST(16));
575             ureg_MAD(ureg, aNrm_dst, _YYYY(vs->aNrm), _CONST(17), ureg_src(aNrm_dst));
576             ureg_MAD(ureg, aNrm_dst, _ZZZZ(vs->aNrm), _CONST(18), ureg_src(aNrm_dst));
577             if (key->normalizenormals)
578                ureg_normalize3(ureg, aNrm_dst, ureg_src(aNrm_dst));
579             vs->aNrm = ureg_src(aNrm_dst);
580         }
581     }
582 
583     /* === Process point size:
584      */
585     if (key->vertexpointsize || key->pointscale) {
586         struct ureg_dst tmp = ureg_DECL_temporary(ureg);
587         struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
588         struct ureg_dst tmp_y = ureg_writemask(tmp, TGSI_WRITEMASK_Y);
589         struct ureg_dst tmp_z = ureg_writemask(tmp, TGSI_WRITEMASK_Z);
590         if (key->vertexpointsize) {
591             struct ureg_src cPsz1 = ureg_DECL_constant(ureg, 26);
592             ureg_MAX(ureg, tmp_z, _XXXX(vs->aPsz), _XXXX(cPsz1));
593             ureg_MIN(ureg, tmp_z, _Z(tmp), _YYYY(cPsz1));
594         } else {
595             struct ureg_src cPsz1 = ureg_DECL_constant(ureg, 26);
596             ureg_MOV(ureg, tmp_z, _ZZZZ(cPsz1));
597         }
598 
599         if (key->pointscale) {
600             struct ureg_src cPsz1 = ureg_DECL_constant(ureg, 26);
601             struct ureg_src cPsz2 = ureg_DECL_constant(ureg, 27);
602 
603             ureg_DP3(ureg, tmp_x, vs->aVtx, vs->aVtx);
604             ureg_RSQ(ureg, tmp_y, _X(tmp));
605             ureg_MUL(ureg, tmp_y, _Y(tmp), _X(tmp));
606             ureg_CMP(ureg, tmp_y, ureg_negate(_Y(tmp)), _Y(tmp), ureg_imm1f(ureg, 0.0f));
607             ureg_MAD(ureg, tmp_x, _Y(tmp), _YYYY(cPsz2), _XXXX(cPsz2));
608             ureg_MAD(ureg, tmp_x, _Y(tmp), _X(tmp), _WWWW(cPsz1));
609             ureg_RSQ(ureg, tmp_x, _X(tmp));
610             ureg_MUL(ureg, tmp_x, _X(tmp), _Z(tmp));
611             ureg_MUL(ureg, tmp_x, _X(tmp), _WWWW(_CONST(100)));
612             ureg_MAX(ureg, tmp_x, _X(tmp), _XXXX(cPsz1));
613             ureg_MIN(ureg, tmp_z, _X(tmp), _YYYY(cPsz1));
614         }
615 
616         ureg_MOV(ureg, oPsz, _Z(tmp));
617         ureg_release_temporary(ureg, tmp);
618     }
619 
620     for (i = 0; i < 8; ++i) {
621         struct ureg_dst tmp, tmp_x, tmp2;
622         struct ureg_dst oTex, input_coord, transformed, t, aVtx_normed;
623         unsigned c, writemask;
624         const unsigned tci = (key->tc_gen >> (i * 3)) & 0x7;
625         const unsigned idx = (key->tc_idx >> (i * 3)) & 0x7;
626         unsigned dim_input = 1 + ((key->tc_dim_input >> (i * 2)) & 0x3);
627         const unsigned dim_output = (key->tc_dim_output >> (i * 3)) & 0x7;
628 
629         /* No texture output of index s */
630         if (tci == NINED3DTSS_TCI_DISABLE)
631             continue;
632         oTex = ureg_DECL_output(ureg, texcoord_sn, i);
633         tmp = ureg_DECL_temporary(ureg);
634         tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
635         input_coord = ureg_DECL_temporary(ureg);
636         transformed = ureg_DECL_temporary(ureg);
637 
638         /* Get the coordinate */
639         switch (tci) {
640         case NINED3DTSS_TCI_PASSTHRU:
641             /* NINED3DTSS_TCI_PASSTHRU => Use texcoord coming from index idx *
642              * Else the idx is used only to determine wrapping mode. */
643             vs->aTex[idx] = build_vs_add_input(vs, NINE_DECLUSAGE_i(TEXCOORD,idx));
644             ureg_MOV(ureg, input_coord, vs->aTex[idx]);
645             break;
646         case NINED3DTSS_TCI_CAMERASPACENORMAL:
647             ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_XYZ), vs->aNrm);
648             ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_W), ureg_imm1f(ureg, 1.0f));
649             dim_input = 4;
650             break;
651         case NINED3DTSS_TCI_CAMERASPACEPOSITION:
652             ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_XYZ), vs->aVtx);
653             ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_W), ureg_imm1f(ureg, 1.0f));
654             dim_input = 4;
655             break;
656         case NINED3DTSS_TCI_CAMERASPACEREFLECTIONVECTOR:
657             tmp.WriteMask = TGSI_WRITEMASK_XYZ;
658             aVtx_normed = ureg_DECL_temporary(ureg);
659             ureg_normalize3(ureg, aVtx_normed, vs->aVtx);
660             ureg_DP3(ureg, tmp_x, ureg_src(aVtx_normed), vs->aNrm);
661             ureg_MUL(ureg, tmp, vs->aNrm, _X(tmp));
662             ureg_ADD(ureg, tmp, ureg_src(tmp), ureg_src(tmp));
663             ureg_ADD(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_XYZ), ureg_src(aVtx_normed), ureg_negate(ureg_src(tmp)));
664             ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_W), ureg_imm1f(ureg, 1.0f));
665             ureg_release_temporary(ureg, aVtx_normed);
666             dim_input = 4;
667             tmp.WriteMask = TGSI_WRITEMASK_XYZW;
668             break;
669         case NINED3DTSS_TCI_SPHEREMAP:
670             /* Implement the formula of GL_SPHERE_MAP */
671             tmp.WriteMask = TGSI_WRITEMASK_XYZ;
672             aVtx_normed = ureg_DECL_temporary(ureg);
673             tmp2 = ureg_DECL_temporary(ureg);
674             ureg_normalize3(ureg, aVtx_normed, vs->aVtx);
675             ureg_DP3(ureg, tmp_x, ureg_src(aVtx_normed), vs->aNrm);
676             ureg_MUL(ureg, tmp, vs->aNrm, _X(tmp));
677             ureg_ADD(ureg, tmp, ureg_src(tmp), ureg_src(tmp));
678             ureg_ADD(ureg, tmp, ureg_src(aVtx_normed), ureg_negate(ureg_src(tmp)));
679             /* now tmp = normed(Vtx) - 2 dot3(normed(Vtx), Nrm) Nrm */
680             ureg_MOV(ureg, ureg_writemask(tmp2, TGSI_WRITEMASK_XYZ), ureg_src(tmp));
681             ureg_MUL(ureg, tmp2, ureg_src(tmp2), ureg_src(tmp2));
682             ureg_DP3(ureg, ureg_writemask(tmp2, TGSI_WRITEMASK_X), ureg_src(tmp2), ureg_src(tmp2));
683             ureg_RSQ(ureg, ureg_writemask(tmp2, TGSI_WRITEMASK_X), ureg_src(tmp2));
684             ureg_MUL(ureg, ureg_writemask(tmp2, TGSI_WRITEMASK_X), ureg_src(tmp2), ureg_imm1f(ureg, 0.5f));
685             /* tmp2 = 0.5 / sqrt(tmp.x^2 + tmp.y^2 + (tmp.z+1)^2)
686              * TODO: z coordinates are a bit different gl vs d3d, should the formula be adapted ? */
687             ureg_MUL(ureg, tmp, ureg_src(tmp), _X(tmp2));
688             ureg_ADD(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_XY), ureg_src(tmp), ureg_imm1f(ureg, 0.5f));
689             ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_ZW), ureg_imm4f(ureg, 0.0f, 0.0f, 0.0f, 1.0f));
690             ureg_release_temporary(ureg, aVtx_normed);
691             ureg_release_temporary(ureg, tmp2);
692             dim_input = 4;
693             tmp.WriteMask = TGSI_WRITEMASK_XYZW;
694             break;
695         default:
696             assert(0);
697             break;
698         }
699 
700         /* Apply the transformation */
701         /* dim_output == 0 => do not transform the components.
702          * XYZRHW also disables transformation */
703         if (!dim_output || key->position_t) {
704             ureg_release_temporary(ureg, transformed);
705             transformed = input_coord;
706             writemask = TGSI_WRITEMASK_XYZW;
707         } else {
708             for (c = 0; c < dim_output; c++) {
709                 t = ureg_writemask(transformed, 1 << c);
710                 switch (dim_input) {
711                 /* dim_input = 1 2 3: -> we add trailing 1 to input*/
712                 case 1: ureg_MAD(ureg, t, _X(input_coord), _XXXX(_CONST(128 + i * 4 + c)), _YYYY(_CONST(128 + i * 4 + c)));
713                         break;
714                 case 2: ureg_DP2(ureg, t, ureg_src(input_coord), _CONST(128 + i * 4 + c));
715                         ureg_ADD(ureg, t, ureg_src(transformed), _ZZZZ(_CONST(128 + i * 4 + c)));
716                         break;
717                 case 3: ureg_DP3(ureg, t, ureg_src(input_coord), _CONST(128 + i * 4 + c));
718                         ureg_ADD(ureg, t, ureg_src(transformed), _WWWW(_CONST(128 + i * 4 + c)));
719                         break;
720                 case 4: ureg_DP4(ureg, t, ureg_src(input_coord), _CONST(128 + i * 4 + c)); break;
721                 default:
722                     assert(0);
723                 }
724             }
725             writemask = (1 << dim_output) - 1;
726             ureg_release_temporary(ureg, input_coord);
727         }
728 
729         ureg_MOV(ureg, ureg_writemask(oTex, writemask), ureg_src(transformed));
730         ureg_release_temporary(ureg, transformed);
731         ureg_release_temporary(ureg, tmp);
732     }
733 
734     /* === Lighting:
735      *
736      * DIRECTIONAL:  Light at infinite distance, parallel rays, no attenuation.
737      * POINT: Finite distance to scene, divergent rays, isotropic, attenuation.
738      * SPOT: Finite distance, divergent rays, angular dependence, attenuation.
739      *
740      * vec3 normal = normalize(in.Normal * NormalMatrix);
741      * vec3 hitDir = light.direction;
742      * float atten = 1.0;
743      *
744      * if (light.type != DIRECTIONAL)
745      * {
746      *     vec3 hitVec = light.position - eyeVertex;
747      *     float d = length(hitVec);
748      *     hitDir = hitVec / d;
749      *     atten = 1 / ((light.atten2 * d + light.atten1) * d + light.atten0);
750      * }
751      *
752      * if (light.type == SPOTLIGHT)
753      * {
754      *     float rho = dp3(-hitVec, light.direction);
755      *     if (rho < cos(light.phi / 2))
756      *         atten = 0;
757      *     if (rho < cos(light.theta / 2))
758      *         atten *= pow(some_func(rho), light.falloff);
759      * }
760      *
761      * float nDotHit = dp3_sat(normal, hitVec);
762      * float powFact = 0.0;
763      *
764      * if (nDotHit > 0.0)
765      * {
766      *     vec3 midVec = normalize(hitDir + eye);
767      *     float nDotMid = dp3_sat(normal, midVec);
768      *     pFact = pow(nDotMid, material.power);
769      * }
770      *
771      * ambient += light.ambient * atten;
772      * diffuse += light.diffuse * atten * nDotHit;
773      * specular += light.specular * atten * powFact;
774      */
775     if (key->lighting) {
776         struct ureg_dst tmp = ureg_DECL_temporary(ureg);
777         struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
778         struct ureg_dst tmp_y = ureg_writemask(tmp, TGSI_WRITEMASK_Y);
779         struct ureg_dst tmp_z = ureg_writemask(tmp, TGSI_WRITEMASK_Z);
780         struct ureg_dst rAtt = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_W);
781         struct ureg_dst rHit = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
782         struct ureg_dst rMid = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
783 
784         struct ureg_dst rCtr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_W);
785 
786         struct ureg_dst AL = ureg_writemask(AR, TGSI_WRITEMASK_X);
787 
788         /* Light.*.Alpha is not used. */
789         struct ureg_dst rD = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
790         struct ureg_dst rA = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
791         struct ureg_dst rS = ureg_DECL_temporary(ureg);
792 
793         struct ureg_src mtlP = _XXXX(MATERIAL_CONST(4));
794 
795         struct ureg_src cLKind = _XXXX(LIGHT_CONST(0));
796         struct ureg_src cLAtt0 = _YYYY(LIGHT_CONST(0));
797         struct ureg_src cLAtt1 = _ZZZZ(LIGHT_CONST(0));
798         struct ureg_src cLAtt2 = _WWWW(LIGHT_CONST(0));
799         struct ureg_src cLColD = _XYZW(LIGHT_CONST(1));
800         struct ureg_src cLColS = _XYZW(LIGHT_CONST(2));
801         struct ureg_src cLColA = _XYZW(LIGHT_CONST(3));
802         struct ureg_src cLPos  = _XYZW(LIGHT_CONST(4));
803         struct ureg_src cLRng  = _WWWW(LIGHT_CONST(4));
804         struct ureg_src cLDir  = _XYZW(LIGHT_CONST(5));
805         struct ureg_src cLFOff = _WWWW(LIGHT_CONST(5));
806         struct ureg_src cLTht  = _XXXX(LIGHT_CONST(6));
807         struct ureg_src cLPhi  = _YYYY(LIGHT_CONST(6));
808         struct ureg_src cLSDiv = _ZZZZ(LIGHT_CONST(6));
809         struct ureg_src cLLast = _WWWW(LIGHT_CONST(7));
810 
811         const unsigned loop_label = l++;
812 
813         /* Declare all light constants to allow indirect adressing */
814         for (i = 32; i < 96; i++)
815             ureg_DECL_constant(ureg, i);
816 
817         ureg_MOV(ureg, rCtr, ureg_imm1f(ureg, 32.0f)); /* &lightconst(0) */
818         ureg_MOV(ureg, rD, ureg_imm1f(ureg, 0.0f));
819         ureg_MOV(ureg, rA, ureg_imm1f(ureg, 0.0f));
820         ureg_MOV(ureg, rS, ureg_imm1f(ureg, 0.0f));
821 
822         /* loop management */
823         ureg_BGNLOOP(ureg, &label[loop_label]);
824         ureg_ARL(ureg, AL, _W(rCtr));
825 
826         /* if (not DIRECTIONAL light): */
827         ureg_SNE(ureg, tmp_x, cLKind, ureg_imm1f(ureg, D3DLIGHT_DIRECTIONAL));
828         ureg_MOV(ureg, rHit, ureg_negate(cLDir));
829         ureg_MOV(ureg, rAtt, ureg_imm1f(ureg, 1.0f));
830         ureg_IF(ureg, _X(tmp), &label[l++]);
831         {
832             /* hitDir = light.position - eyeVtx
833              * d = length(hitDir)
834              */
835             ureg_ADD(ureg, rHit, cLPos, ureg_negate(vs->aVtx));
836             ureg_DP3(ureg, tmp_x, ureg_src(rHit), ureg_src(rHit));
837             ureg_RSQ(ureg, tmp_y, _X(tmp));
838             ureg_MUL(ureg, tmp_x, _X(tmp), _Y(tmp)); /* length */
839 
840             /* att = 1.0 / (light.att0 + (light.att1 + light.att2 * d) * d) */
841             ureg_MAD(ureg, rAtt, _X(tmp), cLAtt2, cLAtt1);
842             ureg_MAD(ureg, rAtt, _X(tmp), _W(rAtt), cLAtt0);
843             ureg_RCP(ureg, rAtt, _W(rAtt));
844             /* cut-off if distance exceeds Light.Range */
845             ureg_SLT(ureg, tmp_x, _X(tmp), cLRng);
846             ureg_MUL(ureg, rAtt, _W(rAtt), _X(tmp));
847         }
848         ureg_fixup_label(ureg, label[l-1], ureg_get_instruction_number(ureg));
849         ureg_ENDIF(ureg);
850 
851         /* normalize hitDir */
852         ureg_normalize3(ureg, rHit, ureg_src(rHit));
853 
854         /* if (SPOT light) */
855         ureg_SEQ(ureg, tmp_x, cLKind, ureg_imm1f(ureg, D3DLIGHT_SPOT));
856         ureg_IF(ureg, _X(tmp), &label[l++]);
857         {
858             /* rho = dp3(-hitDir, light.spotDir)
859              *
860              * if (rho  > light.ctht2) NOTE: 0 <= phi <= pi, 0 <= theta <= phi
861              *     spotAtt = 1
862              * else
863              * if (rho <= light.cphi2)
864              *     spotAtt = 0
865              * else
866              *     spotAtt = (rho - light.cphi2) / (light.ctht2 - light.cphi2) ^ light.falloff
867              */
868             ureg_DP3(ureg, tmp_y, ureg_negate(ureg_src(rHit)), cLDir); /* rho */
869             ureg_ADD(ureg, tmp_x, _Y(tmp), ureg_negate(cLPhi));
870             ureg_MUL(ureg, tmp_x, _X(tmp), cLSDiv);
871             ureg_POW(ureg, tmp_x, _X(tmp), cLFOff); /* spotAtten */
872             ureg_SGE(ureg, tmp_z, _Y(tmp), cLTht); /* if inside theta && phi */
873             ureg_SGE(ureg, tmp_y, _Y(tmp), cLPhi); /* if inside phi */
874             ureg_MAD(ureg, ureg_saturate(tmp_x), _X(tmp), _Y(tmp), _Z(tmp));
875             ureg_MUL(ureg, rAtt, _W(rAtt), _X(tmp));
876         }
877         ureg_fixup_label(ureg, label[l-1], ureg_get_instruction_number(ureg));
878         ureg_ENDIF(ureg);
879 
880         /* directional factors, let's not use LIT because of clarity */
881 
882         if (has_aNrm) {
883             if (key->localviewer) {
884                 ureg_normalize3(ureg, rMid, vs->aVtx);
885                 ureg_ADD(ureg, rMid, ureg_src(rHit), ureg_negate(ureg_src(rMid)));
886             } else {
887                 ureg_ADD(ureg, rMid, ureg_src(rHit), ureg_imm3f(ureg, 0.0f, 0.0f, -1.0f));
888             }
889             ureg_normalize3(ureg, rMid, ureg_src(rMid));
890             ureg_DP3(ureg, ureg_saturate(tmp_x), vs->aNrm, ureg_src(rHit));
891             ureg_DP3(ureg, ureg_saturate(tmp_y), vs->aNrm, ureg_src(rMid));
892             ureg_MUL(ureg, tmp_z, _X(tmp), _Y(tmp));
893             /* Tests show that specular is computed only if (dp3(normal,hitDir) > 0).
894              * For front facing, it is more restrictive than test (dp3(normal,mid) > 0).
895              * No tests were made for backfacing, so add the two conditions */
896             ureg_IF(ureg, _Z(tmp), &label[l++]);
897             {
898                 ureg_DP3(ureg, ureg_saturate(tmp_y), vs->aNrm, ureg_src(rMid));
899                 ureg_POW(ureg, tmp_y, _Y(tmp), mtlP);
900                 ureg_MUL(ureg, tmp_y, _W(rAtt), _Y(tmp)); /* power factor * att */
901                 ureg_MAD(ureg, rS, cLColS, _Y(tmp), ureg_src(rS)); /* accumulate specular */
902             }
903             ureg_fixup_label(ureg, label[l-1], ureg_get_instruction_number(ureg));
904             ureg_ENDIF(ureg);
905 
906             ureg_MUL(ureg, tmp_x, _W(rAtt), _X(tmp)); /* dp3(normal,hitDir) * att */
907             ureg_MAD(ureg, rD, cLColD, _X(tmp), ureg_src(rD)); /* accumulate diffuse */
908         }
909 
910         ureg_MAD(ureg, rA, cLColA, _W(rAtt), ureg_src(rA)); /* accumulate ambient */
911 
912         /* break if this was the last light */
913         ureg_IF(ureg, cLLast, &label[l++]);
914         ureg_BRK(ureg);
915         ureg_ENDIF(ureg);
916         ureg_fixup_label(ureg, label[l-1], ureg_get_instruction_number(ureg));
917 
918         ureg_ADD(ureg, rCtr, _W(rCtr), ureg_imm1f(ureg, 8.0f));
919         ureg_fixup_label(ureg, label[loop_label], ureg_get_instruction_number(ureg));
920         ureg_ENDLOOP(ureg, &label[loop_label]);
921 
922         /* Apply to material:
923          *
924          * oCol[0] = (material.emissive + material.ambient * rs.ambient) +
925          *           material.ambient * ambient +
926          *           material.diffuse * diffuse +
927          * oCol[1] = material.specular * specular;
928          */
929         if (key->mtl_emissive == 0 && key->mtl_ambient == 0)
930             ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(rA), vs->mtlA, _CONST(19));
931         else {
932             ureg_ADD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(rA), _CONST(25));
933             ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), vs->mtlA, ureg_src(tmp), vs->mtlE);
934         }
935 
936         ureg_MAD(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_XYZ), ureg_src(rD), vs->mtlD, ureg_src(tmp));
937         ureg_MOV(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_W), vs->mtlD);
938         ureg_MUL(ureg, oCol[1], ureg_src(rS), vs->mtlS);
939         ureg_release_temporary(ureg, rAtt);
940         ureg_release_temporary(ureg, rHit);
941         ureg_release_temporary(ureg, rMid);
942         ureg_release_temporary(ureg, rCtr);
943         ureg_release_temporary(ureg, rD);
944         ureg_release_temporary(ureg, rA);
945         ureg_release_temporary(ureg, rS);
946         ureg_release_temporary(ureg, rAtt);
947         ureg_release_temporary(ureg, tmp);
948     } else
949     /* COLOR */
950     if (key->darkness) {
951         if (key->mtl_emissive == 0 && key->mtl_ambient == 0)
952             ureg_MOV(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_XYZ), _CONST(19));
953         else
954             ureg_MAD(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_XYZ), vs->mtlA, _CONST(25), vs->mtlE);
955         ureg_MOV(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_W), vs->mtlD);
956         ureg_MOV(ureg, oCol[1], ureg_imm1f(ureg, 0.0f));
957     } else {
958         ureg_MOV(ureg, oCol[0], vs->aCol[0]);
959         ureg_MOV(ureg, oCol[1], vs->aCol[1]);
960     }
961 
962     /* === Process fog.
963      *
964      * exp(x) = ex2(log2(e) * x)
965      */
966     if (key->fog_mode) {
967         struct ureg_dst tmp = ureg_DECL_temporary(ureg);
968         struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
969         struct ureg_dst tmp_z = ureg_writemask(tmp, TGSI_WRITEMASK_Z);
970         if (key->fog_range) {
971             ureg_DP3(ureg, tmp_x, vs->aVtx, vs->aVtx);
972             ureg_RSQ(ureg, tmp_z, _X(tmp));
973             ureg_MUL(ureg, tmp_z, _Z(tmp), _X(tmp));
974         } else {
975             ureg_MOV(ureg, tmp_z, ureg_abs(_ZZZZ(vs->aVtx)));
976         }
977 
978         if (key->fog_mode == D3DFOG_EXP) {
979             ureg_MUL(ureg, tmp_x, _Z(tmp), _ZZZZ(_CONST(28)));
980             ureg_MUL(ureg, tmp_x, _X(tmp), ureg_imm1f(ureg, -1.442695f));
981             ureg_EX2(ureg, tmp_x, _X(tmp));
982         } else
983         if (key->fog_mode == D3DFOG_EXP2) {
984             ureg_MUL(ureg, tmp_x, _Z(tmp), _ZZZZ(_CONST(28)));
985             ureg_MUL(ureg, tmp_x, _X(tmp), _X(tmp));
986             ureg_MUL(ureg, tmp_x, _X(tmp), ureg_imm1f(ureg, -1.442695f));
987             ureg_EX2(ureg, tmp_x, _X(tmp));
988         } else
989         if (key->fog_mode == D3DFOG_LINEAR) {
990             ureg_ADD(ureg, tmp_x, _XXXX(_CONST(28)), ureg_negate(_Z(tmp)));
991             ureg_MUL(ureg, ureg_saturate(tmp_x), _X(tmp), _YYYY(_CONST(28)));
992         }
993         ureg_MOV(ureg, oFog, _X(tmp));
994         ureg_release_temporary(ureg, tmp);
995     } else if (key->fog && !(key->passthrough & (1 << NINE_DECLUSAGE_FOG))) {
996         ureg_MOV(ureg, oFog, ureg_scalar(vs->aCol[1], TGSI_SWIZZLE_W));
997     }
998 
999     if (key->passthrough & (1 << NINE_DECLUSAGE_BLENDWEIGHT)) {
1000         struct ureg_src input;
1001         struct ureg_dst output;
1002         input = vs->aWgt;
1003         output = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 18);
1004         ureg_MOV(ureg, output, input);
1005     }
1006     if (key->passthrough & (1 << NINE_DECLUSAGE_BLENDINDICES)) {
1007         struct ureg_src input;
1008         struct ureg_dst output;
1009         input = vs->aInd;
1010         output = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 19);
1011         ureg_MOV(ureg, output, input);
1012     }
1013     if (key->passthrough & (1 << NINE_DECLUSAGE_NORMAL)) {
1014         struct ureg_src input;
1015         struct ureg_dst output;
1016         input = vs->aNrm;
1017         output = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 20);
1018         ureg_MOV(ureg, output, input);
1019     }
1020     if (key->passthrough & (1 << NINE_DECLUSAGE_TANGENT)) {
1021         struct ureg_src input;
1022         struct ureg_dst output;
1023         input = build_vs_add_input(vs, NINE_DECLUSAGE_TANGENT);
1024         output = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 21);
1025         ureg_MOV(ureg, output, input);
1026     }
1027     if (key->passthrough & (1 << NINE_DECLUSAGE_BINORMAL)) {
1028         struct ureg_src input;
1029         struct ureg_dst output;
1030         input = build_vs_add_input(vs, NINE_DECLUSAGE_BINORMAL);
1031         output = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 22);
1032         ureg_MOV(ureg, output, input);
1033     }
1034     if (key->passthrough & (1 << NINE_DECLUSAGE_FOG)) {
1035         struct ureg_src input;
1036         struct ureg_dst output;
1037         input = build_vs_add_input(vs, NINE_DECLUSAGE_FOG);
1038         input = ureg_scalar(input, TGSI_SWIZZLE_X);
1039         output = oFog;
1040         ureg_MOV(ureg, output, input);
1041     }
1042     if (key->passthrough & (1 << NINE_DECLUSAGE_DEPTH)) {
1043         (void) 0; /* TODO: replace z of position output ? */
1044     }
1045 
1046     /* ucp for ff applies on world coordinates.
1047      * aVtx is in worldview coordinates. */
1048     if (key->ucp) {
1049         struct ureg_dst clipVect = ureg_DECL_output(ureg, TGSI_SEMANTIC_CLIPVERTEX, 0);
1050         struct ureg_dst tmp = ureg_DECL_temporary(ureg);
1051         ureg_MUL(ureg, tmp, _XXXX(vs->aVtx), _CONST(12));
1052         ureg_MAD(ureg, tmp, _YYYY(vs->aVtx), _CONST(13),  ureg_src(tmp));
1053         ureg_MAD(ureg, tmp, _ZZZZ(vs->aVtx), _CONST(14), ureg_src(tmp));
1054         ureg_ADD(ureg, clipVect, _CONST(15), ureg_src(tmp));
1055         ureg_release_temporary(ureg, tmp);
1056     }
1057 
1058     if (key->position_t && device->driver_caps.window_space_position_support)
1059         ureg_property(ureg, TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION, TRUE);
1060 
1061     ureg_END(ureg);
1062     nine_ureg_tgsi_dump(ureg, FALSE);
1063     return ureg_create_shader_and_destroy(ureg, device->context.pipe);
1064 }
1065 
1066 /* PS FF constants layout:
1067  *
1068  * CONST[ 0.. 7]      stage[i].D3DTSS_CONSTANT
1069  * CONST[ 8..15].x___ stage[i].D3DTSS_BUMPENVMAT00
1070  * CONST[ 8..15]._y__ stage[i].D3DTSS_BUMPENVMAT01
1071  * CONST[ 8..15].__z_ stage[i].D3DTSS_BUMPENVMAT10
1072  * CONST[ 8..15].___w stage[i].D3DTSS_BUMPENVMAT11
1073  * CONST[16..19].x_z_ stage[i].D3DTSS_BUMPENVLSCALE
1074  * CONST[16..19]._y_w stage[i].D3DTSS_BUMPENVLOFFSET
1075  *
1076  * CONST[20] D3DRS_TEXTUREFACTOR
1077  * CONST[21] D3DRS_FOGCOLOR
1078  * CONST[22].x___ RS.FogEnd
1079  * CONST[22]._y__ 1.0f / (RS.FogEnd - RS.FogStart)
1080  * CONST[22].__z_ RS.FogDensity
1081  */
/* State shared by the helpers that assemble the fixed-function pixel shader.
 * nine_ff_build_ps zero-fills it and then sets up the fields below.
 */
struct ps_build_ctx
{
    struct ureg_program *ureg; /* program the instructions are emitted into */

    struct ureg_src vC[2]; /* fragment COLOR inputs: [0] DIFFUSE, [1] SPECULAR */
    struct ureg_src vT[8]; /* TEXCOORD[i], declared only for stages that sample a texture */
    struct ureg_dst rCur; /* D3DTA_CURRENT, initialised from vC[0] */
    struct ureg_dst rMod; /* rCur * rTex; read as D3DTA_CURRENT by the stage whose index equals stage.index_pre_mod */
    struct ureg_src rCurSrc; /* source view of rCur */
    struct ureg_dst rTmp; /* D3DTA_TEMP */
    struct ureg_src rTmpSrc; /* source view of rTmp */
    struct ureg_dst rTex; /* texel fetched for the stage being processed (D3DTA_TEXTURE) */
    struct ureg_src rTexSrc; /* source view of rTex */
    struct ureg_src cBEM[8]; /* NOTE(review): not referenced in the visible code; presumably per-stage bump env matrix constants - confirm */
    struct ureg_src s[8]; /* sampler per stage; left undefined when the stage does not sample */

    struct {
        unsigned index; /* stage currently being emitted */
        unsigned index_pre_mod; /* set to index + 1 by D3DTOP_PREMODULATE; starts at (unsigned)-1 */
    } stage;
};
1103 
1104 static struct ureg_src
ps_get_ts_arg(struct ps_build_ctx * ps,unsigned ta)1105 ps_get_ts_arg(struct ps_build_ctx *ps, unsigned ta)
1106 {
1107     struct ureg_src reg;
1108 
1109     switch (ta & D3DTA_SELECTMASK) {
1110     case D3DTA_CONSTANT:
1111         reg = ureg_DECL_constant(ps->ureg, ps->stage.index);
1112         break;
1113     case D3DTA_CURRENT:
1114         reg = (ps->stage.index == ps->stage.index_pre_mod) ? ureg_src(ps->rMod) : ps->rCurSrc;
1115         break;
1116     case D3DTA_DIFFUSE:
1117         reg = ureg_DECL_fs_input(ps->ureg, TGSI_SEMANTIC_COLOR, 0, TGSI_INTERPOLATE_COLOR);
1118         break;
1119     case D3DTA_SPECULAR:
1120         reg = ureg_DECL_fs_input(ps->ureg, TGSI_SEMANTIC_COLOR, 1, TGSI_INTERPOLATE_COLOR);
1121         break;
1122     case D3DTA_TEMP:
1123         reg = ps->rTmpSrc;
1124         break;
1125     case D3DTA_TEXTURE:
1126         reg = ps->rTexSrc;
1127         break;
1128     case D3DTA_TFACTOR:
1129         reg = ureg_DECL_constant(ps->ureg, 20);
1130         break;
1131     default:
1132         assert(0);
1133         reg = ureg_src_undef();
1134         break;
1135     }
1136     if (ta & D3DTA_COMPLEMENT) {
1137         struct ureg_dst dst = ureg_DECL_temporary(ps->ureg);
1138         ureg_ADD(ps->ureg, dst, ureg_imm1f(ps->ureg, 1.0f), ureg_negate(reg));
1139         reg = ureg_src(dst);
1140     }
1141     if (ta & D3DTA_ALPHAREPLICATE)
1142         reg = _WWWW(reg);
1143     return reg;
1144 }
1145 
1146 static struct ureg_dst
ps_get_ts_dst(struct ps_build_ctx * ps,unsigned ta)1147 ps_get_ts_dst(struct ps_build_ctx *ps, unsigned ta)
1148 {
1149     assert(!(ta & (D3DTA_COMPLEMENT | D3DTA_ALPHAREPLICATE)));
1150 
1151     switch (ta & D3DTA_SELECTMASK) {
1152     case D3DTA_CURRENT:
1153         return ps->rCur;
1154     case D3DTA_TEMP:
1155         return ps->rTmp;
1156     default:
1157         assert(0);
1158         return ureg_dst_undef();
1159     }
1160 }
1161 
/* Tell which stage arguments a texture operation consumes.
 *
 * Returns a bitmask in which bit n is set when arg[n] is read by 'top':
 * nothing for DISABLE, arg1 only, arg2 only, all three for the ternary
 * operations, and arg1 + arg2 for every remaining operation.
 */
static uint8_t ps_d3dtop_args_mask(D3DTEXTUREOP top)
{
    if (top == D3DTOP_DISABLE)
        return 0x0;

    if (top == D3DTOP_SELECTARG1 || top == D3DTOP_PREMODULATE)
        return 0x2;

    if (top == D3DTOP_SELECTARG2)
        return 0x4;

    if (top == D3DTOP_MULTIPLYADD || top == D3DTOP_LERP)
        return 0x7;

    return 0x6;
}
1179 
1180 static inline boolean
is_MOV_no_op(struct ureg_dst dst,struct ureg_src src)1181 is_MOV_no_op(struct ureg_dst dst, struct ureg_src src)
1182 {
1183     return !dst.WriteMask ||
1184         (dst.File == src.File &&
1185          dst.Index == src.Index &&
1186          !dst.Indirect &&
1187          !dst.Saturate &&
1188          !src.Indirect &&
1189          !src.Negate &&
1190          !src.Absolute &&
1191          (!(dst.WriteMask & TGSI_WRITEMASK_X) || (src.SwizzleX == TGSI_SWIZZLE_X)) &&
1192          (!(dst.WriteMask & TGSI_WRITEMASK_Y) || (src.SwizzleY == TGSI_SWIZZLE_Y)) &&
1193          (!(dst.WriteMask & TGSI_WRITEMASK_Z) || (src.SwizzleZ == TGSI_SWIZZLE_Z)) &&
1194          (!(dst.WriteMask & TGSI_WRITEMASK_W) || (src.SwizzleW == TGSI_SWIZZLE_W)));
1195 
1196 }
1197 
1198 static void
ps_do_ts_op(struct ps_build_ctx * ps,unsigned top,struct ureg_dst dst,struct ureg_src * arg)1199 ps_do_ts_op(struct ps_build_ctx *ps, unsigned top, struct ureg_dst dst, struct ureg_src *arg)
1200 {
1201     struct ureg_program *ureg = ps->ureg;
1202     struct ureg_dst tmp = ureg_DECL_temporary(ureg);
1203     struct ureg_dst tmp2 = ureg_DECL_temporary(ureg);
1204     struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
1205 
1206     tmp.WriteMask = dst.WriteMask;
1207 
1208     if (top != D3DTOP_SELECTARG1 && top != D3DTOP_SELECTARG2 &&
1209         top != D3DTOP_MODULATE && top != D3DTOP_PREMODULATE &&
1210         top != D3DTOP_BLENDDIFFUSEALPHA && top != D3DTOP_BLENDTEXTUREALPHA &&
1211         top != D3DTOP_BLENDFACTORALPHA && top != D3DTOP_BLENDCURRENTALPHA &&
1212         top != D3DTOP_BUMPENVMAP && top != D3DTOP_BUMPENVMAPLUMINANCE &&
1213         top != D3DTOP_LERP)
1214         dst = ureg_saturate(dst);
1215 
1216     switch (top) {
1217     case D3DTOP_SELECTARG1:
1218         if (!is_MOV_no_op(dst, arg[1]))
1219             ureg_MOV(ureg, dst, arg[1]);
1220         break;
1221     case D3DTOP_SELECTARG2:
1222         if (!is_MOV_no_op(dst, arg[2]))
1223             ureg_MOV(ureg, dst, arg[2]);
1224         break;
1225     case D3DTOP_MODULATE:
1226         ureg_MUL(ureg, dst, arg[1], arg[2]);
1227         break;
1228     case D3DTOP_MODULATE2X:
1229         ureg_MUL(ureg, tmp, arg[1], arg[2]);
1230         ureg_ADD(ureg, dst, ureg_src(tmp), ureg_src(tmp));
1231         break;
1232     case D3DTOP_MODULATE4X:
1233         ureg_MUL(ureg, tmp, arg[1], arg[2]);
1234         ureg_MUL(ureg, dst, ureg_src(tmp), ureg_imm1f(ureg, 4.0f));
1235         break;
1236     case D3DTOP_ADD:
1237         ureg_ADD(ureg, dst, arg[1], arg[2]);
1238         break;
1239     case D3DTOP_ADDSIGNED:
1240         ureg_ADD(ureg, tmp, arg[1], arg[2]);
1241         ureg_ADD(ureg, dst, ureg_src(tmp), ureg_imm1f(ureg, -0.5f));
1242         break;
1243     case D3DTOP_ADDSIGNED2X:
1244         ureg_ADD(ureg, tmp, arg[1], arg[2]);
1245         ureg_MAD(ureg, dst, ureg_src(tmp), ureg_imm1f(ureg, 2.0f), ureg_imm1f(ureg, -1.0f));
1246         break;
1247     case D3DTOP_SUBTRACT:
1248         ureg_ADD(ureg, dst, arg[1], ureg_negate(arg[2]));
1249         break;
1250     case D3DTOP_ADDSMOOTH:
1251         ureg_ADD(ureg, tmp, ureg_imm1f(ureg, 1.0f), ureg_negate(arg[1]));
1252         ureg_MAD(ureg, dst, ureg_src(tmp), arg[2], arg[1]);
1253         break;
1254     case D3DTOP_BLENDDIFFUSEALPHA:
1255         ureg_LRP(ureg, dst, _WWWW(ps->vC[0]), arg[1], arg[2]);
1256         break;
1257     case D3DTOP_BLENDTEXTUREALPHA:
1258         /* XXX: alpha taken from previous stage, texture or result ? */
1259         ureg_LRP(ureg, dst, _W(ps->rTex), arg[1], arg[2]);
1260         break;
1261     case D3DTOP_BLENDFACTORALPHA:
1262         ureg_LRP(ureg, dst, _WWWW(_CONST(20)), arg[1], arg[2]);
1263         break;
1264     case D3DTOP_BLENDTEXTUREALPHAPM:
1265         ureg_ADD(ureg, tmp_x, ureg_imm1f(ureg, 1.0f), ureg_negate(_W(ps->rTex)));
1266         ureg_MAD(ureg, dst, arg[2], _X(tmp), arg[1]);
1267         break;
1268     case D3DTOP_BLENDCURRENTALPHA:
1269         ureg_LRP(ureg, dst, _WWWW(ps->rCurSrc), arg[1], arg[2]);
1270         break;
1271     case D3DTOP_PREMODULATE:
1272         ureg_MOV(ureg, dst, arg[1]);
1273         ps->stage.index_pre_mod = ps->stage.index + 1;
1274         break;
1275     case D3DTOP_MODULATEALPHA_ADDCOLOR:
1276         ureg_MAD(ureg, dst, _WWWW(arg[1]), arg[2], arg[1]);
1277         break;
1278     case D3DTOP_MODULATECOLOR_ADDALPHA:
1279         ureg_MAD(ureg, dst, arg[1], arg[2], _WWWW(arg[1]));
1280         break;
1281     case D3DTOP_MODULATEINVALPHA_ADDCOLOR:
1282         ureg_ADD(ureg, tmp_x, ureg_imm1f(ureg, 1.0f), ureg_negate(_WWWW(arg[1])));
1283         ureg_MAD(ureg, dst, _X(tmp), arg[2], arg[1]);
1284         break;
1285     case D3DTOP_MODULATEINVCOLOR_ADDALPHA:
1286         ureg_ADD(ureg, tmp, ureg_imm1f(ureg, 1.0f), ureg_negate(arg[1]));
1287         ureg_MAD(ureg, dst, ureg_src(tmp), arg[2], _WWWW(arg[1]));
1288         break;
1289     case D3DTOP_BUMPENVMAP:
1290         break;
1291     case D3DTOP_BUMPENVMAPLUMINANCE:
1292         break;
1293     case D3DTOP_DOTPRODUCT3:
1294         ureg_ADD(ureg, tmp, arg[1], ureg_imm4f(ureg,-0.5,-0.5,-0.5,-0.5));
1295         ureg_ADD(ureg, tmp2, arg[2] , ureg_imm4f(ureg,-0.5,-0.5,-0.5,-0.5));
1296         ureg_DP3(ureg, tmp, ureg_src(tmp), ureg_src(tmp2));
1297         ureg_MUL(ureg, ureg_saturate(dst), ureg_src(tmp), ureg_imm4f(ureg,4.0,4.0,4.0,4.0));
1298         break;
1299     case D3DTOP_MULTIPLYADD:
1300         ureg_MAD(ureg, dst, arg[1], arg[2], arg[0]);
1301         break;
1302     case D3DTOP_LERP:
1303         ureg_LRP(ureg, dst, arg[0], arg[1], arg[2]);
1304         break;
1305     case D3DTOP_DISABLE:
1306         /* no-op ? */
1307         break;
1308     default:
1309         assert(!"invalid D3DTOP");
1310         break;
1311     }
1312     ureg_release_temporary(ureg, tmp);
1313     ureg_release_temporary(ureg, tmp2);
1314 }
1315 
1316 static void *
nine_ff_build_ps(struct NineDevice9 * device,struct nine_ff_ps_key * key)1317 nine_ff_build_ps(struct NineDevice9 *device, struct nine_ff_ps_key *key)
1318 {
1319     struct ps_build_ctx ps;
1320     struct ureg_program *ureg = ureg_create(PIPE_SHADER_FRAGMENT);
1321     struct ureg_dst oCol;
1322     unsigned s;
1323     const unsigned texcoord_sn = get_texcoord_sn(device->screen);
1324 
1325     memset(&ps, 0, sizeof(ps));
1326     ps.ureg = ureg;
1327     ps.stage.index_pre_mod = -1;
1328 
1329     ps.vC[0] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 0, TGSI_INTERPOLATE_COLOR);
1330 
1331     ps.rCur = ureg_DECL_temporary(ureg);
1332     ps.rTmp = ureg_DECL_temporary(ureg);
1333     ps.rTex = ureg_DECL_temporary(ureg);
1334     ps.rCurSrc = ureg_src(ps.rCur);
1335     ps.rTmpSrc = ureg_src(ps.rTmp);
1336     ps.rTexSrc = ureg_src(ps.rTex);
1337 
1338     /* Initial values */
1339     ureg_MOV(ureg, ps.rCur, ps.vC[0]);
1340     ureg_MOV(ureg, ps.rTmp, ureg_imm1f(ureg, 0.0f));
1341     ureg_MOV(ureg, ps.rTex, ureg_imm1f(ureg, 0.0f));
1342 
1343     for (s = 0; s < 8; ++s) {
1344         ps.s[s] = ureg_src_undef();
1345 
1346         if (key->ts[s].colorop != D3DTOP_DISABLE) {
1347             if (key->ts[s].colorarg0 == D3DTA_SPECULAR ||
1348                 key->ts[s].colorarg1 == D3DTA_SPECULAR ||
1349                 key->ts[s].colorarg2 == D3DTA_SPECULAR)
1350                 ps.vC[1] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 1, TGSI_INTERPOLATE_COLOR);
1351 
1352             if (key->ts[s].colorarg0 == D3DTA_TEXTURE ||
1353                 key->ts[s].colorarg1 == D3DTA_TEXTURE ||
1354                 key->ts[s].colorarg2 == D3DTA_TEXTURE) {
1355                 ps.s[s] = ureg_DECL_sampler(ureg, s);
1356                 ps.vT[s] = ureg_DECL_fs_input(ureg, texcoord_sn, s, TGSI_INTERPOLATE_PERSPECTIVE);
1357             }
1358             if (s && (key->ts[s - 1].colorop == D3DTOP_PREMODULATE ||
1359                       key->ts[s - 1].alphaop == D3DTOP_PREMODULATE))
1360                 ps.s[s] = ureg_DECL_sampler(ureg, s);
1361         }
1362 
1363         if (key->ts[s].alphaop != D3DTOP_DISABLE) {
1364             if (key->ts[s].alphaarg0 == D3DTA_SPECULAR ||
1365                 key->ts[s].alphaarg1 == D3DTA_SPECULAR ||
1366                 key->ts[s].alphaarg2 == D3DTA_SPECULAR)
1367                 ps.vC[1] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 1, TGSI_INTERPOLATE_COLOR);
1368 
1369             if (key->ts[s].alphaarg0 == D3DTA_TEXTURE ||
1370                 key->ts[s].alphaarg1 == D3DTA_TEXTURE ||
1371                 key->ts[s].alphaarg2 == D3DTA_TEXTURE) {
1372                 ps.s[s] = ureg_DECL_sampler(ureg, s);
1373                 ps.vT[s] = ureg_DECL_fs_input(ureg, texcoord_sn, s, TGSI_INTERPOLATE_PERSPECTIVE);
1374             }
1375         }
1376     }
1377     if (key->specular)
1378         ps.vC[1] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 1, TGSI_INTERPOLATE_COLOR);
1379 
1380     oCol = ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 0);
1381 
1382     /* Run stages.
1383      */
1384     for (s = 0; s < 8; ++s) {
1385         unsigned colorarg[3];
1386         unsigned alphaarg[3];
1387         const uint8_t used_c = ps_d3dtop_args_mask(key->ts[s].colorop);
1388         const uint8_t used_a = ps_d3dtop_args_mask(key->ts[s].alphaop);
1389         struct ureg_dst dst;
1390         struct ureg_src arg[3];
1391 
1392         if (key->ts[s].colorop == D3DTOP_DISABLE) {
1393             assert (key->ts[s].alphaop == D3DTOP_DISABLE);
1394             continue;
1395         }
1396         ps.stage.index = s;
1397 
1398         DBG("STAGE[%u]: colorop=%s alphaop=%s\n", s,
1399             nine_D3DTOP_to_str(key->ts[s].colorop),
1400             nine_D3DTOP_to_str(key->ts[s].alphaop));
1401 
1402         if (!ureg_src_is_undef(ps.s[s])) {
1403             unsigned target;
1404             struct ureg_src texture_coord = ps.vT[s];
1405             struct ureg_dst delta;
1406             switch (key->ts[s].textarget) {
1407             case 0: target = TGSI_TEXTURE_1D; break;
1408             case 1: target = TGSI_TEXTURE_2D; break;
1409             case 2: target = TGSI_TEXTURE_3D; break;
1410             case 3: target = TGSI_TEXTURE_CUBE; break;
1411             /* this is a 2 bit bitfield, do I really need a default case ? */
1412             }
1413 
1414             /* Modify coordinates */
1415             if (s >= 1 &&
1416                 (key->ts[s-1].colorop == D3DTOP_BUMPENVMAP ||
1417                  key->ts[s-1].colorop == D3DTOP_BUMPENVMAPLUMINANCE)) {
1418                 delta = ureg_DECL_temporary(ureg);
1419                 /* Du' = D3DTSS_BUMPENVMAT00(stage s-1)*t(s-1)R + D3DTSS_BUMPENVMAT10(stage s-1)*t(s-1)G */
1420                 ureg_MUL(ureg, ureg_writemask(delta, TGSI_WRITEMASK_X), _X(ps.rTex), _XXXX(_CONST(8 + s - 1)));
1421                 ureg_MAD(ureg, ureg_writemask(delta, TGSI_WRITEMASK_X), _Y(ps.rTex), _ZZZZ(_CONST(8 + s - 1)), ureg_src(delta));
1422                 /* Dv' = D3DTSS_BUMPENVMAT01(stage s-1)*t(s-1)R + D3DTSS_BUMPENVMAT11(stage s-1)*t(s-1)G */
1423                 ureg_MUL(ureg, ureg_writemask(delta, TGSI_WRITEMASK_Y), _X(ps.rTex), _YYYY(_CONST(8 + s - 1)));
1424                 ureg_MAD(ureg, ureg_writemask(delta, TGSI_WRITEMASK_Y), _Y(ps.rTex), _WWWW(_CONST(8 + s - 1)), ureg_src(delta));
1425                 texture_coord = ureg_src(ureg_DECL_temporary(ureg));
1426                 ureg_MOV(ureg, ureg_writemask(ureg_dst(texture_coord), ureg_dst(ps.vT[s]).WriteMask), ps.vT[s]);
1427                 ureg_ADD(ureg, ureg_writemask(ureg_dst(texture_coord), TGSI_WRITEMASK_XY), texture_coord, ureg_src(delta));
1428                 /* Prepare luminance multiplier
1429                  * t(s)RGBA = t(s)RGBA * clamp[(t(s-1)B * D3DTSS_BUMPENVLSCALE(stage s-1)) + D3DTSS_BUMPENVLOFFSET(stage s-1)] */
1430                 if (key->ts[s-1].colorop == D3DTOP_BUMPENVMAPLUMINANCE) {
1431                     struct ureg_src bumpenvlscale = ((s-1) & 1) ? _ZZZZ(_CONST(16 + (s-1) / 2)) : _XXXX(_CONST(16 + (s-1) / 2));
1432                     struct ureg_src bumpenvloffset = ((s-1) & 1) ? _WWWW(_CONST(16 + (s-1) / 2)) : _YYYY(_CONST(16 + (s-1) / 2));
1433 
1434                     ureg_MAD(ureg, ureg_saturate(ureg_writemask(delta, TGSI_WRITEMASK_X)), _Z(ps.rTex), bumpenvlscale, bumpenvloffset);
1435                 }
1436             }
1437             if (key->projected & (3 << (s *2))) {
1438                 unsigned dim = 1 + ((key->projected >> (2 * s)) & 3);
1439                 if (dim == 4)
1440                     ureg_TXP(ureg, ps.rTex, target, texture_coord, ps.s[s]);
1441                 else {
1442                     struct ureg_dst tmp = ureg_DECL_temporary(ureg);
1443                     ureg_RCP(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), ureg_scalar(texture_coord, dim-1));
1444                     ureg_MUL(ureg, ps.rTmp, _X(tmp), texture_coord);
1445                     ureg_TEX(ureg, ps.rTex, target, ps.rTmpSrc, ps.s[s]);
1446                     ureg_release_temporary(ureg, tmp);
1447                 }
1448             } else {
1449                 ureg_TEX(ureg, ps.rTex, target, texture_coord, ps.s[s]);
1450             }
1451             if (s >= 1 && key->ts[s-1].colorop == D3DTOP_BUMPENVMAPLUMINANCE)
1452                 ureg_MUL(ureg, ps.rTex, ureg_src(ps.rTex), _X(delta));
1453         }
1454 
1455         if (key->ts[s].colorop == D3DTOP_BUMPENVMAP ||
1456             key->ts[s].colorop == D3DTOP_BUMPENVMAPLUMINANCE)
1457             continue;
1458 
1459         dst = ps_get_ts_dst(&ps, key->ts[s].resultarg ? D3DTA_TEMP : D3DTA_CURRENT);
1460 
1461         if (ps.stage.index_pre_mod == ps.stage.index) {
1462             ps.rMod = ureg_DECL_temporary(ureg);
1463             ureg_MUL(ureg, ps.rMod, ps.rCurSrc, ps.rTexSrc);
1464         }
1465 
1466         colorarg[0] = (key->ts[s].colorarg0 | ((key->colorarg_b4[0] >> s) << 4) | ((key->colorarg_b5[0] >> s) << 5)) & 0x3f;
1467         colorarg[1] = (key->ts[s].colorarg1 | ((key->colorarg_b4[1] >> s) << 4) | ((key->colorarg_b5[1] >> s) << 5)) & 0x3f;
1468         colorarg[2] = (key->ts[s].colorarg2 | ((key->colorarg_b4[2] >> s) << 4) | ((key->colorarg_b5[2] >> s) << 5)) & 0x3f;
1469         alphaarg[0] = (key->ts[s].alphaarg0 | ((key->alphaarg_b4[0] >> s) << 4)) & 0x1f;
1470         alphaarg[1] = (key->ts[s].alphaarg1 | ((key->alphaarg_b4[1] >> s) << 4)) & 0x1f;
1471         alphaarg[2] = (key->ts[s].alphaarg2 | ((key->alphaarg_b4[2] >> s) << 4)) & 0x1f;
1472 
1473         if (key->ts[s].colorop != key->ts[s].alphaop ||
1474             colorarg[0] != alphaarg[0] ||
1475             colorarg[1] != alphaarg[1] ||
1476             colorarg[2] != alphaarg[2])
1477             dst.WriteMask = TGSI_WRITEMASK_XYZ;
1478 
1479         /* Special DOTPRODUCT behaviour (see wine tests) */
1480         if (key->ts[s].colorop == D3DTOP_DOTPRODUCT3)
1481             dst.WriteMask = TGSI_WRITEMASK_XYZW;
1482 
1483         if (used_c & 0x1) arg[0] = ps_get_ts_arg(&ps, colorarg[0]);
1484         if (used_c & 0x2) arg[1] = ps_get_ts_arg(&ps, colorarg[1]);
1485         if (used_c & 0x4) arg[2] = ps_get_ts_arg(&ps, colorarg[2]);
1486         ps_do_ts_op(&ps, key->ts[s].colorop, dst, arg);
1487 
1488         if (dst.WriteMask != TGSI_WRITEMASK_XYZW) {
1489             dst.WriteMask = TGSI_WRITEMASK_W;
1490 
1491             if (used_a & 0x1) arg[0] = ps_get_ts_arg(&ps, alphaarg[0]);
1492             if (used_a & 0x2) arg[1] = ps_get_ts_arg(&ps, alphaarg[1]);
1493             if (used_a & 0x4) arg[2] = ps_get_ts_arg(&ps, alphaarg[2]);
1494             ps_do_ts_op(&ps, key->ts[s].alphaop, dst, arg);
1495         }
1496     }
1497 
1498     if (key->specular)
1499         ureg_ADD(ureg, ureg_writemask(ps.rCur, TGSI_WRITEMASK_XYZ), ps.rCurSrc, ps.vC[1]);
1500 
1501     /* Fog.
1502      */
1503     if (key->fog_mode) {
1504         struct ureg_dst rFog = ureg_writemask(ps.rTmp, TGSI_WRITEMASK_X);
1505         struct ureg_src vPos;
1506         if (device->screen->get_param(device->screen,
1507                                       PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL)) {
1508             vPos = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_POSITION, 0);
1509         } else {
1510             vPos = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_POSITION, 0,
1511                                       TGSI_INTERPOLATE_LINEAR);
1512         }
1513 
1514         /* Source is either W or Z.
1515          * When we use vs ff,
1516          * Z is when an orthogonal projection matrix is detected,
1517          * W (WFOG) else.
1518          * Z is used for programmable vs.
1519          * Note: Tests indicate that the projection matrix coefficients do
1520          * actually affect pixel fog (and not vertex fog) when vs ff is used,
1521          * which justifies taking the position's w instead of taking the z coordinate
1522          * before the projection in the vs shader.
1523          */
1524         if (!key->fog_source)
1525             ureg_MOV(ureg, rFog, _ZZZZ(vPos));
1526         else
1527             /* Position's w is 1/w */
1528             ureg_RCP(ureg, rFog, _WWWW(vPos));
1529 
1530         if (key->fog_mode == D3DFOG_EXP) {
1531             ureg_MUL(ureg, rFog, _X(rFog), _ZZZZ(_CONST(22)));
1532             ureg_MUL(ureg, rFog, _X(rFog), ureg_imm1f(ureg, -1.442695f));
1533             ureg_EX2(ureg, rFog, _X(rFog));
1534         } else
1535         if (key->fog_mode == D3DFOG_EXP2) {
1536             ureg_MUL(ureg, rFog, _X(rFog), _ZZZZ(_CONST(22)));
1537             ureg_MUL(ureg, rFog, _X(rFog), _X(rFog));
1538             ureg_MUL(ureg, rFog, _X(rFog), ureg_imm1f(ureg, -1.442695f));
1539             ureg_EX2(ureg, rFog, _X(rFog));
1540         } else
1541         if (key->fog_mode == D3DFOG_LINEAR) {
1542             ureg_ADD(ureg, rFog, _XXXX(_CONST(22)), ureg_negate(_X(rFog)));
1543             ureg_MUL(ureg, ureg_saturate(rFog), _X(rFog), _YYYY(_CONST(22)));
1544         }
1545         ureg_LRP(ureg, ureg_writemask(oCol, TGSI_WRITEMASK_XYZ), _X(rFog), ps.rCurSrc, _CONST(21));
1546         ureg_MOV(ureg, ureg_writemask(oCol, TGSI_WRITEMASK_W), ps.rCurSrc);
1547     } else
1548     if (key->fog) {
1549         struct ureg_src vFog = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_FOG, 0, TGSI_INTERPOLATE_PERSPECTIVE);
1550         ureg_LRP(ureg, ureg_writemask(oCol, TGSI_WRITEMASK_XYZ), _XXXX(vFog), ps.rCurSrc, _CONST(21));
1551         ureg_MOV(ureg, ureg_writemask(oCol, TGSI_WRITEMASK_W), ps.rCurSrc);
1552     } else {
1553         ureg_MOV(ureg, oCol, ps.rCurSrc);
1554     }
1555 
1556     ureg_END(ureg);
1557     nine_ureg_tgsi_dump(ureg, FALSE);
1558     return ureg_create_shader_and_destroy(ureg, device->context.pipe);
1559 }
1560 
1561 static struct NineVertexShader9 *
nine_ff_get_vs(struct NineDevice9 * device)1562 nine_ff_get_vs(struct NineDevice9 *device)
1563 {
1564     const struct nine_context *context = &device->context;
1565     struct NineVertexShader9 *vs;
1566     enum pipe_error err;
1567     struct vs_build_ctx bld;
1568     struct nine_ff_vs_key key;
1569     unsigned s, i;
1570     boolean has_indexes = false;
1571     boolean has_weights = false;
1572     char input_texture_coord[8];
1573 
1574     assert(sizeof(key) <= sizeof(key.value32));
1575 
1576     memset(&key, 0, sizeof(key));
1577     memset(&bld, 0, sizeof(bld));
1578     memset(&input_texture_coord, 0, sizeof(input_texture_coord));
1579 
1580     bld.key = &key;
1581 
1582     /* FIXME: this shouldn't be NULL, but it is on init */
1583     if (context->vdecl) {
1584         key.color0in_one = 1;
1585         key.color1in_zero = 1;
1586         for (i = 0; i < context->vdecl->nelems; i++) {
1587             uint16_t usage = context->vdecl->usage_map[i];
1588             if (usage == NINE_DECLUSAGE_POSITIONT)
1589                 key.position_t = 1;
1590             else if (usage == NINE_DECLUSAGE_i(COLOR, 0))
1591                 key.color0in_one = 0;
1592             else if (usage == NINE_DECLUSAGE_i(COLOR, 1))
1593                 key.color1in_zero = 0;
1594             else if (usage == NINE_DECLUSAGE_i(BLENDINDICES, 0)) {
1595                 has_indexes = true;
1596                 key.passthrough |= 1 << usage;
1597             } else if (usage == NINE_DECLUSAGE_i(BLENDWEIGHT, 0)) {
1598                 has_weights = true;
1599                 key.passthrough |= 1 << usage;
1600             } else if (usage == NINE_DECLUSAGE_i(NORMAL, 0)) {
1601                 key.has_normal = 1;
1602                 key.passthrough |= 1 << usage;
1603             } else if (usage == NINE_DECLUSAGE_PSIZE)
1604                 key.vertexpointsize = 1;
1605             else if (usage % NINE_DECLUSAGE_COUNT == NINE_DECLUSAGE_TEXCOORD) {
1606                 s = usage / NINE_DECLUSAGE_COUNT;
1607                 if (s < 8)
1608                     input_texture_coord[s] = nine_decltype_get_dim(context->vdecl->decls[i].Type);
1609                 else
1610                     DBG("FF given texture coordinate >= 8. Ignoring\n");
1611             } else if (usage < NINE_DECLUSAGE_NONE)
1612                 key.passthrough |= 1 << usage;
1613         }
1614     }
1615     /* ff vs + ps 3.0: some elements are passed to the ps (wine test).
1616      * We do restrict to indices 0 */
1617     key.passthrough &= ~((1 << NINE_DECLUSAGE_POSITION) | (1 << NINE_DECLUSAGE_PSIZE) |
1618                          (1 << NINE_DECLUSAGE_TEXCOORD) | (1 << NINE_DECLUSAGE_POSITIONT) |
1619                          (1 << NINE_DECLUSAGE_TESSFACTOR) | (1 << NINE_DECLUSAGE_SAMPLE));
1620     if (!key.position_t)
1621         key.passthrough = 0;
1622     key.pointscale = !!context->rs[D3DRS_POINTSCALEENABLE];
1623 
1624     key.lighting = !!context->rs[D3DRS_LIGHTING] &&  context->ff.num_lights_active;
1625     key.darkness = !!context->rs[D3DRS_LIGHTING] && !context->ff.num_lights_active;
1626     if (key.position_t) {
1627         key.darkness = 0; /* |= key.lighting; */ /* XXX ? */
1628         key.lighting = 0;
1629     }
1630     if ((key.lighting | key.darkness) && context->rs[D3DRS_COLORVERTEX]) {
1631         uint32_t mask = (key.color0in_one ? 0 : 1) | (key.color1in_zero ? 0 : 2);
1632         key.mtl_diffuse = context->rs[D3DRS_DIFFUSEMATERIALSOURCE] & mask;
1633         key.mtl_ambient = context->rs[D3DRS_AMBIENTMATERIALSOURCE] & mask;
1634         key.mtl_specular = context->rs[D3DRS_SPECULARMATERIALSOURCE] & mask;
1635         key.mtl_emissive = context->rs[D3DRS_EMISSIVEMATERIALSOURCE] & mask;
1636     }
1637     key.fog = !!context->rs[D3DRS_FOGENABLE];
1638     key.fog_mode = (!key.position_t && context->rs[D3DRS_FOGENABLE]) ? context->rs[D3DRS_FOGVERTEXMODE] : 0;
1639     if (key.fog_mode)
1640         key.fog_range = context->rs[D3DRS_RANGEFOGENABLE];
1641 
1642     key.localviewer = !!context->rs[D3DRS_LOCALVIEWER];
1643     key.normalizenormals = !!context->rs[D3DRS_NORMALIZENORMALS];
1644     key.ucp = !!context->rs[D3DRS_CLIPPLANEENABLE];
1645 
1646     if (context->rs[D3DRS_VERTEXBLEND] != D3DVBF_DISABLE) {
1647         key.vertexblend_indexed = !!context->rs[D3DRS_INDEXEDVERTEXBLENDENABLE] && has_indexes;
1648 
1649         switch (context->rs[D3DRS_VERTEXBLEND]) {
1650         case D3DVBF_0WEIGHTS: key.vertexblend = key.vertexblend_indexed; break;
1651         case D3DVBF_1WEIGHTS: key.vertexblend = 2; break;
1652         case D3DVBF_2WEIGHTS: key.vertexblend = 3; break;
1653         case D3DVBF_3WEIGHTS: key.vertexblend = 4; break;
1654         case D3DVBF_TWEENING: key.vertextween = 1; break;
1655         default:
1656             assert(!"invalid D3DVBF");
1657             break;
1658         }
1659         if (!has_weights && context->rs[D3DRS_VERTEXBLEND] != D3DVBF_0WEIGHTS)
1660             key.vertexblend = 0; /* TODO: if key.vertexblend_indexed, perhaps it should use 1.0 as weight, or revert to D3DVBF_0WEIGHTS */
1661     }
1662 
1663     for (s = 0; s < 8; ++s) {
1664         unsigned gen = (context->ff.tex_stage[s][D3DTSS_TEXCOORDINDEX] >> 16) + 1;
1665         unsigned idx = context->ff.tex_stage[s][D3DTSS_TEXCOORDINDEX] & 7;
1666         unsigned dim;
1667 
1668         if (key.position_t && gen > NINED3DTSS_TCI_PASSTHRU)
1669             gen = NINED3DTSS_TCI_PASSTHRU;
1670 
1671         if (!input_texture_coord[idx] && gen == NINED3DTSS_TCI_PASSTHRU)
1672             gen = NINED3DTSS_TCI_DISABLE;
1673 
1674         key.tc_gen |= gen << (s * 3);
1675         key.tc_idx |= idx << (s * 3);
1676         key.tc_dim_input |= ((input_texture_coord[idx]-1) & 0x3) << (s * 2);
1677 
1678         dim = context->ff.tex_stage[s][D3DTSS_TEXTURETRANSFORMFLAGS] & 0x7;
1679         if (dim > 4)
1680             dim = input_texture_coord[idx];
1681         if (dim == 1) /* NV behaviour */
1682             dim = 0;
1683         key.tc_dim_output |= dim << (s * 3);
1684     }
1685 
1686     vs = util_hash_table_get(device->ff.ht_vs, &key);
1687     if (vs)
1688         return vs;
1689     NineVertexShader9_new(device, &vs, NULL, nine_ff_build_vs(device, &bld));
1690 
1691     nine_ff_prune_vs(device);
1692     if (vs) {
1693         unsigned n;
1694 
1695         memcpy(&vs->ff_key, &key, sizeof(vs->ff_key));
1696 
1697         err = util_hash_table_set(device->ff.ht_vs, &vs->ff_key, vs);
1698         (void)err;
1699         assert(err == PIPE_OK);
1700         device->ff.num_vs++;
1701         NineUnknown_ConvertRefToBind(NineUnknown(vs));
1702 
1703         vs->num_inputs = bld.num_inputs;
1704         for (n = 0; n < bld.num_inputs; ++n)
1705             vs->input_map[n].ndecl = bld.input[n];
1706 
1707         vs->position_t = key.position_t;
1708         vs->point_size = key.vertexpointsize | key.pointscale;
1709     }
1710     return vs;
1711 }
1712 
/* Pointer to the D3DTS_<n> transform matrix of the `context` variable that is
 * in scope at the expansion site; FALSE is passed as the last argument of
 * nine_state_access_transform(). */
#define GET_D3DTS(n) nine_state_access_transform(&context->ff, D3DTS_##n, FALSE)
/* Non-zero when the dirty bit of transform D3DTS_<n> is set in
 * (s)->ff.changed.transform (32 transforms per word).  The mask is built from
 * an unsigned 1 so that selecting bit 31 never shifts into an int sign bit. */
#define IS_D3DTS_DIRTY(s,n) ((s)->ff.changed.transform[(D3DTS_##n) / 32] & (1u << ((D3DTS_##n) % 32)))
1715 
1716 static struct NinePixelShader9 *
nine_ff_get_ps(struct NineDevice9 * device)1717 nine_ff_get_ps(struct NineDevice9 *device)
1718 {
1719     struct nine_context *context = &device->context;
1720     D3DMATRIX *projection_matrix = GET_D3DTS(PROJECTION);
1721     struct NinePixelShader9 *ps;
1722     enum pipe_error err;
1723     struct nine_ff_ps_key key;
1724     unsigned s;
1725     uint8_t sampler_mask = 0;
1726 
1727     assert(sizeof(key) <= sizeof(key.value32));
1728 
1729     memset(&key, 0, sizeof(key));
1730     for (s = 0; s < 8; ++s) {
1731         key.ts[s].colorop = context->ff.tex_stage[s][D3DTSS_COLOROP];
1732         key.ts[s].alphaop = context->ff.tex_stage[s][D3DTSS_ALPHAOP];
1733         const uint8_t used_c = ps_d3dtop_args_mask(key.ts[s].colorop);
1734         const uint8_t used_a = ps_d3dtop_args_mask(key.ts[s].alphaop);
1735         /* MSDN says D3DTOP_DISABLE disables this and all subsequent stages.
1736          * ALPHAOP cannot be enabled if COLOROP is disabled.
1737          * Verified on Windows. */
1738         if (key.ts[s].colorop == D3DTOP_DISABLE) {
1739             key.ts[s].alphaop = D3DTOP_DISABLE; /* DISABLE == 1, avoid degenerate keys */
1740             break;
1741         }
1742 
1743         if (!context->texture[s].enabled &&
1744             ((context->ff.tex_stage[s][D3DTSS_COLORARG0] == D3DTA_TEXTURE &&
1745               used_c & 0x1) ||
1746              (context->ff.tex_stage[s][D3DTSS_COLORARG1] == D3DTA_TEXTURE &&
1747               used_c & 0x2) ||
1748              (context->ff.tex_stage[s][D3DTSS_COLORARG2] == D3DTA_TEXTURE &&
1749               used_c & 0x4))) {
1750             /* Tested on Windows: Invalid texture read disables the stage
1751              * and the subsequent ones, but only for colorop. For alpha,
1752              * it's as if the texture had alpha of 1.0, which is what
1753              * has our dummy texture in that case. Invalid color also
1754              * disabled the following alpha stages. */
1755             key.ts[s].colorop = key.ts[s].alphaop = D3DTOP_DISABLE;
1756             break;
1757         }
1758 
1759         if (context->ff.tex_stage[s][D3DTSS_COLORARG0] == D3DTA_TEXTURE ||
1760             context->ff.tex_stage[s][D3DTSS_COLORARG1] == D3DTA_TEXTURE ||
1761             context->ff.tex_stage[s][D3DTSS_COLORARG2] == D3DTA_TEXTURE ||
1762             context->ff.tex_stage[s][D3DTSS_ALPHAARG0] == D3DTA_TEXTURE ||
1763             context->ff.tex_stage[s][D3DTSS_ALPHAARG1] == D3DTA_TEXTURE ||
1764             context->ff.tex_stage[s][D3DTSS_ALPHAARG2] == D3DTA_TEXTURE)
1765             sampler_mask |= (1 << s);
1766 
1767         if (key.ts[s].colorop != D3DTOP_DISABLE) {
1768             if (used_c & 0x1) key.ts[s].colorarg0 = context->ff.tex_stage[s][D3DTSS_COLORARG0];
1769             if (used_c & 0x2) key.ts[s].colorarg1 = context->ff.tex_stage[s][D3DTSS_COLORARG1];
1770             if (used_c & 0x4) key.ts[s].colorarg2 = context->ff.tex_stage[s][D3DTSS_COLORARG2];
1771             if (used_c & 0x1) key.colorarg_b4[0] |= (context->ff.tex_stage[s][D3DTSS_COLORARG0] >> 4) << s;
1772             if (used_c & 0x1) key.colorarg_b5[0] |= (context->ff.tex_stage[s][D3DTSS_COLORARG0] >> 5) << s;
1773             if (used_c & 0x2) key.colorarg_b4[1] |= (context->ff.tex_stage[s][D3DTSS_COLORARG1] >> 4) << s;
1774             if (used_c & 0x2) key.colorarg_b5[1] |= (context->ff.tex_stage[s][D3DTSS_COLORARG1] >> 5) << s;
1775             if (used_c & 0x4) key.colorarg_b4[2] |= (context->ff.tex_stage[s][D3DTSS_COLORARG2] >> 4) << s;
1776             if (used_c & 0x4) key.colorarg_b5[2] |= (context->ff.tex_stage[s][D3DTSS_COLORARG2] >> 5) << s;
1777         }
1778         if (key.ts[s].alphaop != D3DTOP_DISABLE) {
1779             if (used_a & 0x1) key.ts[s].alphaarg0 = context->ff.tex_stage[s][D3DTSS_ALPHAARG0];
1780             if (used_a & 0x2) key.ts[s].alphaarg1 = context->ff.tex_stage[s][D3DTSS_ALPHAARG1];
1781             if (used_a & 0x4) key.ts[s].alphaarg2 = context->ff.tex_stage[s][D3DTSS_ALPHAARG2];
1782             if (used_a & 0x1) key.alphaarg_b4[0] |= (context->ff.tex_stage[s][D3DTSS_ALPHAARG0] >> 4) << s;
1783             if (used_a & 0x2) key.alphaarg_b4[1] |= (context->ff.tex_stage[s][D3DTSS_ALPHAARG1] >> 4) << s;
1784             if (used_a & 0x4) key.alphaarg_b4[2] |= (context->ff.tex_stage[s][D3DTSS_ALPHAARG2] >> 4) << s;
1785         }
1786         key.ts[s].resultarg = context->ff.tex_stage[s][D3DTSS_RESULTARG] == D3DTA_TEMP;
1787 
1788         if (context->texture[s].enabled) {
1789             switch (context->texture[s].type) {
1790             case D3DRTYPE_TEXTURE:       key.ts[s].textarget = 1; break;
1791             case D3DRTYPE_VOLUMETEXTURE: key.ts[s].textarget = 2; break;
1792             case D3DRTYPE_CUBETEXTURE:   key.ts[s].textarget = 3; break;
1793             default:
1794                 assert(!"unexpected texture type");
1795                 break;
1796             }
1797         } else {
1798             key.ts[s].textarget = 1;
1799         }
1800     }
1801 
1802     /* Note: If colorop is D3DTOP_DISABLE for the first stage
1803      * (which implies alphaop is too), nothing particular happens,
1804      * that is, current is equal to diffuse (which is the case anyway,
1805      * because it is how it is initialized).
1806      * Special case seems if alphaop is D3DTOP_DISABLE and not colorop,
1807      * because then if the resultarg is TEMP, then diffuse alpha is written
1808      * to it. */
1809     if (key.ts[0].colorop != D3DTOP_DISABLE &&
1810         key.ts[0].alphaop == D3DTOP_DISABLE &&
1811         key.ts[0].resultarg != 0) {
1812         key.ts[0].alphaop = D3DTOP_SELECTARG1;
1813         key.ts[0].alphaarg1 = D3DTA_DIFFUSE;
1814     }
1815     /* When no alpha stage writes to current, diffuse alpha is taken.
1816      * Since we initialize current to diffuse, we have the behaviour. */
1817 
1818     /* Last stage always writes to Current */
1819     if (s >= 1)
1820         key.ts[s-1].resultarg = 0;
1821 
1822     key.projected = nine_ff_get_projected_key(context);
1823     key.specular = !!context->rs[D3DRS_SPECULARENABLE];
1824 
1825     for (; s < 8; ++s)
1826         key.ts[s].colorop = key.ts[s].alphaop = D3DTOP_DISABLE;
1827     if (context->rs[D3DRS_FOGENABLE])
1828         key.fog_mode = context->rs[D3DRS_FOGTABLEMODE];
1829     key.fog = !!context->rs[D3DRS_FOGENABLE];
1830     /* Pixel fog (with WFOG advertised): source is either Z or W.
1831      * W is the source if vs ff is used, and the
1832      * projection matrix is not orthogonal.
1833      * Tests on Win 10 seem to indicate _34
1834      * and _33 are checked against 0, 1. */
1835     if (key.fog_mode && key.fog)
1836         key.fog_source = !context->programmable_vs &&
1837             !(projection_matrix->_34 == 0.0f &&
1838               projection_matrix->_44 == 1.0f);
1839 
1840     ps = util_hash_table_get(device->ff.ht_ps, &key);
1841     if (ps)
1842         return ps;
1843     NinePixelShader9_new(device, &ps, NULL, nine_ff_build_ps(device, &key));
1844 
1845     nine_ff_prune_ps(device);
1846     if (ps) {
1847         memcpy(&ps->ff_key, &key, sizeof(ps->ff_key));
1848 
1849         err = util_hash_table_set(device->ff.ht_ps, &ps->ff_key, ps);
1850         (void)err;
1851         assert(err == PIPE_OK);
1852         device->ff.num_ps++;
1853         NineUnknown_ConvertRefToBind(NineUnknown(ps));
1854 
1855         ps->rt_mask = 0x1;
1856         ps->sampler_mask = sampler_mask;
1857     }
1858     return ps;
1859 }
1860 
1861 static void
nine_ff_load_vs_transforms(struct NineDevice9 * device)1862 nine_ff_load_vs_transforms(struct NineDevice9 *device)
1863 {
1864     struct nine_context *context = &device->context;
1865     D3DMATRIX T;
1866     D3DMATRIX *M = (D3DMATRIX *)device->ff.vs_const;
1867     unsigned i;
1868 
1869     /* TODO: make this nicer, and only upload the ones we need */
1870     /* TODO: use ff.vs_const as storage of W, V, P matrices */
1871 
1872     if (IS_D3DTS_DIRTY(context, WORLD) ||
1873         IS_D3DTS_DIRTY(context, VIEW) ||
1874         IS_D3DTS_DIRTY(context, PROJECTION)) {
1875         /* WVP, WV matrices */
1876         nine_d3d_matrix_matrix_mul(&M[1], GET_D3DTS(WORLD), GET_D3DTS(VIEW));
1877         nine_d3d_matrix_matrix_mul(&M[0], &M[1], GET_D3DTS(PROJECTION));
1878 
1879         /* normal matrix == transpose(inverse(WV)) */
1880         nine_d3d_matrix_inverse(&T, &M[1]);
1881         nine_d3d_matrix_transpose(&M[4], &T);
1882 
1883         /* P matrix */
1884         M[2] = *GET_D3DTS(PROJECTION);
1885 
1886         /* V and W matrix */
1887         nine_d3d_matrix_inverse(&M[3], GET_D3DTS(VIEW));
1888         M[40] = M[1];
1889     }
1890 
1891     if (context->rs[D3DRS_VERTEXBLEND] != D3DVBF_DISABLE) {
1892         /* load other world matrices */
1893         for (i = 1; i <= 8; ++i) {
1894             nine_d3d_matrix_matrix_mul(&M[40 + i], GET_D3DTS(WORLDMATRIX(i)), GET_D3DTS(VIEW));
1895         }
1896     }
1897 
1898     device->ff.vs_const[30 * 4] = asfloat(context->rs[D3DRS_TWEENFACTOR]);
1899 }
1900 
1901 static void
nine_ff_load_lights(struct NineDevice9 * device)1902 nine_ff_load_lights(struct NineDevice9 *device)
1903 {
1904     struct nine_context *context = &device->context;
1905     struct fvec4 *dst = (struct fvec4 *)device->ff.vs_const;
1906     unsigned l;
1907 
1908     if (context->changed.group & NINE_STATE_FF_MATERIAL) {
1909         const D3DMATERIAL9 *mtl = &context->ff.material;
1910 
1911         memcpy(&dst[20], &mtl->Diffuse, 4 * sizeof(float));
1912         memcpy(&dst[21], &mtl->Ambient, 4 * sizeof(float));
1913         memcpy(&dst[22], &mtl->Specular, 4 * sizeof(float));
1914         dst[23].x = mtl->Power;
1915         memcpy(&dst[24], &mtl->Emissive, 4 * sizeof(float));
1916         d3dcolor_to_rgba(&dst[25].x, context->rs[D3DRS_AMBIENT]);
1917         dst[19].x = dst[25].x * mtl->Ambient.r + mtl->Emissive.r;
1918         dst[19].y = dst[25].y * mtl->Ambient.g + mtl->Emissive.g;
1919         dst[19].z = dst[25].z * mtl->Ambient.b + mtl->Emissive.b;
1920     }
1921 
1922     if (!(context->changed.group & NINE_STATE_FF_LIGHTING))
1923         return;
1924 
1925     for (l = 0; l < context->ff.num_lights_active; ++l) {
1926         const D3DLIGHT9 *light = &context->ff.light[context->ff.active_light[l]];
1927 
1928         dst[32 + l * 8].x = light->Type;
1929         dst[32 + l * 8].y = light->Attenuation0;
1930         dst[32 + l * 8].z = light->Attenuation1;
1931         dst[32 + l * 8].w = light->Attenuation2;
1932         memcpy(&dst[33 + l * 8].x, &light->Diffuse, sizeof(light->Diffuse));
1933         memcpy(&dst[34 + l * 8].x, &light->Specular, sizeof(light->Specular));
1934         memcpy(&dst[35 + l * 8].x, &light->Ambient, sizeof(light->Ambient));
1935         nine_d3d_vector4_matrix_mul((D3DVECTOR *)&dst[36 + l * 8].x, &light->Position, GET_D3DTS(VIEW));
1936         nine_d3d_vector3_matrix_mul((D3DVECTOR *)&dst[37 + l * 8].x, &light->Direction, GET_D3DTS(VIEW));
1937         dst[36 + l * 8].w = light->Type == D3DLIGHT_DIRECTIONAL ? 1e9f : light->Range;
1938         dst[37 + l * 8].w = light->Falloff;
1939         dst[38 + l * 8].x = cosf(light->Theta * 0.5f);
1940         dst[38 + l * 8].y = cosf(light->Phi * 0.5f);
1941         dst[38 + l * 8].z = 1.0f / (dst[38 + l * 8].x - dst[38 + l * 8].y);
1942         dst[39 + l * 8].w = (float)((l + 1) == context->ff.num_lights_active);
1943     }
1944 }
1945 
1946 static void
nine_ff_load_point_and_fog_params(struct NineDevice9 * device)1947 nine_ff_load_point_and_fog_params(struct NineDevice9 *device)
1948 {
1949     struct nine_context *context = &device->context;
1950     struct fvec4 *dst = (struct fvec4 *)device->ff.vs_const;
1951 
1952     if (!(context->changed.group & NINE_STATE_FF_OTHER))
1953         return;
1954     dst[26].x = asfloat(context->rs[D3DRS_POINTSIZE_MIN]);
1955     dst[26].y = asfloat(context->rs[D3DRS_POINTSIZE_MAX]);
1956     dst[26].z = asfloat(context->rs[D3DRS_POINTSIZE]);
1957     dst[26].w = asfloat(context->rs[D3DRS_POINTSCALE_A]);
1958     dst[27].x = asfloat(context->rs[D3DRS_POINTSCALE_B]);
1959     dst[27].y = asfloat(context->rs[D3DRS_POINTSCALE_C]);
1960     dst[28].x = asfloat(context->rs[D3DRS_FOGEND]);
1961     dst[28].y = 1.0f / (asfloat(context->rs[D3DRS_FOGEND]) - asfloat(context->rs[D3DRS_FOGSTART]));
1962     if (isinf(dst[28].y))
1963         dst[28].y = 0.0f;
1964     dst[28].z = asfloat(context->rs[D3DRS_FOGDENSITY]);
1965 }
1966 
1967 static void
nine_ff_load_tex_matrices(struct NineDevice9 * device)1968 nine_ff_load_tex_matrices(struct NineDevice9 *device)
1969 {
1970     struct nine_context *context = &device->context;
1971     D3DMATRIX *M = (D3DMATRIX *)device->ff.vs_const;
1972     unsigned s;
1973 
1974     if (!(context->ff.changed.transform[0] & 0xff0000))
1975         return;
1976     for (s = 0; s < 8; ++s) {
1977         if (IS_D3DTS_DIRTY(context, TEXTURE0 + s))
1978             nine_d3d_matrix_transpose(&M[32 + s], nine_state_access_transform(&context->ff, D3DTS_TEXTURE0 + s, FALSE));
1979     }
1980 }
1981 
1982 static void
nine_ff_load_ps_params(struct NineDevice9 * device)1983 nine_ff_load_ps_params(struct NineDevice9 *device)
1984 {
1985     struct nine_context *context = &device->context;
1986     struct fvec4 *dst = (struct fvec4 *)device->ff.ps_const;
1987     unsigned s;
1988 
1989     if (!(context->changed.group & (NINE_STATE_FF_PSSTAGES | NINE_STATE_FF_OTHER)))
1990         return;
1991 
1992     for (s = 0; s < 8; ++s)
1993         d3dcolor_to_rgba(&dst[s].x, context->ff.tex_stage[s][D3DTSS_CONSTANT]);
1994 
1995     for (s = 0; s < 8; ++s) {
1996         dst[8 + s].x = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVMAT00]);
1997         dst[8 + s].y = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVMAT01]);
1998         dst[8 + s].z = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVMAT10]);
1999         dst[8 + s].w = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVMAT11]);
2000         if (s & 1) {
2001             dst[16 + s / 2].z = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVLSCALE]);
2002             dst[16 + s / 2].w = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVLOFFSET]);
2003         } else {
2004             dst[16 + s / 2].x = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVLSCALE]);
2005             dst[16 + s / 2].y = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVLOFFSET]);
2006         }
2007     }
2008 
2009     d3dcolor_to_rgba(&dst[20].x, context->rs[D3DRS_TEXTUREFACTOR]);
2010     d3dcolor_to_rgba(&dst[21].x, context->rs[D3DRS_FOGCOLOR]);
2011     dst[22].x = asfloat(context->rs[D3DRS_FOGEND]);
2012     dst[22].y = 1.0f / (asfloat(context->rs[D3DRS_FOGEND]) - asfloat(context->rs[D3DRS_FOGSTART]));
2013     dst[22].z = asfloat(context->rs[D3DRS_FOGDENSITY]);
2014 }
2015 
2016 static void
nine_ff_load_viewport_info(struct NineDevice9 * device)2017 nine_ff_load_viewport_info(struct NineDevice9 *device)
2018 {
2019     D3DVIEWPORT9 *viewport = &device->context.viewport;
2020     struct fvec4 *dst = (struct fvec4 *)device->ff.vs_const;
2021     float diffZ = viewport->MaxZ - viewport->MinZ;
2022 
2023     /* Note: the other functions avoids to fill the const again if nothing changed.
2024      * But we don't have much to fill, and adding code to allow that may be complex
2025      * so just fill it always */
2026     dst[100].x = 2.0f / (float)(viewport->Width);
2027     dst[100].y = 2.0f / (float)(viewport->Height);
2028     dst[100].z = (diffZ == 0.0f) ? 0.0f : (1.0f / diffZ);
2029     dst[100].w = (float)(viewport->Width);
2030     dst[101].x = (float)(viewport->X);
2031     dst[101].y = (float)(viewport->Y);
2032     dst[101].z = (float)(viewport->MinZ);
2033 }
2034 
2035 void
nine_ff_update(struct NineDevice9 * device)2036 nine_ff_update(struct NineDevice9 *device)
2037 {
2038     struct nine_context *context = &device->context;
2039     struct pipe_constant_buffer cb;
2040 
2041     DBG("vs=%p ps=%p\n", context->vs, context->ps);
2042 
2043     /* NOTE: the only reference belongs to the hash table */
2044     if (!context->programmable_vs) {
2045         device->ff.vs = nine_ff_get_vs(device);
2046         context->changed.group |= NINE_STATE_VS;
2047     }
2048     if (!context->ps) {
2049         device->ff.ps = nine_ff_get_ps(device);
2050         context->changed.group |= NINE_STATE_PS;
2051     }
2052 
2053     if (!context->programmable_vs) {
2054         nine_ff_load_vs_transforms(device);
2055         nine_ff_load_tex_matrices(device);
2056         nine_ff_load_lights(device);
2057         nine_ff_load_point_and_fog_params(device);
2058         nine_ff_load_viewport_info(device);
2059 
2060         memset(context->ff.changed.transform, 0, sizeof(context->ff.changed.transform));
2061 
2062         cb.buffer_offset = 0;
2063         cb.buffer = NULL;
2064         cb.user_buffer = device->ff.vs_const;
2065         cb.buffer_size = NINE_FF_NUM_VS_CONST * 4 * sizeof(float);
2066 
2067         context->pipe_data.cb_vs_ff = cb;
2068         context->commit |= NINE_STATE_COMMIT_CONST_VS;
2069     }
2070 
2071     if (!context->ps) {
2072         nine_ff_load_ps_params(device);
2073 
2074         cb.buffer_offset = 0;
2075         cb.buffer = NULL;
2076         cb.user_buffer = device->ff.ps_const;
2077         cb.buffer_size = NINE_FF_NUM_PS_CONST * 4 * sizeof(float);
2078 
2079         context->pipe_data.cb_ps_ff = cb;
2080         context->commit |= NINE_STATE_COMMIT_CONST_PS;
2081     }
2082 
2083     context->changed.group &= ~NINE_STATE_FF;
2084 }
2085 
2086 
2087 boolean
nine_ff_init(struct NineDevice9 * device)2088 nine_ff_init(struct NineDevice9 *device)
2089 {
2090     device->ff.ht_vs = util_hash_table_create(nine_ff_vs_key_hash,
2091                                               nine_ff_vs_key_comp);
2092     device->ff.ht_ps = util_hash_table_create(nine_ff_ps_key_hash,
2093                                               nine_ff_ps_key_comp);
2094 
2095     device->ff.ht_fvf = util_hash_table_create(nine_ff_fvf_key_hash,
2096                                                nine_ff_fvf_key_comp);
2097 
2098     device->ff.vs_const = CALLOC(NINE_FF_NUM_VS_CONST, 4 * sizeof(float));
2099     device->ff.ps_const = CALLOC(NINE_FF_NUM_PS_CONST, 4 * sizeof(float));
2100 
2101     return device->ff.ht_vs && device->ff.ht_ps &&
2102         device->ff.ht_fvf &&
2103         device->ff.vs_const && device->ff.ps_const;
2104 }
2105 
nine_ff_ht_delete_cb(void * key,void * value,void * data)2106 static enum pipe_error nine_ff_ht_delete_cb(void *key, void *value, void *data)
2107 {
2108     NineUnknown_Unbind(NineUnknown(value));
2109     return PIPE_OK;
2110 }
2111 
2112 void
nine_ff_fini(struct NineDevice9 * device)2113 nine_ff_fini(struct NineDevice9 *device)
2114 {
2115     if (device->ff.ht_vs) {
2116         util_hash_table_foreach(device->ff.ht_vs, nine_ff_ht_delete_cb, NULL);
2117         util_hash_table_destroy(device->ff.ht_vs);
2118     }
2119     if (device->ff.ht_ps) {
2120         util_hash_table_foreach(device->ff.ht_ps, nine_ff_ht_delete_cb, NULL);
2121         util_hash_table_destroy(device->ff.ht_ps);
2122     }
2123     if (device->ff.ht_fvf) {
2124         util_hash_table_foreach(device->ff.ht_fvf, nine_ff_ht_delete_cb, NULL);
2125         util_hash_table_destroy(device->ff.ht_fvf);
2126     }
2127     device->ff.vs = NULL; /* destroyed by unbinding from hash table */
2128     device->ff.ps = NULL;
2129 
2130     FREE(device->ff.vs_const);
2131     FREE(device->ff.ps_const);
2132 }
2133 
2134 static void
nine_ff_prune_vs(struct NineDevice9 * device)2135 nine_ff_prune_vs(struct NineDevice9 *device)
2136 {
2137     struct nine_context *context = &device->context;
2138 
2139     if (device->ff.num_vs > 100) {
2140         /* could destroy the bound one here, so unbind */
2141         context->pipe->bind_vs_state(context->pipe, NULL);
2142         util_hash_table_foreach(device->ff.ht_vs, nine_ff_ht_delete_cb, NULL);
2143         util_hash_table_clear(device->ff.ht_vs);
2144         device->ff.num_vs = 0;
2145         context->changed.group |= NINE_STATE_VS;
2146     }
2147 }
2148 static void
nine_ff_prune_ps(struct NineDevice9 * device)2149 nine_ff_prune_ps(struct NineDevice9 *device)
2150 {
2151     struct nine_context *context = &device->context;
2152 
2153     if (device->ff.num_ps > 100) {
2154         /* could destroy the bound one here, so unbind */
2155         context->pipe->bind_fs_state(context->pipe, NULL);
2156         util_hash_table_foreach(device->ff.ht_ps, nine_ff_ht_delete_cb, NULL);
2157         util_hash_table_clear(device->ff.ht_ps);
2158         device->ff.num_ps = 0;
2159         context->changed.group |= NINE_STATE_PS;
2160     }
2161 }
2162 
2163 /* ========================================================================== */
2164 
2165 /* Matrix multiplication:
2166  *
2167  * in memory: 0 1 2 3 (row major)
2168  *            4 5 6 7
2169  *            8 9 a b
2170  *            c d e f
2171  *
2172  *    cA cB cC cD
2173  * r0             = (r0 * cA) (r0 * cB) . .
2174  * r1             = (r1 * cA) (r1 * cB)
2175  * r2             = (r2 * cA) .
2176  * r3             = (r3 * cA) .
2177  *
2178  *               r: (11) (12) (13) (14)
2179  *                  (21) (22) (23) (24)
2180  *                  (31) (32) (33) (34)
2181  *                  (41) (42) (43) (44)
2182  * l: (11 12 13 14)
2183  *    (21 22 23 24)
2184  *    (31 32 33 34)
2185  *    (41 42 43 44)
2186  *
2187  * v: (x  y  z  1 )
2188  *
2189  * t.xyzw = MUL(v.xxxx, r[0]);
2190  * t.xyzw = MAD(v.yyyy, r[1], t.xyzw);
2191  * t.xyzw = MAD(v.zzzz, r[2], t.xyzw);
2192  * v.xyzw = MAD(v.wwww, r[3], t.xyzw);
2193  *
2194  * v.x = DP4(v, c[0]);
2195  * v.y = DP4(v, c[1]);
2196  * v.z = DP4(v, c[2]);
2197  * v.w = DP4(v, c[3]) = 1
2198  */
2199 
2200 /*
2201 static void
2202 nine_D3DMATRIX_print(const D3DMATRIX *M)
2203 {
2204     DBG("\n(%f %f %f %f)\n"
2205         "(%f %f %f %f)\n"
2206         "(%f %f %f %f)\n"
2207         "(%f %f %f %f)\n",
2208         M->m[0][0], M->m[0][1], M->m[0][2], M->m[0][3],
2209         M->m[1][0], M->m[1][1], M->m[1][2], M->m[1][3],
2210         M->m[2][0], M->m[2][1], M->m[2][2], M->m[2][3],
2211         M->m[3][0], M->m[3][1], M->m[3][2], M->m[3][3]);
2212 }
2213 */
2214 
2215 static inline float
nine_DP4_row_col(const D3DMATRIX * A,int r,const D3DMATRIX * B,int c)2216 nine_DP4_row_col(const D3DMATRIX *A, int r, const D3DMATRIX *B, int c)
2217 {
2218     return A->m[r][0] * B->m[0][c] +
2219            A->m[r][1] * B->m[1][c] +
2220            A->m[r][2] * B->m[2][c] +
2221            A->m[r][3] * B->m[3][c];
2222 }
2223 
2224 static inline float
nine_DP4_vec_col(const D3DVECTOR * v,const D3DMATRIX * M,int c)2225 nine_DP4_vec_col(const D3DVECTOR *v, const D3DMATRIX *M, int c)
2226 {
2227     return v->x * M->m[0][c] +
2228            v->y * M->m[1][c] +
2229            v->z * M->m[2][c] +
2230            1.0f * M->m[3][c];
2231 }
2232 
2233 static inline float
nine_DP3_vec_col(const D3DVECTOR * v,const D3DMATRIX * M,int c)2234 nine_DP3_vec_col(const D3DVECTOR *v, const D3DMATRIX *M, int c)
2235 {
2236     return v->x * M->m[0][c] +
2237            v->y * M->m[1][c] +
2238            v->z * M->m[2][c];
2239 }
2240 
2241 void
nine_d3d_matrix_matrix_mul(D3DMATRIX * D,const D3DMATRIX * L,const D3DMATRIX * R)2242 nine_d3d_matrix_matrix_mul(D3DMATRIX *D, const D3DMATRIX *L, const D3DMATRIX *R)
2243 {
2244     D->_11 = nine_DP4_row_col(L, 0, R, 0);
2245     D->_12 = nine_DP4_row_col(L, 0, R, 1);
2246     D->_13 = nine_DP4_row_col(L, 0, R, 2);
2247     D->_14 = nine_DP4_row_col(L, 0, R, 3);
2248 
2249     D->_21 = nine_DP4_row_col(L, 1, R, 0);
2250     D->_22 = nine_DP4_row_col(L, 1, R, 1);
2251     D->_23 = nine_DP4_row_col(L, 1, R, 2);
2252     D->_24 = nine_DP4_row_col(L, 1, R, 3);
2253 
2254     D->_31 = nine_DP4_row_col(L, 2, R, 0);
2255     D->_32 = nine_DP4_row_col(L, 2, R, 1);
2256     D->_33 = nine_DP4_row_col(L, 2, R, 2);
2257     D->_34 = nine_DP4_row_col(L, 2, R, 3);
2258 
2259     D->_41 = nine_DP4_row_col(L, 3, R, 0);
2260     D->_42 = nine_DP4_row_col(L, 3, R, 1);
2261     D->_43 = nine_DP4_row_col(L, 3, R, 2);
2262     D->_44 = nine_DP4_row_col(L, 3, R, 3);
2263 }
2264 
2265 void
nine_d3d_vector4_matrix_mul(D3DVECTOR * d,const D3DVECTOR * v,const D3DMATRIX * M)2266 nine_d3d_vector4_matrix_mul(D3DVECTOR *d, const D3DVECTOR *v, const D3DMATRIX *M)
2267 {
2268     d->x = nine_DP4_vec_col(v, M, 0);
2269     d->y = nine_DP4_vec_col(v, M, 1);
2270     d->z = nine_DP4_vec_col(v, M, 2);
2271 }
2272 
2273 void
nine_d3d_vector3_matrix_mul(D3DVECTOR * d,const D3DVECTOR * v,const D3DMATRIX * M)2274 nine_d3d_vector3_matrix_mul(D3DVECTOR *d, const D3DVECTOR *v, const D3DMATRIX *M)
2275 {
2276     d->x = nine_DP3_vec_col(v, M, 0);
2277     d->y = nine_DP3_vec_col(v, M, 1);
2278     d->z = nine_DP3_vec_col(v, M, 2);
2279 }
2280 
2281 void
nine_d3d_matrix_transpose(D3DMATRIX * D,const D3DMATRIX * M)2282 nine_d3d_matrix_transpose(D3DMATRIX *D, const D3DMATRIX *M)
2283 {
2284     unsigned i, j;
2285     for (i = 0; i < 4; ++i)
2286     for (j = 0; j < 4; ++j)
2287         D->m[i][j] = M->m[j][i];
2288 }
2289 
2290 #define _M_ADD_PROD_1i_2j_3k_4l(i,j,k,l) do {            \
2291     float t = M->_1##i * M->_2##j * M->_3##k * M->_4##l; \
2292     if (t > 0.0f) pos += t; else neg += t; } while(0)
2293 
2294 #define _M_SUB_PROD_1i_2j_3k_4l(i,j,k,l) do {            \
2295     float t = M->_1##i * M->_2##j * M->_3##k * M->_4##l; \
2296     if (t > 0.0f) neg -= t; else pos -= t; } while(0)
2297 float
nine_d3d_matrix_det(const D3DMATRIX * M)2298 nine_d3d_matrix_det(const D3DMATRIX *M)
2299 {
2300     float pos = 0.0f;
2301     float neg = 0.0f;
2302 
2303     _M_ADD_PROD_1i_2j_3k_4l(1, 2, 3, 4);
2304     _M_ADD_PROD_1i_2j_3k_4l(1, 3, 4, 2);
2305     _M_ADD_PROD_1i_2j_3k_4l(1, 4, 2, 3);
2306 
2307     _M_ADD_PROD_1i_2j_3k_4l(2, 1, 4, 3);
2308     _M_ADD_PROD_1i_2j_3k_4l(2, 3, 1, 4);
2309     _M_ADD_PROD_1i_2j_3k_4l(2, 4, 3, 1);
2310 
2311     _M_ADD_PROD_1i_2j_3k_4l(3, 1, 2, 4);
2312     _M_ADD_PROD_1i_2j_3k_4l(3, 2, 4, 1);
2313     _M_ADD_PROD_1i_2j_3k_4l(3, 4, 1, 2);
2314 
2315     _M_ADD_PROD_1i_2j_3k_4l(4, 1, 3, 2);
2316     _M_ADD_PROD_1i_2j_3k_4l(4, 2, 1, 3);
2317     _M_ADD_PROD_1i_2j_3k_4l(4, 3, 2, 1);
2318 
2319     _M_SUB_PROD_1i_2j_3k_4l(1, 2, 4, 3);
2320     _M_SUB_PROD_1i_2j_3k_4l(1, 3, 2, 4);
2321     _M_SUB_PROD_1i_2j_3k_4l(1, 4, 3, 2);
2322 
2323     _M_SUB_PROD_1i_2j_3k_4l(2, 1, 3, 4);
2324     _M_SUB_PROD_1i_2j_3k_4l(2, 3, 4, 1);
2325     _M_SUB_PROD_1i_2j_3k_4l(2, 4, 1, 3);
2326 
2327     _M_SUB_PROD_1i_2j_3k_4l(3, 1, 4, 2);
2328     _M_SUB_PROD_1i_2j_3k_4l(3, 2, 1, 4);
2329     _M_SUB_PROD_1i_2j_3k_4l(3, 4, 2, 1);
2330 
2331     _M_SUB_PROD_1i_2j_3k_4l(4, 1, 2, 3);
2332     _M_SUB_PROD_1i_2j_3k_4l(4, 2, 3, 1);
2333     _M_SUB_PROD_1i_2j_3k_4l(4, 3, 1, 2);
2334 
2335     return pos + neg;
2336 }
2337 
/* XXX: Probably better to just use src/mesa/math/m_matrix.c because
 * I have no idea where this code came from.
 */
/* Compute the inverse of the 4x4 matrix M into D.
 *
 * The 16 entries of D are first filled with signed sums of triple products
 * of M's entries (the cofactor terms of the inverse, not yet divided by the
 * determinant). The determinant is then obtained from column 0 of M and
 * row 0 of D, and every entry of D is scaled by its reciprocal.
 *
 * If |det| < 1e-30 the matrix is treated as not invertible and D receives a
 * plain copy of M instead.
 *
 * D must not be the same object as M: D is written while M is still being
 * read.
 *
 * In DEBUG builds the result is checked by multiplying D * M and logging a
 * message for every element that differs from the identity by more than
 * 1e-3.
 */
void
nine_d3d_matrix_inverse(D3DMATRIX *D, const D3DMATRIX *M)
{
    int i, k;
    float det;

    /* Row 0 of D: built from rows/columns of M excluding column 0. */
    D->m[0][0] =
        M->m[1][1] * M->m[2][2] * M->m[3][3] -
        M->m[1][1] * M->m[3][2] * M->m[2][3] -
        M->m[1][2] * M->m[2][1] * M->m[3][3] +
        M->m[1][2] * M->m[3][1] * M->m[2][3] +
        M->m[1][3] * M->m[2][1] * M->m[3][2] -
        M->m[1][3] * M->m[3][1] * M->m[2][2];

    D->m[0][1] =
       -M->m[0][1] * M->m[2][2] * M->m[3][3] +
        M->m[0][1] * M->m[3][2] * M->m[2][3] +
        M->m[0][2] * M->m[2][1] * M->m[3][3] -
        M->m[0][2] * M->m[3][1] * M->m[2][3] -
        M->m[0][3] * M->m[2][1] * M->m[3][2] +
        M->m[0][3] * M->m[3][1] * M->m[2][2];

    D->m[0][2] =
        M->m[0][1] * M->m[1][2] * M->m[3][3] -
        M->m[0][1] * M->m[3][2] * M->m[1][3] -
        M->m[0][2] * M->m[1][1] * M->m[3][3] +
        M->m[0][2] * M->m[3][1] * M->m[1][3] +
        M->m[0][3] * M->m[1][1] * M->m[3][2] -
        M->m[0][3] * M->m[3][1] * M->m[1][2];

    D->m[0][3] =
       -M->m[0][1] * M->m[1][2] * M->m[2][3] +
        M->m[0][1] * M->m[2][2] * M->m[1][3] +
        M->m[0][2] * M->m[1][1] * M->m[2][3] -
        M->m[0][2] * M->m[2][1] * M->m[1][3] -
        M->m[0][3] * M->m[1][1] * M->m[2][2] +
        M->m[0][3] * M->m[2][1] * M->m[1][2];

    /* Row 1 of D: built from M excluding column 1. */
    D->m[1][0] =
       -M->m[1][0] * M->m[2][2] * M->m[3][3] +
        M->m[1][0] * M->m[3][2] * M->m[2][3] +
        M->m[1][2] * M->m[2][0] * M->m[3][3] -
        M->m[1][2] * M->m[3][0] * M->m[2][3] -
        M->m[1][3] * M->m[2][0] * M->m[3][2] +
        M->m[1][3] * M->m[3][0] * M->m[2][2];

    D->m[1][1] =
        M->m[0][0] * M->m[2][2] * M->m[3][3] -
        M->m[0][0] * M->m[3][2] * M->m[2][3] -
        M->m[0][2] * M->m[2][0] * M->m[3][3] +
        M->m[0][2] * M->m[3][0] * M->m[2][3] +
        M->m[0][3] * M->m[2][0] * M->m[3][2] -
        M->m[0][3] * M->m[3][0] * M->m[2][2];

    D->m[1][2] =
       -M->m[0][0] * M->m[1][2] * M->m[3][3] +
        M->m[0][0] * M->m[3][2] * M->m[1][3] +
        M->m[0][2] * M->m[1][0] * M->m[3][3] -
        M->m[0][2] * M->m[3][0] * M->m[1][3] -
        M->m[0][3] * M->m[1][0] * M->m[3][2] +
        M->m[0][3] * M->m[3][0] * M->m[1][2];

    D->m[1][3] =
        M->m[0][0] * M->m[1][2] * M->m[2][3] -
        M->m[0][0] * M->m[2][2] * M->m[1][3] -
        M->m[0][2] * M->m[1][0] * M->m[2][3] +
        M->m[0][2] * M->m[2][0] * M->m[1][3] +
        M->m[0][3] * M->m[1][0] * M->m[2][2] -
        M->m[0][3] * M->m[2][0] * M->m[1][2];

    /* Row 2 of D: built from M excluding column 2. */
    D->m[2][0] =
        M->m[1][0] * M->m[2][1] * M->m[3][3] -
        M->m[1][0] * M->m[3][1] * M->m[2][3] -
        M->m[1][1] * M->m[2][0] * M->m[3][3] +
        M->m[1][1] * M->m[3][0] * M->m[2][3] +
        M->m[1][3] * M->m[2][0] * M->m[3][1] -
        M->m[1][3] * M->m[3][0] * M->m[2][1];

    D->m[2][1] =
       -M->m[0][0] * M->m[2][1] * M->m[3][3] +
        M->m[0][0] * M->m[3][1] * M->m[2][3] +
        M->m[0][1] * M->m[2][0] * M->m[3][3] -
        M->m[0][1] * M->m[3][0] * M->m[2][3] -
        M->m[0][3] * M->m[2][0] * M->m[3][1] +
        M->m[0][3] * M->m[3][0] * M->m[2][1];

    D->m[2][2] =
        M->m[0][0] * M->m[1][1] * M->m[3][3] -
        M->m[0][0] * M->m[3][1] * M->m[1][3] -
        M->m[0][1] * M->m[1][0] * M->m[3][3] +
        M->m[0][1] * M->m[3][0] * M->m[1][3] +
        M->m[0][3] * M->m[1][0] * M->m[3][1] -
        M->m[0][3] * M->m[3][0] * M->m[1][1];

    D->m[2][3] =
       -M->m[0][0] * M->m[1][1] * M->m[2][3] +
        M->m[0][0] * M->m[2][1] * M->m[1][3] +
        M->m[0][1] * M->m[1][0] * M->m[2][3] -
        M->m[0][1] * M->m[2][0] * M->m[1][3] -
        M->m[0][3] * M->m[1][0] * M->m[2][1] +
        M->m[0][3] * M->m[2][0] * M->m[1][1];

    /* Row 3 of D: built from M excluding column 3. */
    D->m[3][0] =
       -M->m[1][0] * M->m[2][1] * M->m[3][2] +
        M->m[1][0] * M->m[3][1] * M->m[2][2] +
        M->m[1][1] * M->m[2][0] * M->m[3][2] -
        M->m[1][1] * M->m[3][0] * M->m[2][2] -
        M->m[1][2] * M->m[2][0] * M->m[3][1] +
        M->m[1][2] * M->m[3][0] * M->m[2][1];

    D->m[3][1] =
        M->m[0][0] * M->m[2][1] * M->m[3][2] -
        M->m[0][0] * M->m[3][1] * M->m[2][2] -
        M->m[0][1] * M->m[2][0] * M->m[3][2] +
        M->m[0][1] * M->m[3][0] * M->m[2][2] +
        M->m[0][2] * M->m[2][0] * M->m[3][1] -
        M->m[0][2] * M->m[3][0] * M->m[2][1];

    D->m[3][2] =
       -M->m[0][0] * M->m[1][1] * M->m[3][2] +
        M->m[0][0] * M->m[3][1] * M->m[1][2] +
        M->m[0][1] * M->m[1][0] * M->m[3][2] -
        M->m[0][1] * M->m[3][0] * M->m[1][2] -
        M->m[0][2] * M->m[1][0] * M->m[3][1] +
        M->m[0][2] * M->m[3][0] * M->m[1][1];

    D->m[3][3] =
        M->m[0][0] * M->m[1][1] * M->m[2][2] -
        M->m[0][0] * M->m[2][1] * M->m[1][2] -
        M->m[0][1] * M->m[1][0] * M->m[2][2] +
        M->m[0][1] * M->m[2][0] * M->m[1][2] +
        M->m[0][2] * M->m[1][0] * M->m[2][1] -
        M->m[0][2] * M->m[2][0] * M->m[1][1];

    /* Determinant: column 0 of M dotted with the (still unscaled) row 0
     * of D. */
    det =
        M->m[0][0] * D->m[0][0] +
        M->m[1][0] * D->m[0][1] +
        M->m[2][0] * D->m[0][2] +
        M->m[3][0] * D->m[0][3];

    if (fabsf(det) < 1e-30) {/* not invertible */
        *D = *M; /* return the input unchanged; wine tests expect this */
        return;
    }

    /* Scale every entry by 1/det to obtain the inverse. */
    det = 1.0 / det;

    for (i = 0; i < 4; i++)
    for (k = 0; k < 4; k++)
        D->m[i][k] *= det;

#ifdef DEBUG
    {
        /* Sanity check: D * M should be the identity within 1e-3. */
        D3DMATRIX I;

        nine_d3d_matrix_matrix_mul(&I, D, M);

        for (i = 0; i < 4; ++i)
        for (k = 0; k < 4; ++k)
            if (fabsf(I.m[i][k] - (float)(i == k)) > 1e-3)
                DBG("Matrix inversion check FAILED !\n");
    }
#endif
}
2505