1 /*
2  * Copyright 2003 VMware, Inc.
3  * All Rights Reserved.
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * on the rights to use, copy, modify, merge, publish, distribute, sub
9  * license, and/or sell copies of the Software, and to permit persons to whom
10  * the Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
19  * VMWARE AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
23  *
24  * Authors:
25  *    Keith Whitwell <keithw@vmware.com>
26  */
27 
28 
29 #include "pipe/p_config.h"
30 #include "pipe/p_compiler.h"
31 #include "util/u_memory.h"
32 #include "util/u_math.h"
33 #include "util/u_format.h"
34 
35 #include "translate.h"
36 
37 
38 #if (defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)) && !defined(PIPE_SUBSYSTEM_EMBEDDED)
39 
40 #include "rtasm/rtasm_cpu.h"
41 #include "rtasm/rtasm_x86sse.h"
42 
43 
44 #define X    0
45 #define Y    1
46 #define Z    2
47 #define W    3
48 
49 
/* One source vertex buffer as seen by the generated code:
 * base pointer, per-vertex stride in bytes, and the largest index
 * that may be fetched (NOTE(review): presumably used to clamp
 * out-of-range indices — confirm in the run functions).
 */
struct translate_buffer
{
   const void *base_ptr;
   uintptr_t stride;            /* pointer-sized so generated code can treat it as one operand */
   unsigned max_index;
};
56 
/* A (buffer, instance divisor) pairing.  Several variants may reference
 * the same underlying buffer with different divisors.
 */
struct translate_buffer_variant
{
   unsigned buffer_index;       /* index into translate_sse::buffer[] */
   unsigned instance_divisor;   /* 0 = per-vertex data, else per-instance step */
   void *ptr;                   /* updated either per vertex or per instance */
};
63 
64 
65 #define ELEMENT_BUFFER_INSTANCE_ID  1001
66 
#define NUM_CONSTS 7

/* Indices into the consts[] table below — the two must stay in sync. */
enum
{
   CONST_IDENTITY,
   CONST_INV_127,
   CONST_INV_255,
   CONST_INV_32767,
   CONST_INV_65535,
   CONST_INV_2147483647,
   CONST_255
};

/* Broadcast a scalar into all four float lanes of an XMM-sized constant. */
#define C(v) {(float)(v), (float)(v), (float)(v), (float)(v)}

/* Read-only master table of SSE constants; NOTE(review): presumably
 * copied into translate_sse::consts so the generated code can address
 * it relative to machine_EDI (see get_const()) — confirm in the
 * creation function.  Declared const: it is never written.
 */
static const float consts[NUM_CONSTS][4] = {
   {0, 0, 0, 1},                /* CONST_IDENTITY: (0, 0, 0, 1) */
   C(1.0 / 127.0),
   C(1.0 / 255.0),
   C(1.0 / 32767.0),
   C(1.0 / 65535.0),
   C(1.0 / 2147483647.0),
   C(255.0)
};

#undef C
92 
/* State of the SSE/SSE2 run-time code generator behind the public
 * struct translate interface.  Generated code addresses this struct
 * through machine_EDI, so field offsets (computed with get_offset())
 * are baked into the emitted instructions.
 */
struct translate_sse
{
   struct translate translate;  /* public interface; NOTE(review): assumed
                                 * to be first so callers can cast — confirm */

   /* One generated function per entry point; 'func' points at the one
    * currently being assembled.
    */
   struct x86_function linear_func;
   struct x86_function elt_func;
   struct x86_function elt16_func;
   struct x86_function elt8_func;
   struct x86_function *func;

   /* 16-byte-aligned copy of the constant table, addressable from the
    * generated code relative to machine_EDI (see get_const()).
    */
     PIPE_ALIGN_VAR(16) float consts[NUM_CONSTS][4];
   /* Two-way map between XMM registers and cached constants; -1 = none.
    * Only XMM2..XMM7 are used as cache slots (see get_const()).
    */
   int8_t reg_to_const[16];
   int8_t const_to_reg[NUM_CONSTS];

   struct translate_buffer buffer[TRANSLATE_MAX_ATTRIBS];
   unsigned nr_buffers;

   /* Multiple buffer variants can map to a single buffer. */
   struct translate_buffer_variant buffer_variant[TRANSLATE_MAX_ATTRIBS];
   unsigned nr_buffer_variants;

   /* Multiple elements can map to a single buffer variant. */
   unsigned element_to_buffer_variant[TRANSLATE_MAX_ATTRIBS];

   boolean use_instancing;
   unsigned instance_id;
   unsigned start_instance;

   /* these are actually known values, but putting them in a struct
    * like this is helpful to keep them in sync across the file.
    */
   struct x86_reg tmp_EAX;
   struct x86_reg tmp2_EDX;
   struct x86_reg src_ECX;
   struct x86_reg idx_ESI;      /* either start+i or &elt[i] */
   struct x86_reg machine_EDI;  /* points at this struct at run time */
   struct x86_reg outbuf_EBX;
   struct x86_reg count_EBP;    /* decrements to zero */
};
132 
133 
/* Byte distance from 'a' to 'b' within the same object — used to turn
 * struct-member addresses into displacements off machine_EDI.
 */
static int
get_offset(const void *a, const void *b)
{
   const char *lo = (const char *) a;
   const char *hi = (const char *) b;

   return (int) (hi - lo);
}
139 
140 
/* Return an XMM register holding constant 'id', emitting a load from
 * p->consts (addressed relative to machine_EDI) if it is not already
 * cached.  Only XMM2..XMM7 serve as cache slots; XMM0/XMM1 stay free
 * as scratch for the emit_* helpers.
 */
static struct x86_reg
get_const(struct translate_sse *p, unsigned id)
{
   struct x86_reg reg;
   unsigned i;

   /* Already cached in a register? */
   if (p->const_to_reg[id] >= 0)
      return x86_make_reg(file_XMM, p->const_to_reg[id]);

   /* Find a free cache slot among XMM2..XMM7. */
   for (i = 2; i < 8; ++i) {
      if (p->reg_to_const[i] < 0)
         break;
   }

   /* TODO: be smarter here */
   if (i == 8)
      --i;                      /* all slots busy: evict XMM7 */

   reg = x86_make_reg(file_XMM, i);

   /* Unmap whatever constant the chosen register currently holds. */
   if (p->reg_to_const[i] >= 0)
      p->const_to_reg[p->reg_to_const[i]] = -1;

   p->reg_to_const[i] = id;
   p->const_to_reg[id] = i;

   /* TODO: this should happen outside the loop, if possible */
   sse_movaps(p->func, reg,
              x86_make_disp(p->machine_EDI,
                            get_offset(p, &p->consts[id][0])));

   return reg;
}
174 
175 
176 /* load the data in a SSE2 register, padding with zeros */
177 static boolean
emit_load_sse2(struct translate_sse * p,struct x86_reg data,struct x86_reg src,unsigned size)178 emit_load_sse2(struct translate_sse *p,
179                struct x86_reg data, struct x86_reg src, unsigned size)
180 {
181    struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
182    struct x86_reg tmp = p->tmp_EAX;
183    switch (size) {
184    case 1:
185       x86_movzx8(p->func, tmp, src);
186       sse2_movd(p->func, data, tmp);
187       break;
188    case 2:
189       x86_movzx16(p->func, tmp, src);
190       sse2_movd(p->func, data, tmp);
191       break;
192    case 3:
193       x86_movzx8(p->func, tmp, x86_make_disp(src, 2));
194       x86_shl_imm(p->func, tmp, 16);
195       x86_mov16(p->func, tmp, src);
196       sse2_movd(p->func, data, tmp);
197       break;
198    case 4:
199       sse2_movd(p->func, data, src);
200       break;
201    case 6:
202       sse2_movd(p->func, data, src);
203       x86_movzx16(p->func, tmp, x86_make_disp(src, 4));
204       sse2_movd(p->func, tmpXMM, tmp);
205       sse2_punpckldq(p->func, data, tmpXMM);
206       break;
207    case 8:
208       sse2_movq(p->func, data, src);
209       break;
210    case 12:
211       sse2_movq(p->func, data, src);
212       sse2_movd(p->func, tmpXMM, x86_make_disp(src, 8));
213       sse2_punpcklqdq(p->func, data, tmpXMM);
214       break;
215    case 16:
216       sse2_movdqu(p->func, data, src);
217       break;
218    default:
219       return FALSE;
220    }
221    return TRUE;
222 }
223 
224 
225 /* this value can be passed for the out_chans argument */
226 #define CHANNELS_0001 5
227 
228 
229 /* this function will load #chans float values, and will
230  * pad the register with zeroes at least up to out_chans.
231  *
232  * If out_chans is set to CHANNELS_0001, then the fourth
233  * value will be padded with 1. Only pass this value if
234  * chans < 4 or results are undefined.
235  */
static void
emit_load_float32(struct translate_sse *p, struct x86_reg data,
                  struct x86_reg arg0, unsigned out_chans, unsigned chans)
{
   switch (chans) {
   case 1:
      /* a 0 0 0
       * a 0 0 1
       */
      sse_movss(p->func, data, arg0);
      /* movss from memory zeroed lanes 1..3; OR with (0,0,0,1) sets w=1 */
      if (out_chans == CHANNELS_0001)
         sse_orps(p->func, data, get_const(p, CONST_IDENTITY));
      break;
   case 2:
      /* 0 0 0 1
       * a b 0 1
       */
      /* pre-fill the upper lanes with 0/1, then overwrite the low two */
      if (out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
                    SHUF(X, Y, Z, W));
      else if (out_chans > 2)
         sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY));
      sse_movlps(p->func, data, arg0);
      break;
   case 3:
      /* Have to jump through some hoops:
       *
       * c 0 0 0
       * c 0 0 1 if out_chans == CHANNELS_0001
       * 0 0 c 0/1
       * a b c 0/1
       */
      sse_movss(p->func, data, x86_make_disp(arg0, 8));
      if (out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
                    SHUF(X, Y, Z, W));
      /* rotate c into lane 2, then fill the low qword with a, b */
      sse_shufps(p->func, data, data, SHUF(Y, Z, X, W));
      sse_movlps(p->func, data, arg0);
      break;
   case 4:
      /* full unaligned 4-float load */
      sse_movups(p->func, data, arg0);
      break;
   }
}
280 
281 /* this function behaves like emit_load_float32, but loads
282    64-bit floating point numbers, converting them to 32-bit
283   ones */
static void
emit_load_float64to32(struct translate_sse *p, struct x86_reg data,
                      struct x86_reg arg0, unsigned out_chans, unsigned chans)
{
   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
   switch (chans) {
   case 1:
      sse2_movsd(p->func, data, arg0);
      /* packed cvtpd2ps clears the upper lanes, scalar cvtsd2ss keeps
       * them — so use the packed form whenever padding lanes matter
       */
      if (out_chans > 1)
         sse2_cvtpd2ps(p->func, data, data);
      else
         sse2_cvtsd2ss(p->func, data, data);
      if (out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
                    SHUF(X, Y, Z, W));
      break;
   case 2:
      sse2_movupd(p->func, data, arg0);
      sse2_cvtpd2ps(p->func, data, data);
      /* pad lanes 2,3 with 0,0 or 0,1 as requested */
      if (out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
                    SHUF(X, Y, Z, W));
      else if (out_chans > 2)
         sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY));
      break;
   case 3:
      /* convert (a,b) and c separately, then merge with movlhps */
      sse2_movupd(p->func, data, arg0);
      sse2_cvtpd2ps(p->func, data, data);
      sse2_movsd(p->func, tmpXMM, x86_make_disp(arg0, 16));
      if (out_chans > 3)
         sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
      else
         sse2_cvtsd2ss(p->func, tmpXMM, tmpXMM);
      sse_movlhps(p->func, data, tmpXMM);
      /* OR in (0,0,0,1) to set w = 1 */
      if (out_chans == CHANNELS_0001)
         sse_orps(p->func, data, get_const(p, CONST_IDENTITY));
      break;
   case 4:
      /* two packed double-pair conversions merged with movlhps */
      sse2_movupd(p->func, data, arg0);
      sse2_cvtpd2ps(p->func, data, data);
      sse2_movupd(p->func, tmpXMM, x86_make_disp(arg0, 16));
      sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
      sse_movlhps(p->func, data, tmpXMM);
      break;
   }
}
330 
331 
332 static void
emit_mov64(struct translate_sse * p,struct x86_reg dst_gpr,struct x86_reg dst_xmm,struct x86_reg src_gpr,struct x86_reg src_xmm)333 emit_mov64(struct translate_sse *p, struct x86_reg dst_gpr,
334            struct x86_reg dst_xmm, struct x86_reg src_gpr,
335            struct x86_reg src_xmm)
336 {
337    if (x86_target(p->func) != X86_32)
338       x64_mov64(p->func, dst_gpr, src_gpr);
339    else {
340       /* TODO: when/on which CPUs is SSE2 actually better than SSE? */
341       if (x86_target_caps(p->func) & X86_SSE2)
342          sse2_movq(p->func, dst_xmm, src_xmm);
343       else
344          sse_movlps(p->func, dst_xmm, src_xmm);
345    }
346 }
347 
348 
349 static void
emit_load64(struct translate_sse * p,struct x86_reg dst_gpr,struct x86_reg dst_xmm,struct x86_reg src)350 emit_load64(struct translate_sse *p, struct x86_reg dst_gpr,
351             struct x86_reg dst_xmm, struct x86_reg src)
352 {
353    emit_mov64(p, dst_gpr, dst_xmm, src, src);
354 }
355 
356 
357 static void
emit_store64(struct translate_sse * p,struct x86_reg dst,struct x86_reg src_gpr,struct x86_reg src_xmm)358 emit_store64(struct translate_sse *p, struct x86_reg dst,
359              struct x86_reg src_gpr, struct x86_reg src_xmm)
360 {
361    emit_mov64(p, dst, dst, src_gpr, src_xmm);
362 }
363 
364 
365 static void
emit_mov128(struct translate_sse * p,struct x86_reg dst,struct x86_reg src)366 emit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src)
367 {
368    if (x86_target_caps(p->func) & X86_SSE2)
369       sse2_movdqu(p->func, dst, src);
370    else
371       sse_movups(p->func, dst, src);
372 }
373 
374 
375 /* TODO: this uses unaligned accesses liberally, which is great on Nehalem,
376  * but may or may not be good on older processors
377  * TODO: may perhaps want to use non-temporal stores here if possible
378  */
/* Emit a fixed-size copy from 'src' to 'dst' using the widest moves the
 * target supports.  Clobbers EAX/EDX and XMM0/XMM1.
 */
static void
emit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_reg src,
            unsigned size)
{
   struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
   struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1);
   struct x86_reg dataGPR = p->tmp_EAX;
   struct x86_reg dataGPR2 = p->tmp2_EDX;

   if (size < 8) {
      /* Small copies through GPRs only.
       * NOTE(review): sizes 5 and 7 fall through this switch and emit
       * nothing — presumably never requested by callers; confirm.
       */
      switch (size) {
      case 1:
         x86_mov8(p->func, dataGPR, src);
         x86_mov8(p->func, dst, dataGPR);
         break;
      case 2:
         x86_mov16(p->func, dataGPR, src);
         x86_mov16(p->func, dst, dataGPR);
         break;
      case 3:
         /* word + trailing byte, via two scratch GPRs */
         x86_mov16(p->func, dataGPR, src);
         x86_mov8(p->func, dataGPR2, x86_make_disp(src, 2));
         x86_mov16(p->func, dst, dataGPR);
         x86_mov8(p->func, x86_make_disp(dst, 2), dataGPR2);
         break;
      case 4:
         x86_mov(p->func, dataGPR, src);
         x86_mov(p->func, dst, dataGPR);
         break;
      case 6:
         /* dword + trailing word */
         x86_mov(p->func, dataGPR, src);
         x86_mov16(p->func, dataGPR2, x86_make_disp(src, 4));
         x86_mov(p->func, dst, dataGPR);
         x86_mov16(p->func, x86_make_disp(dst, 4), dataGPR2);
         break;
      }
   }
   else if (!(x86_target_caps(p->func) & X86_SSE)) {
      /* No SSE at all: dword-at-a-time loop; size must be a multiple of 4 */
      unsigned i = 0;
      assert((size & 3) == 0);
      for (i = 0; i < size; i += 4) {
         x86_mov(p->func, dataGPR, x86_make_disp(src, i));
         x86_mov(p->func, x86_make_disp(dst, i), dataGPR);
      }
   }
   else {
      /* SSE available: 8/16-byte moves, combined for the larger sizes */
      switch (size) {
      case 8:
         emit_load64(p, dataGPR, dataXMM, src);
         emit_store64(p, dst, dataGPR, dataXMM);
         break;
      case 12:
         /* 8-byte move + trailing dword through a GPR */
         emit_load64(p, dataGPR2, dataXMM, src);
         x86_mov(p->func, dataGPR, x86_make_disp(src, 8));
         emit_store64(p, dst, dataGPR2, dataXMM);
         x86_mov(p->func, x86_make_disp(dst, 8), dataGPR);
         break;
      case 16:
         emit_mov128(p, dataXMM, src);
         emit_mov128(p, dst, dataXMM);
         break;
      case 24:
         /* 16-byte move + trailing 8-byte move */
         emit_mov128(p, dataXMM, src);
         emit_load64(p, dataGPR, dataXMM2, x86_make_disp(src, 16));
         emit_mov128(p, dst, dataXMM);
         emit_store64(p, x86_make_disp(dst, 16), dataGPR, dataXMM2);
         break;
      case 32:
         emit_mov128(p, dataXMM, src);
         emit_mov128(p, dataXMM2, x86_make_disp(src, 16));
         emit_mov128(p, dst, dataXMM);
         emit_mov128(p, x86_make_disp(dst, 16), dataXMM2);
         break;
      default:
         assert(0);
      }
   }
}
457 
458 static boolean
translate_attr_convert(struct translate_sse * p,const struct translate_element * a,struct x86_reg src,struct x86_reg dst)459 translate_attr_convert(struct translate_sse *p,
460                        const struct translate_element *a,
461                        struct x86_reg src, struct x86_reg dst)
462 {
463    const struct util_format_description *input_desc =
464       util_format_description(a->input_format);
465    const struct util_format_description *output_desc =
466       util_format_description(a->output_format);
467    unsigned i;
468    boolean id_swizzle = TRUE;
469    unsigned swizzle[4] =
470       { PIPE_SWIZZLE_NONE, PIPE_SWIZZLE_NONE,
471         PIPE_SWIZZLE_NONE, PIPE_SWIZZLE_NONE };
472    unsigned needed_chans = 0;
473    unsigned imms[2] = { 0, 0x3f800000 };
474 
475    if (a->output_format == PIPE_FORMAT_NONE
476        || a->input_format == PIPE_FORMAT_NONE)
477       return FALSE;
478 
479    if (input_desc->channel[0].size & 7)
480       return FALSE;
481 
482    if (input_desc->colorspace != output_desc->colorspace)
483       return FALSE;
484 
485    for (i = 1; i < input_desc->nr_channels; ++i) {
486       if (memcmp
487           (&input_desc->channel[i], &input_desc->channel[0],
488            sizeof(input_desc->channel[0])))
489          return FALSE;
490    }
491 
492    for (i = 1; i < output_desc->nr_channels; ++i) {
493       if (memcmp
494           (&output_desc->channel[i], &output_desc->channel[0],
495            sizeof(output_desc->channel[0]))) {
496          return FALSE;
497       }
498    }
499 
500    for (i = 0; i < output_desc->nr_channels; ++i) {
501       if (output_desc->swizzle[i] < 4)
502          swizzle[output_desc->swizzle[i]] = input_desc->swizzle[i];
503    }
504 
505    if ((x86_target_caps(p->func) & X86_SSE) &&
506        (0 || a->output_format == PIPE_FORMAT_R32_FLOAT
507         || a->output_format == PIPE_FORMAT_R32G32_FLOAT
508         || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT
509         || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT)) {
510       struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
511 
512       for (i = 0; i < output_desc->nr_channels; ++i) {
513          if (swizzle[i] == PIPE_SWIZZLE_0
514              && i >= input_desc->nr_channels)
515             swizzle[i] = i;
516       }
517 
518       for (i = 0; i < output_desc->nr_channels; ++i) {
519          if (swizzle[i] < 4)
520             needed_chans = MAX2(needed_chans, swizzle[i] + 1);
521          if (swizzle[i] < PIPE_SWIZZLE_0 && swizzle[i] != i)
522             id_swizzle = FALSE;
523       }
524 
525       if (needed_chans > 0) {
526          switch (input_desc->channel[0].type) {
527          case UTIL_FORMAT_TYPE_UNSIGNED:
528             if (!(x86_target_caps(p->func) & X86_SSE2))
529                return FALSE;
530             emit_load_sse2(p, dataXMM, src,
531                            input_desc->channel[0].size *
532                            input_desc->nr_channels >> 3);
533 
534             /* TODO: add support for SSE4.1 pmovzx */
535             switch (input_desc->channel[0].size) {
536             case 8:
537                /* TODO: this may be inefficient due to get_identity() being
538                 *  used both as a float and integer register.
539                 */
540                sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
541                sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
542                break;
543             case 16:
544                sse2_punpcklwd(p->func, dataXMM, get_const(p, CONST_IDENTITY));
545                break;
546             case 32:           /* we lose precision here */
547                sse2_psrld_imm(p->func, dataXMM, 1);
548                break;
549             default:
550                return FALSE;
551             }
552             sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
553             if (input_desc->channel[0].normalized) {
554                struct x86_reg factor;
555                switch (input_desc->channel[0].size) {
556                case 8:
557                   factor = get_const(p, CONST_INV_255);
558                   break;
559                case 16:
560                   factor = get_const(p, CONST_INV_65535);
561                   break;
562                case 32:
563                   factor = get_const(p, CONST_INV_2147483647);
564                   break;
565                default:
566                   assert(0);
567                   factor.disp = 0;
568                   factor.file = 0;
569                   factor.idx = 0;
570                   factor.mod = 0;
571                   break;
572                }
573                sse_mulps(p->func, dataXMM, factor);
574             }
575             else if (input_desc->channel[0].size == 32)
576                /* compensate for the bit we threw away to fit u32 into s32 */
577                sse_addps(p->func, dataXMM, dataXMM);
578             break;
579          case UTIL_FORMAT_TYPE_SIGNED:
580             if (!(x86_target_caps(p->func) & X86_SSE2))
581                return FALSE;
582             emit_load_sse2(p, dataXMM, src,
583                            input_desc->channel[0].size *
584                            input_desc->nr_channels >> 3);
585 
586             /* TODO: add support for SSE4.1 pmovsx */
587             switch (input_desc->channel[0].size) {
588             case 8:
589                sse2_punpcklbw(p->func, dataXMM, dataXMM);
590                sse2_punpcklbw(p->func, dataXMM, dataXMM);
591                sse2_psrad_imm(p->func, dataXMM, 24);
592                break;
593             case 16:
594                sse2_punpcklwd(p->func, dataXMM, dataXMM);
595                sse2_psrad_imm(p->func, dataXMM, 16);
596                break;
597             case 32:           /* we lose precision here */
598                break;
599             default:
600                return FALSE;
601             }
602             sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
603             if (input_desc->channel[0].normalized) {
604                struct x86_reg factor;
605                switch (input_desc->channel[0].size) {
606                case 8:
607                   factor = get_const(p, CONST_INV_127);
608                   break;
609                case 16:
610                   factor = get_const(p, CONST_INV_32767);
611                   break;
612                case 32:
613                   factor = get_const(p, CONST_INV_2147483647);
614                   break;
615                default:
616                   assert(0);
617                   factor.disp = 0;
618                   factor.file = 0;
619                   factor.idx = 0;
620                   factor.mod = 0;
621                   break;
622                }
623                sse_mulps(p->func, dataXMM, factor);
624             }
625             break;
626 
627             break;
628          case UTIL_FORMAT_TYPE_FLOAT:
629             if (input_desc->channel[0].size != 32
630                 && input_desc->channel[0].size != 64) {
631                return FALSE;
632             }
633             if (swizzle[3] == PIPE_SWIZZLE_1
634                 && input_desc->nr_channels <= 3) {
635                swizzle[3] = PIPE_SWIZZLE_W;
636                needed_chans = CHANNELS_0001;
637             }
638             switch (input_desc->channel[0].size) {
639             case 32:
640                emit_load_float32(p, dataXMM, src, needed_chans,
641                                  input_desc->nr_channels);
642                break;
643             case 64:           /* we lose precision here */
644                if (!(x86_target_caps(p->func) & X86_SSE2))
645                   return FALSE;
646                emit_load_float64to32(p, dataXMM, src, needed_chans,
647                                      input_desc->nr_channels);
648                break;
649             default:
650                return FALSE;
651             }
652             break;
653          default:
654             return FALSE;
655          }
656 
657          if (!id_swizzle) {
658             sse_shufps(p->func, dataXMM, dataXMM,
659                        SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3]));
660          }
661       }
662 
663       if (output_desc->nr_channels >= 4
664           && swizzle[0] < PIPE_SWIZZLE_0
665           && swizzle[1] < PIPE_SWIZZLE_0
666           && swizzle[2] < PIPE_SWIZZLE_0
667           && swizzle[3] < PIPE_SWIZZLE_0) {
668          sse_movups(p->func, dst, dataXMM);
669       }
670       else {
671          if (output_desc->nr_channels >= 2
672              && swizzle[0] < PIPE_SWIZZLE_0
673              && swizzle[1] < PIPE_SWIZZLE_0) {
674             sse_movlps(p->func, dst, dataXMM);
675          }
676          else {
677             if (swizzle[0] < PIPE_SWIZZLE_0) {
678                sse_movss(p->func, dst, dataXMM);
679             }
680             else {
681                x86_mov_imm(p->func, dst,
682                            imms[swizzle[0] - PIPE_SWIZZLE_0]);
683             }
684 
685             if (output_desc->nr_channels >= 2) {
686                if (swizzle[1] < PIPE_SWIZZLE_0) {
687                   sse_shufps(p->func, dataXMM, dataXMM, SHUF(1, 1, 2, 3));
688                   sse_movss(p->func, x86_make_disp(dst, 4), dataXMM);
689                }
690                else {
691                   x86_mov_imm(p->func, x86_make_disp(dst, 4),
692                               imms[swizzle[1] - PIPE_SWIZZLE_0]);
693                }
694             }
695          }
696 
697          if (output_desc->nr_channels >= 3) {
698             if (output_desc->nr_channels >= 4
699                 && swizzle[2] < PIPE_SWIZZLE_0
700                 && swizzle[3] < PIPE_SWIZZLE_0) {
701                sse_movhps(p->func, x86_make_disp(dst, 8), dataXMM);
702             }
703             else {
704                if (swizzle[2] < PIPE_SWIZZLE_0) {
705                   sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 2, 2, 3));
706                   sse_movss(p->func, x86_make_disp(dst, 8), dataXMM);
707                }
708                else {
709                   x86_mov_imm(p->func, x86_make_disp(dst, 8),
710                               imms[swizzle[2] - PIPE_SWIZZLE_0]);
711                }
712 
713                if (output_desc->nr_channels >= 4) {
714                   if (swizzle[3] < PIPE_SWIZZLE_0) {
715                      sse_shufps(p->func, dataXMM, dataXMM, SHUF(3, 3, 3, 3));
716                      sse_movss(p->func, x86_make_disp(dst, 12), dataXMM);
717                   }
718                   else {
719                      x86_mov_imm(p->func, x86_make_disp(dst, 12),
720                                  imms[swizzle[3] - PIPE_SWIZZLE_0]);
721                   }
722                }
723             }
724          }
725       }
726       return TRUE;
727    }
728    else if ((x86_target_caps(p->func) & X86_SSE2)
729             && input_desc->channel[0].size == 8
730             && output_desc->channel[0].size == 16
731             && output_desc->channel[0].normalized ==
732             input_desc->channel[0].normalized &&
733             (0 || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED
734                    && output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED)
735              || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED
736                  && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
737              || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED
738                  && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED))) {
739       struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
740       struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
741       struct x86_reg tmp = p->tmp_EAX;
742       unsigned imms[2] = { 0, 1 };
743 
744       for (i = 0; i < output_desc->nr_channels; ++i) {
745          if (swizzle[i] == PIPE_SWIZZLE_0
746              && i >= input_desc->nr_channels) {
747             swizzle[i] = i;
748          }
749       }
750 
751       for (i = 0; i < output_desc->nr_channels; ++i) {
752          if (swizzle[i] < 4)
753             needed_chans = MAX2(needed_chans, swizzle[i] + 1);
754          if (swizzle[i] < PIPE_SWIZZLE_0 && swizzle[i] != i)
755             id_swizzle = FALSE;
756       }
757 
758       if (needed_chans > 0) {
759          emit_load_sse2(p, dataXMM, src,
760                         input_desc->channel[0].size *
761                         input_desc->nr_channels >> 3);
762 
763          switch (input_desc->channel[0].type) {
764          case UTIL_FORMAT_TYPE_UNSIGNED:
765             if (input_desc->channel[0].normalized) {
766                sse2_punpcklbw(p->func, dataXMM, dataXMM);
767                if (output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
768                   sse2_psrlw_imm(p->func, dataXMM, 1);
769             }
770             else
771                sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
772             break;
773          case UTIL_FORMAT_TYPE_SIGNED:
774             if (input_desc->channel[0].normalized) {
775                sse2_movq(p->func, tmpXMM, get_const(p, CONST_IDENTITY));
776                sse2_punpcklbw(p->func, tmpXMM, dataXMM);
777                sse2_psllw_imm(p->func, dataXMM, 9);
778                sse2_psrlw_imm(p->func, dataXMM, 8);
779                sse2_por(p->func, tmpXMM, dataXMM);
780                sse2_psrlw_imm(p->func, dataXMM, 7);
781                sse2_por(p->func, tmpXMM, dataXMM);
782                {
783                   struct x86_reg t = dataXMM;
784                   dataXMM = tmpXMM;
785                   tmpXMM = t;
786                }
787             }
788             else {
789                sse2_punpcklbw(p->func, dataXMM, dataXMM);
790                sse2_psraw_imm(p->func, dataXMM, 8);
791             }
792             break;
793          default:
794             assert(0);
795          }
796 
797          if (output_desc->channel[0].normalized)
798             imms[1] =
799                (output_desc->channel[0].type ==
800                 UTIL_FORMAT_TYPE_UNSIGNED) ? 0xffff : 0x7ffff;
801 
802          if (!id_swizzle)
803             sse2_pshuflw(p->func, dataXMM, dataXMM,
804                          (swizzle[0] & 3) | ((swizzle[1] & 3) << 2) |
805                          ((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6));
806       }
807 
808       if (output_desc->nr_channels >= 4
809           && swizzle[0] < PIPE_SWIZZLE_0
810           && swizzle[1] < PIPE_SWIZZLE_0
811           && swizzle[2] < PIPE_SWIZZLE_0
812           && swizzle[3] < PIPE_SWIZZLE_0) {
813          sse2_movq(p->func, dst, dataXMM);
814       }
815       else {
816          if (swizzle[0] < PIPE_SWIZZLE_0) {
817             if (output_desc->nr_channels >= 2
818                 && swizzle[1] < PIPE_SWIZZLE_0) {
819                sse2_movd(p->func, dst, dataXMM);
820             }
821             else {
822                sse2_movd(p->func, tmp, dataXMM);
823                x86_mov16(p->func, dst, tmp);
824                if (output_desc->nr_channels >= 2)
825                   x86_mov16_imm(p->func, x86_make_disp(dst, 2),
826                                 imms[swizzle[1] - PIPE_SWIZZLE_0]);
827             }
828          }
829          else {
830             if (output_desc->nr_channels >= 2
831                 && swizzle[1] >= PIPE_SWIZZLE_0) {
832                x86_mov_imm(p->func, dst,
833                            (imms[swizzle[1] - PIPE_SWIZZLE_0] << 16) |
834                            imms[swizzle[0] - PIPE_SWIZZLE_0]);
835             }
836             else {
837                x86_mov16_imm(p->func, dst,
838                              imms[swizzle[0] - PIPE_SWIZZLE_0]);
839                if (output_desc->nr_channels >= 2) {
840                   sse2_movd(p->func, tmp, dataXMM);
841                   x86_shr_imm(p->func, tmp, 16);
842                   x86_mov16(p->func, x86_make_disp(dst, 2), tmp);
843                }
844             }
845          }
846 
847          if (output_desc->nr_channels >= 3) {
848             if (swizzle[2] < PIPE_SWIZZLE_0) {
849                if (output_desc->nr_channels >= 4
850                    && swizzle[3] < PIPE_SWIZZLE_0) {
851                   sse2_psrlq_imm(p->func, dataXMM, 32);
852                   sse2_movd(p->func, x86_make_disp(dst, 4), dataXMM);
853                }
854                else {
855                   sse2_psrlq_imm(p->func, dataXMM, 32);
856                   sse2_movd(p->func, tmp, dataXMM);
857                   x86_mov16(p->func, x86_make_disp(dst, 4), tmp);
858                   if (output_desc->nr_channels >= 4) {
859                      x86_mov16_imm(p->func, x86_make_disp(dst, 6),
860                                    imms[swizzle[3] - PIPE_SWIZZLE_0]);
861                   }
862                }
863             }
864             else {
865                if (output_desc->nr_channels >= 4
866                    && swizzle[3] >= PIPE_SWIZZLE_0) {
867                   x86_mov_imm(p->func, x86_make_disp(dst, 4),
868                               (imms[swizzle[3] - PIPE_SWIZZLE_0] << 16)
869                               | imms[swizzle[2] - PIPE_SWIZZLE_0]);
870                }
871                else {
872                   x86_mov16_imm(p->func, x86_make_disp(dst, 4),
873                                 imms[swizzle[2] - PIPE_SWIZZLE_0]);
874 
875                   if (output_desc->nr_channels >= 4) {
876                      sse2_psrlq_imm(p->func, dataXMM, 48);
877                      sse2_movd(p->func, tmp, dataXMM);
878                      x86_mov16(p->func, x86_make_disp(dst, 6), tmp);
879                   }
880                }
881             }
882          }
883       }
884       return TRUE;
885    }
886    else if (!memcmp(&output_desc->channel[0], &input_desc->channel[0],
887                     sizeof(output_desc->channel[0]))) {
888       struct x86_reg tmp = p->tmp_EAX;
889       unsigned i;
890 
891       if (input_desc->channel[0].size == 8 && input_desc->nr_channels == 4
892           && output_desc->nr_channels == 4
893           && swizzle[0] == PIPE_SWIZZLE_W
894           && swizzle[1] == PIPE_SWIZZLE_Z
895           && swizzle[2] == PIPE_SWIZZLE_Y
896           && swizzle[3] == PIPE_SWIZZLE_X) {
897          /* TODO: support movbe */
898          x86_mov(p->func, tmp, src);
899          x86_bswap(p->func, tmp);
900          x86_mov(p->func, dst, tmp);
901          return TRUE;
902       }
903 
904       for (i = 0; i < output_desc->nr_channels; ++i) {
905          switch (output_desc->channel[0].size) {
906          case 8:
907             if (swizzle[i] >= PIPE_SWIZZLE_0) {
908                unsigned v = 0;
909                if (swizzle[i] == PIPE_SWIZZLE_1) {
910                   switch (output_desc->channel[0].type) {
911                   case UTIL_FORMAT_TYPE_UNSIGNED:
912                      v = output_desc->channel[0].normalized ? 0xff : 1;
913                      break;
914                   case UTIL_FORMAT_TYPE_SIGNED:
915                      v = output_desc->channel[0].normalized ? 0x7f : 1;
916                      break;
917                   default:
918                      return FALSE;
919                   }
920                }
921                x86_mov8_imm(p->func, x86_make_disp(dst, i * 1), v);
922             }
923             else {
924                x86_mov8(p->func, tmp, x86_make_disp(src, swizzle[i] * 1));
925                x86_mov8(p->func, x86_make_disp(dst, i * 1), tmp);
926             }
927             break;
928          case 16:
929             if (swizzle[i] >= PIPE_SWIZZLE_0) {
930                unsigned v = 0;
931                if (swizzle[i] == PIPE_SWIZZLE_1) {
932                   switch (output_desc->channel[1].type) {
933                   case UTIL_FORMAT_TYPE_UNSIGNED:
934                      v = output_desc->channel[1].normalized ? 0xffff : 1;
935                      break;
936                   case UTIL_FORMAT_TYPE_SIGNED:
937                      v = output_desc->channel[1].normalized ? 0x7fff : 1;
938                      break;
939                   case UTIL_FORMAT_TYPE_FLOAT:
940                      v = 0x3c00;
941                      break;
942                   default:
943                      return FALSE;
944                   }
945                }
946                x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), v);
947             }
948             else if (swizzle[i] == PIPE_SWIZZLE_0) {
949                x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), 0);
950             }
951             else {
952                x86_mov16(p->func, tmp, x86_make_disp(src, swizzle[i] * 2));
953                x86_mov16(p->func, x86_make_disp(dst, i * 2), tmp);
954             }
955             break;
956          case 32:
957             if (swizzle[i] >= PIPE_SWIZZLE_0) {
958                unsigned v = 0;
959                if (swizzle[i] == PIPE_SWIZZLE_1) {
960                   switch (output_desc->channel[1].type) {
961                   case UTIL_FORMAT_TYPE_UNSIGNED:
962                      v = output_desc->channel[1].normalized ? 0xffffffff : 1;
963                      break;
964                   case UTIL_FORMAT_TYPE_SIGNED:
965                      v = output_desc->channel[1].normalized ? 0x7fffffff : 1;
966                      break;
967                   case UTIL_FORMAT_TYPE_FLOAT:
968                      v = 0x3f800000;
969                      break;
970                   default:
971                      return FALSE;
972                   }
973                }
974                x86_mov_imm(p->func, x86_make_disp(dst, i * 4), v);
975             }
976             else {
977                x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 4));
978                x86_mov(p->func, x86_make_disp(dst, i * 4), tmp);
979             }
980             break;
981          case 64:
982             if (swizzle[i] >= PIPE_SWIZZLE_0) {
983                unsigned l = 0;
984                unsigned h = 0;
985                if (swizzle[i] == PIPE_SWIZZLE_1) {
986                   switch (output_desc->channel[1].type) {
987                   case UTIL_FORMAT_TYPE_UNSIGNED:
988                      h = output_desc->channel[1].normalized ? 0xffffffff : 0;
989                      l = output_desc->channel[1].normalized ? 0xffffffff : 1;
990                      break;
991                   case UTIL_FORMAT_TYPE_SIGNED:
992                      h = output_desc->channel[1].normalized ? 0x7fffffff : 0;
993                      l = output_desc->channel[1].normalized ? 0xffffffff : 1;
994                      break;
995                   case UTIL_FORMAT_TYPE_FLOAT:
996                      h = 0x3ff00000;
997                      l = 0;
998                      break;
999                   default:
1000                      return FALSE;
1001                   }
1002                }
1003                x86_mov_imm(p->func, x86_make_disp(dst, i * 8), l);
1004                x86_mov_imm(p->func, x86_make_disp(dst, i * 8 + 4), h);
1005             }
1006             else {
1007                if (x86_target_caps(p->func) & X86_SSE) {
1008                   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 0);
1009                   emit_load64(p, tmp, tmpXMM,
1010                               x86_make_disp(src, swizzle[i] * 8));
1011                   emit_store64(p, x86_make_disp(dst, i * 8), tmp, tmpXMM);
1012                }
1013                else {
1014                   x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8));
1015                   x86_mov(p->func, x86_make_disp(dst, i * 8), tmp);
1016                   x86_mov(p->func, tmp,
1017                           x86_make_disp(src, swizzle[i] * 8 + 4));
1018                   x86_mov(p->func, x86_make_disp(dst, i * 8 + 4), tmp);
1019                }
1020             }
1021             break;
1022          default:
1023             return FALSE;
1024          }
1025       }
1026       return TRUE;
1027    }
1028    /* special case for draw's EMIT_4UB (RGBA) and EMIT_4UB_BGRA */
1029    else if ((x86_target_caps(p->func) & X86_SSE2) &&
1030             a->input_format == PIPE_FORMAT_R32G32B32A32_FLOAT &&
1031             (0 || a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM
1032              || a-> output_format == PIPE_FORMAT_R8G8B8A8_UNORM)) {
1033       struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
1034 
1035       /* load */
1036       sse_movups(p->func, dataXMM, src);
1037 
1038       if (a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM) {
1039          sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 1, 0, 3));
1040       }
1041 
1042       /* scale by 255.0 */
1043       sse_mulps(p->func, dataXMM, get_const(p, CONST_255));
1044 
1045       /* pack and emit */
1046       sse2_cvtps2dq(p->func, dataXMM, dataXMM);
1047       sse2_packssdw(p->func, dataXMM, dataXMM);
1048       sse2_packuswb(p->func, dataXMM, dataXMM);
1049       sse2_movd(p->func, dst, dataXMM);
1050 
1051       return TRUE;
1052    }
1053 
1054    return FALSE;
1055 }
1056 
1057 
1058 static boolean
translate_attr(struct translate_sse * p,const struct translate_element * a,struct x86_reg src,struct x86_reg dst)1059 translate_attr(struct translate_sse *p,
1060                const struct translate_element *a,
1061                struct x86_reg src, struct x86_reg dst)
1062 {
1063    if (a->input_format == a->output_format) {
1064       emit_memcpy(p, dst, src, util_format_get_stride(a->input_format, 1));
1065       return TRUE;
1066    }
1067 
1068    return translate_attr_convert(p, a, src, dst);
1069 }
1070 
1071 
/*
 * Emit the one-time setup code that computes the initial source pointer
 * for each buffer variant before entering the vertex loop.  This is
 * only needed for linear (non-indexed) fetches and for instanced
 * arrays; indexed, non-instanced buffers compute their pointer per
 * element in get_buffer_ptr() instead.
 */
static boolean
init_inputs(struct translate_sse *p, unsigned index_size)
{
   unsigned i;
   /* Memory operands for fields of the translate machine, addressed
    * relative to the struct pointer held in machine_EDI.
    */
   struct x86_reg instance_id =
      x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id));
   struct x86_reg start_instance =
      x86_make_disp(p->machine_EDI, get_offset(p, &p->start_instance));

   for (i = 0; i < p->nr_buffer_variants; i++) {
      struct translate_buffer_variant *variant = &p->buffer_variant[i];
      struct translate_buffer *buffer = &p->buffer[variant->buffer_index];

      if (!index_size || variant->instance_divisor) {
         struct x86_reg buf_max_index =
            x86_make_disp(p->machine_EDI, get_offset(p, &buffer->max_index));
         struct x86_reg buf_stride =
            x86_make_disp(p->machine_EDI, get_offset(p, &buffer->stride));
         struct x86_reg buf_ptr =
            x86_make_disp(p->machine_EDI, get_offset(p, &variant->ptr));
         struct x86_reg buf_base_ptr =
            x86_make_disp(p->machine_EDI, get_offset(p, &buffer->base_ptr));
         struct x86_reg elt = p->idx_ESI;
         struct x86_reg tmp_EAX = p->tmp_EAX;

         /* Calculate pointer to first attrib:
          *   base_ptr + stride * index, where index depends on instance divisor
          */
         if (variant->instance_divisor) {
            struct x86_reg tmp_EDX = p->tmp2_EDX;

            /* Start with instance = instance_id
             * which is true if divisor is 1.
             */
            x86_mov(p->func, tmp_EAX, instance_id);

            if (variant->instance_divisor != 1) {
               struct x86_reg tmp_ECX = p->src_ECX;

               /* TODO: Add x86_shr() to rtasm and use it whenever
                *       instance divisor is power of two.
                */
               /* Zero EDX so the unsigned divide sees EDX:EAX = instance_id. */
               x86_xor(p->func, tmp_EDX, tmp_EDX);
               x86_mov_reg_imm(p->func, tmp_ECX, variant->instance_divisor);
               x86_div(p->func, tmp_ECX);       /* EAX = EDX:EAX / ECX */
            }

            /* instance = (instance_id / divisor) + start_instance
             */
            x86_mov(p->func, tmp_EDX, start_instance);
            x86_add(p->func, tmp_EAX, tmp_EDX);

            /* XXX we need to clamp the index here too, but to a
             * per-array max value, not the draw->pt.max_index value
             * that's being given to us via translate->set_buffer().
             */
         }
         else {
            /* Linear fetch: start from the first element index in ESI. */
            x86_mov(p->func, tmp_EAX, elt);

            /* Clamp to max_index
             */
            x86_cmp(p->func, tmp_EAX, buf_max_index);
            x86_cmovcc(p->func, tmp_EAX, buf_max_index, cc_AE);
         }

         /* ptr = base_ptr + index * stride.  The multiply and add are
          * REX.W-prefixed so the result is a full 64-bit pointer on
          * x86-64 (the prefix emission is a no-op on 32-bit).
          */
         x86_mov(p->func, p->tmp2_EDX, buf_stride);
         x64_rexw(p->func);
         x86_imul(p->func, tmp_EAX, p->tmp2_EDX);
         x64_rexw(p->func);
         x86_add(p->func, tmp_EAX, buf_base_ptr);

         /* NOTE(review): the flags set by this compare are not consumed
          * before the next cmp/jcc emitted elsewhere — looks vestigial;
          * confirm against history before removing.
          */
         x86_cmp(p->func, p->count_EBP, p->tmp_EAX);

         /* In the linear case, keep the buffer pointer instead of the
          * index number.
          */
         if (!index_size && p->nr_buffer_variants == 1) {
            x64_rexw(p->func);
            x86_mov(p->func, elt, tmp_EAX);
         }
         else {
            x64_rexw(p->func);
            x86_mov(p->func, buf_ptr, tmp_EAX);
         }
      }
   }

   return TRUE;
}
1162 
1163 
/*
 * Return an x86 operand through which the current input element for
 * buffer variant 'var_idx' can be addressed.  Depending on the fetch
 * mode this is a memory operand into the machine struct (instance id),
 * the precomputed pointer kept in idx_ESI, or a pointer freshly
 * computed into ECX from the element index 'elt'.
 */
static struct x86_reg
get_buffer_ptr(struct translate_sse *p,
               unsigned index_size, unsigned var_idx, struct x86_reg elt)
{
   if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) {
      /* Pseudo element: source the stored instance_id field directly. */
      return x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id));
   }
   if (!index_size && p->nr_buffer_variants == 1) {
      /* Linear fetch, single variant: idx_ESI already holds the buffer
       * pointer (set up by init_inputs(), advanced by incr_inputs()).
       */
      return p->idx_ESI;
   }
   else if (!index_size || p->buffer_variant[var_idx].instance_divisor) {
      /* Pointer was precomputed per vertex/instance by init_inputs();
       * just load it from the machine struct.
       */
      struct x86_reg ptr = p->src_ECX;
      struct x86_reg buf_ptr =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer_variant[var_idx].ptr));

      x64_rexw(p->func);
      x86_mov(p->func, ptr, buf_ptr);
      return ptr;
   }
   else {
      /* Indexed fetch: compute base_ptr + clamped(index) * stride. */
      struct x86_reg ptr = p->src_ECX;
      const struct translate_buffer_variant *variant =
         &p->buffer_variant[var_idx];
      struct x86_reg buf_stride =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[variant->buffer_index].stride));
      struct x86_reg buf_base_ptr =
         x86_make_disp(p->machine_EDI,
                  get_offset(p, &p->buffer[variant->buffer_index].base_ptr));
      struct x86_reg buf_max_index =
         x86_make_disp(p->machine_EDI,
                  get_offset(p, &p->buffer[variant->buffer_index].max_index));

      /* Calculate pointer to current attrib:
       */
      /* Load the element index, zero-extending 8/16-bit indices. */
      switch (index_size) {
      case 1:
         x86_movzx8(p->func, ptr, elt);
         break;
      case 2:
         x86_movzx16(p->func, ptr, elt);
         break;
      case 4:
         x86_mov(p->func, ptr, elt);
         break;
      }

      /* Clamp to max_index
       */
      x86_cmp(p->func, ptr, buf_max_index);
      x86_cmovcc(p->func, ptr, buf_max_index, cc_AE);

      /* REX.W-prefixed multiply/add so the pointer math is 64-bit wide
       * on x86-64 (no-op on 32-bit).
       */
      x86_mov(p->func, p->tmp2_EDX, buf_stride);
      x64_rexw(p->func);
      x86_imul(p->func, ptr, p->tmp2_EDX);
      x64_rexw(p->func);
      x86_add(p->func, ptr, buf_base_ptr);
      return ptr;
   }
}
1225 
1226 
1227 static boolean
incr_inputs(struct translate_sse * p,unsigned index_size)1228 incr_inputs(struct translate_sse *p, unsigned index_size)
1229 {
1230    if (!index_size && p->nr_buffer_variants == 1) {
1231       const unsigned buffer_index = p->buffer_variant[0].buffer_index;
1232       struct x86_reg stride =
1233          x86_make_disp(p->machine_EDI,
1234                        get_offset(p, &p->buffer[buffer_index].stride));
1235 
1236       if (p->buffer_variant[0].instance_divisor == 0) {
1237          x64_rexw(p->func);
1238          x86_add(p->func, p->idx_ESI, stride);
1239          sse_prefetchnta(p->func, x86_make_disp(p->idx_ESI, 192));
1240       }
1241    }
1242    else if (!index_size) {
1243       unsigned i;
1244 
1245       /* Is this worthwhile??
1246        */
1247       for (i = 0; i < p->nr_buffer_variants; i++) {
1248          struct translate_buffer_variant *variant = &p->buffer_variant[i];
1249          struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI,
1250                                                 get_offset(p, &variant->ptr));
1251       struct x86_reg buf_stride =
1252          x86_make_disp(p->machine_EDI,
1253                        get_offset(p, &p->buffer[variant->buffer_index].stride));
1254 
1255          if (variant->instance_divisor == 0) {
1256             x86_mov(p->func, p->tmp_EAX, buf_stride);
1257             x64_rexw(p->func);
1258             x86_add(p->func, p->tmp_EAX, buf_ptr);
1259             if (i == 0)
1260                sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
1261             x64_rexw(p->func);
1262             x86_mov(p->func, buf_ptr, p->tmp_EAX);
1263          }
1264       }
1265    }
1266    else {
1267       x64_rexw(p->func);
1268       x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size));
1269    }
1270 
1271    return TRUE;
1272 }
1273 
1274 
1275 /* Build run( struct translate *machine,
1276  *            unsigned start,
1277  *            unsigned count,
1278  *            void *output_buffer )
1279  * or
1280  *  run_elts( struct translate *machine,
1281  *            unsigned *elts,
1282  *            unsigned count,
1283  *            void *output_buffer )
1284  *
1285  *  Lots of hardcoding
1286  *
1287  * EAX -- pointer to current output vertex
1288  * ECX -- pointer to current attribute
1289  *
1290  */
static boolean
build_vertex_emit(struct translate_sse *p,
                  struct x86_function *func, unsigned index_size)
{
   int fixup, label;
   unsigned j;

   /* Invalidate the constant-register cache: 0xff marks every slot
    * unused (see get_const()).
    */
   memset(p->reg_to_const, 0xff, sizeof(p->reg_to_const));
   memset(p->const_to_reg, 0xff, sizeof(p->const_to_reg));

   /* Fixed register assignment used throughout the generated function. */
   p->tmp_EAX = x86_make_reg(file_REG32, reg_AX);
   p->idx_ESI = x86_make_reg(file_REG32, reg_SI);
   p->outbuf_EBX = x86_make_reg(file_REG32, reg_BX);
   p->machine_EDI = x86_make_reg(file_REG32, reg_DI);
   p->count_EBP = x86_make_reg(file_REG32, reg_BP);
   p->tmp2_EDX = x86_make_reg(file_REG32, reg_DX);
   p->src_ECX = x86_make_reg(file_REG32, reg_CX);

   p->func = func;

   x86_init_func(p->func);

   if (x86_target(p->func) == X86_64_WIN64_ABI) {
      /* the ABI guarantees a 16-byte aligned 32-byte "shadow space"
       * above the return address
       */
      /* Save callee-saved XMM6/XMM7 into that shadow space. */
      sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8),
                  x86_make_reg(file_XMM, 6));
      sse2_movdqa(p->func,
                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24),
                  x86_make_reg(file_XMM, 7));
   }

   /* Prologue: preserve callee-saved GP registers we use. */
   x86_push(p->func, p->outbuf_EBX);
   x86_push(p->func, p->count_EBP);

   /* on non-Win64 x86-64, these are already in the right registers */
   if (x86_target(p->func) != X86_64_STD_ABI) {
      x86_push(p->func, p->machine_EDI);
      x86_push(p->func, p->idx_ESI);

      /* Load arg1 (machine) and arg2 (start index / elts pointer). */
      if (x86_target(p->func) != X86_32) {
         x64_mov64(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
         x64_mov64(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
      }
      else {
         x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
         x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
      }
   }

   /* arg3 = vertex count. */
   x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3));

   /* arg6 = output buffer pointer (64-bit load on x86-64). */
   if (x86_target(p->func) != X86_32)
      x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));
   else
      x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));

   /* Load instance ID.
    */
   if (p->use_instancing) {
      /* arg4 = start_instance, arg5 = instance_id; stash both in the
       * machine struct so init_inputs() can read them.
       */
      x86_mov(p->func, p->tmp2_EDX, x86_fn_arg(p->func, 4));
      x86_mov(p->func,
              x86_make_disp(p->machine_EDI,
                            get_offset(p, &p->start_instance)), p->tmp2_EDX);

      x86_mov(p->func, p->tmp_EAX, x86_fn_arg(p->func, 5));
      x86_mov(p->func,
              x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)),
              p->tmp_EAX);
   }

   /* Get vertex count, compare to zero
    */
   x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
   x86_cmp(p->func, p->count_EBP, p->tmp_EAX);
   /* Skip the whole loop when count == 0; patched at the end. */
   fixup = x86_jcc_forward(p->func, cc_E);

   /* always load, needed or not:
    */
   init_inputs(p, index_size);

   /* Note address for loop jump
    */
   label = x86_get_label(p->func);
   {
      /* For indexed runs, elt dereferences the index array; for linear
       * runs it is the index (or pointer) register itself.
       */
      struct x86_reg elt = !index_size ? p->idx_ESI : x86_deref(p->idx_ESI);
      int last_variant = -1;
      struct x86_reg vb;

      for (j = 0; j < p->translate.key.nr_elements; j++) {
         const struct translate_element *a = &p->translate.key.element[j];
         unsigned variant = p->element_to_buffer_variant[j];

         /* Figure out source pointer address:
          */
         /* last_variant starts at -1, so vb is always set before use
          * on the first iteration.
          */
         if (variant != last_variant) {
            last_variant = variant;
            vb = get_buffer_ptr(p, index_size, variant, elt);
         }

         if (!translate_attr(p, a,
                             x86_make_disp(vb, a->input_offset),
                             x86_make_disp(p->outbuf_EBX, a->output_offset)))
            return FALSE;
      }

      /* Next output vertex:
       */
      x64_rexw(p->func);
      x86_lea(p->func, p->outbuf_EBX,
              x86_make_disp(p->outbuf_EBX, p->translate.key.output_stride));

      /* Incr index
       */
      incr_inputs(p, index_size);
   }

   /* decr count, loop if not zero
    */
   x86_dec(p->func, p->count_EBP);
   x86_jcc(p->func, cc_NZ, label);

   /* Exit mmx state?
    */
   if (p->func->need_emms)
      mmx_emms(p->func);

   /* Land forward jump here:
    */
   x86_fixup_fwd_jump(p->func, fixup);

   /* Pop regs and return
    */
   if (x86_target(p->func) != X86_64_STD_ABI) {
      x86_pop(p->func, p->idx_ESI);
      x86_pop(p->func, p->machine_EDI);
   }

   x86_pop(p->func, p->count_EBP);
   x86_pop(p->func, p->outbuf_EBX);

   /* Restore XMM6/XMM7 from the Win64 shadow space (see prologue). */
   if (x86_target(p->func) == X86_64_WIN64_ABI) {
      sse2_movdqa(p->func, x86_make_reg(file_XMM, 6),
                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8));
      sse2_movdqa(p->func, x86_make_reg(file_XMM, 7),
                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24));
   }
   x86_ret(p->func);

   return TRUE;
}
1443 
1444 
1445 static void
translate_sse_set_buffer(struct translate * translate,unsigned buf,const void * ptr,unsigned stride,unsigned max_index)1446 translate_sse_set_buffer(struct translate *translate,
1447                          unsigned buf,
1448                          const void *ptr, unsigned stride, unsigned max_index)
1449 {
1450    struct translate_sse *p = (struct translate_sse *) translate;
1451 
1452    if (buf < p->nr_buffers) {
1453       p->buffer[buf].base_ptr = (char *) ptr;
1454       p->buffer[buf].stride = stride;
1455       p->buffer[buf].max_index = max_index;
1456    }
1457 
1458    if (0)
1459       debug_printf("%s %d/%d: %p %d\n",
1460                    __FUNCTION__, buf, p->nr_buffers, ptr, stride);
1461 }
1462 
1463 
1464 static void
translate_sse_release(struct translate * translate)1465 translate_sse_release(struct translate *translate)
1466 {
1467    struct translate_sse *p = (struct translate_sse *) translate;
1468 
1469    x86_release_func(&p->elt8_func);
1470    x86_release_func(&p->elt16_func);
1471    x86_release_func(&p->elt_func);
1472    x86_release_func(&p->linear_func);
1473 
1474    os_free_aligned(p);
1475 }
1476 
1477 
/*
 * Create a translate object that uses run-time generated SSE code.
 * Returns NULL when rtasm is unavailable, allocation fails, or any of
 * the four fetch variants (linear / 8-, 16-, 32-bit indexed) cannot be
 * generated, in which case the caller falls back to the generic path.
 */
struct translate *
translate_sse2_create(const struct translate_key *key)
{
   struct translate_sse *p = NULL;
   unsigned i;

   /* this is misnamed, it actually refers to whether rtasm is enabled or not */
   if (!rtasm_cpu_has_sse())
      goto fail;

   /* 16-byte alignment so SSE constants in the struct can be loaded
    * with aligned moves.
    */
   p = os_malloc_aligned(sizeof(struct translate_sse), 16);
   if (!p)
      goto fail;

   /* Zero-init first: the fail path releases a possibly partially
    * initialized object, and x86_release_func() must see clean state.
    */
   memset(p, 0, sizeof(*p));
   memcpy(p->consts, consts, sizeof(consts));

   p->translate.key = *key;
   p->translate.release = translate_sse_release;
   p->translate.set_buffer = translate_sse_set_buffer;

   assert(key->nr_elements <= TRANSLATE_MAX_ATTRIBS);

   for (i = 0; i < key->nr_elements; i++) {
      if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) {
         unsigned j;

         p->nr_buffers =
            MAX2(p->nr_buffers, key->element[i].input_buffer + 1);

         if (key->element[i].instance_divisor) {
            p->use_instancing = TRUE;
         }

         /*
          * Map vertex element to vertex buffer variant.
          */
         /* Deduplicate on (buffer index, instance divisor); reuse an
          * existing variant when both match.
          */
         for (j = 0; j < p->nr_buffer_variants; j++) {
            if (p->buffer_variant[j].buffer_index ==
                key->element[i].input_buffer
                && p->buffer_variant[j].instance_divisor ==
                key->element[i].instance_divisor) {
               break;
            }
         }
         if (j == p->nr_buffer_variants) {
            /* No match found: append a new variant. */
            p->buffer_variant[j].buffer_index = key->element[i].input_buffer;
            p->buffer_variant[j].instance_divisor =
               key->element[i].instance_divisor;
            p->nr_buffer_variants++;
         }
         p->element_to_buffer_variant[i] = j;
      }
      else {
         assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID);

         /* Instance-ID elements have no backing buffer. */
         p->element_to_buffer_variant[i] = ELEMENT_BUFFER_INSTANCE_ID;
      }
   }

   if (0)
      debug_printf("nr_buffers: %d\n", p->nr_buffers);

   /* Generate the four fetch variants: index_size 0 = linear,
    * 4/2/1 = 32/16/8-bit index arrays.
    */
   if (!build_vertex_emit(p, &p->linear_func, 0))
      goto fail;

   if (!build_vertex_emit(p, &p->elt_func, 4))
      goto fail;

   if (!build_vertex_emit(p, &p->elt16_func, 2))
      goto fail;

   if (!build_vertex_emit(p, &p->elt8_func, 1))
      goto fail;

   p->translate.run = (run_func) x86_get_func(&p->linear_func);
   if (p->translate.run == NULL)
      goto fail;

   p->translate.run_elts = (run_elts_func) x86_get_func(&p->elt_func);
   if (p->translate.run_elts == NULL)
      goto fail;

   p->translate.run_elts16 = (run_elts16_func) x86_get_func(&p->elt16_func);
   if (p->translate.run_elts16 == NULL)
      goto fail;

   p->translate.run_elts8 = (run_elts8_func) x86_get_func(&p->elt8_func);
   if (p->translate.run_elts8 == NULL)
      goto fail;

   return &p->translate;

 fail:
   /* Frees whatever was built so far; p may be partially initialized. */
   if (p)
      translate_sse_release(&p->translate);

   return NULL;
}
1577 
1578 
1579 #else
1580 
/* Stub for platforms without x86 rtasm support: report failure so the
 * caller uses the generic (non-SSE) translate path instead.
 */
struct translate *
translate_sse2_create(const struct translate_key *key)
{
   return NULL;
}
1586 
1587 #endif
1588