1//===- README_ALTIVEC.txt - Notes for improving Altivec code gen ----------===//
2
3Implement PPCInstrInfo::isLoadFromStackSlot/isStoreToStackSlot for vector
4registers, to generate better spill code.
5
6//===----------------------------------------------------------------------===//
7
8The first should be a single lvx from the constant pool, the second should be
9a xor/stvx:
10
11void foo(void) {
12  int x[8] __attribute__((aligned(128))) = { 1, 1, 1, 17, 1, 1, 1, 1 };
13  bar (x);
14}
15
16#include <string.h>
17void foo(void) {
18  int x[8] __attribute__((aligned(128)));
19  memset (x, 0, sizeof (x));
20  bar (x);
21}
22
23//===----------------------------------------------------------------------===//
24
25Altivec: Codegen'ing MUL with vector FMADD should add -0.0, not 0.0:
26http://gcc.gnu.org/bugzilla/show_bug.cgi?id=8763
27
28When -ffast-math is on, we can use 0.0.
29
30//===----------------------------------------------------------------------===//
31
32  Consider this:
33  v4f32 Vector;
34  v4f32 Vector2 = { Vector.X, Vector.X, Vector.X, Vector.X };
35
36Since we know that "Vector" is 16-byte aligned and we know the element offset
37of ".X", we should change the load into a lve*x instruction, instead of doing
38a load/store/lve*x sequence.
39
40//===----------------------------------------------------------------------===//
41
42For functions that use altivec AND have calls, we are VRSAVE'ing all call
43clobbered regs.
44
45//===----------------------------------------------------------------------===//
46
47Implement passing vectors by value into calls and receiving them as arguments.
48
49//===----------------------------------------------------------------------===//
50
51GCC apparently tries to codegen { C1, C2, Variable, C3 } as a constant pool load
52of C1/C2/C3, then a load and vperm of Variable.
53
54//===----------------------------------------------------------------------===//
55
56We need a way to teach tblgen that some operands of an intrinsic are required to
57be constants.  The verifier should enforce this constraint.
58
59//===----------------------------------------------------------------------===//
60
61We currently codegen SCALAR_TO_VECTOR as a store of the scalar to a 16-byte
62aligned stack slot, followed by a load/vperm.  We should probably just store it
63to a scalar stack slot, then use lvsl/vperm to load it.  If the value is already
64in memory this is a big win.
65
66//===----------------------------------------------------------------------===//
67
68extract_vector_elt of an arbitrary constant vector can be done with the
69following instructions:
70
71vTemp = vec_splat(v0,2);    // 2 is the element the src is in.
72vec_ste(&destloc,0,vTemp);
73
74We can do an arbitrary non-constant value by using lvsr/perm/ste.
75
76//===----------------------------------------------------------------------===//
77
78If we want to tie instruction selection into the scheduler, we can do some
79constant formation with different instructions.  For example, we can generate
80"vsplti -1" with "vcmpequw R,R" and 1,1,1,1 with "vsubcuw R,R", and 0,0,0,0 with
81"vsplti 0" or "vxor", each of which use different execution units, thus could
82help scheduling.
83
84This is probably only reasonable for a post-pass scheduler.
85
86//===----------------------------------------------------------------------===//
87
88For this function:
89
90void test(vector float *A, vector float *B) {
91  vector float C = (vector float)vec_cmpeq(*A, *B);
92  if (!vec_any_eq(*A, *B))
93    *B = (vector float){0,0,0,0};
94  *A = C;
95}
96
97we get the following basic block:
98
99	...
100        lvx v2, 0, r4
101        lvx v3, 0, r3
102        vcmpeqfp v4, v3, v2
103        vcmpeqfp. v2, v3, v2
104        bne cr6, LBB1_2 ; cond_next
105
106The vcmpeqfp/vcmpeqfp. instructions currently cannot be merged when the
107vcmpeqfp. result is used by a branch.  This can be improved.
108
109//===----------------------------------------------------------------------===//
110
111The code generated for this is truly aweful:
112
113vector float test(float a, float b) {
114 return (vector float){ 0.0, a, 0.0, 0.0};
115}
116
117LCPI1_0:                                        ;  float
118        .space  4
119        .text
120        .globl  _test
121        .align  4
122_test:
123        mfspr r2, 256
124        oris r3, r2, 4096
125        mtspr 256, r3
126        lis r3, ha16(LCPI1_0)
127        addi r4, r1, -32
128        stfs f1, -16(r1)
129        addi r5, r1, -16
130        lfs f0, lo16(LCPI1_0)(r3)
131        stfs f0, -32(r1)
132        lvx v2, 0, r4
133        lvx v3, 0, r5
134        vmrghw v3, v3, v2
135        vspltw v2, v2, 0
136        vmrghw v2, v2, v3
137        mtspr 256, r2
138        blr
139
140//===----------------------------------------------------------------------===//
141
142int foo(vector float *x, vector float *y) {
143        if (vec_all_eq(*x,*y)) return 3245;
144        else return 12;
145}
146
147A predicate compare being used in a select_cc should have the same peephole
148applied to it as a predicate compare used by a br_cc.  There should be no
149mfcr here:
150
151_foo:
152        mfspr r2, 256
153        oris r5, r2, 12288
154        mtspr 256, r5
155        li r5, 12
156        li r6, 3245
157        lvx v2, 0, r4
158        lvx v3, 0, r3
159        vcmpeqfp. v2, v3, v2
160        mfcr r3, 2
161        rlwinm r3, r3, 25, 31, 31
162        cmpwi cr0, r3, 0
163        bne cr0, LBB1_2 ; entry
164LBB1_1: ; entry
165        mr r6, r5
166LBB1_2: ; entry
167        mr r3, r6
168        mtspr 256, r2
169        blr
170
171//===----------------------------------------------------------------------===//
172
173CodeGen/PowerPC/vec_constants.ll has an and operation that should be
174codegen'd to andc.  The issue is that the 'all ones' build vector is
175SelectNodeTo'd a VSPLTISB instruction node before the and/xor is selected
176which prevents the vnot pattern from matching.
177
178
179//===----------------------------------------------------------------------===//
180
181An alternative to the store/store/load approach for illegal insert element
182lowering would be:
183
1841. store element to any ol' slot
1852. lvx the slot
1863. lvsl 0; splat index; vcmpeq to generate a select mask
1874. lvsl slot + x; vperm to rotate result into correct slot
1885. vsel result together.
189
190//===----------------------------------------------------------------------===//
191
192Should codegen branches on vec_any/vec_all to avoid mfcr.  Two examples:
193
194#include <altivec.h>
195 int f(vector float a, vector float b)
196 {
197  int aa = 0;
198  if (vec_all_ge(a, b))
199    aa |= 0x1;
200  if (vec_any_ge(a,b))
201    aa |= 0x2;
202  return aa;
203}
204
205vector float f(vector float a, vector float b) {
206  if (vec_any_eq(a, b))
207    return a;
208  else
209    return b;
210}
211
212