1 /*
2  * Copyright (C) 2019 Collabora, Ltd.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  *
23  */
24 
25 #include "util/u_math.h"
26 #include "midgard_pack.h"
27 #include "pan_encoder.h"
28 
29 /* This file handles attribute descriptors. The
30  * bulk of the complexity is from instancing. See mali_job for
31  * notes on how this works. But basically, for small vertex
32  * counts, we have a lookup table, and for large vertex counts,
33  * we look at the high bits as a heuristic. This has to match
34  * exactly how the hardware calculates this (which is why the
35  * algorithm is so weird) or else instancing will break. */
36 
/* Given an odd number (of the form 2k + 1), compute k. The argument is
 * parenthesized so that compound expressions (e.g. ODD(a | b)) expand with
 * the intended precedence. */
#define ODD(odd) (((odd) - 1) >> 1)
39 
/* Padded vertex count for small draws: counts below 10 are returned
 * unchanged, anything else is rounded up to the next even number. */
static unsigned
panfrost_small_padded_vertex_count(unsigned idx)
{
        /* Adding one and then clearing bit 0 rounds up to a multiple of two */
        unsigned rounded_to_even = (idx + 1) & ~1;

        return (idx < 10) ? idx : rounded_to_even;
}
48 
/* Padded vertex count for large draws.
 *
 * Only the four most significant bits of vertex_count (the "nibble", whose
 * top bit is by construction always 1) are inspected. With n being the number
 * of bits below that nibble, the result is 9, 10, 12, 14 or 16 times 2^n
 * depending on the nibble.
 *
 * vertex_count must have at least four significant bits (i.e. be >= 8), and
 * the padded result must fit in 32 bits. All arithmetic is unsigned so that
 * counts with the top bits set do not overflow a signed int. */
static unsigned
panfrost_large_padded_vertex_count(uint32_t vertex_count)
{
        /* With fewer than four significant bits the subtraction below would
         * wrap around (and clz of zero is undefined) */
        assert(vertex_count >= 8);

        /* First, we have to find the highest set one (counted from 1) */
        unsigned highest = 32 - __builtin_clz(vertex_count);

        /* Using that, we isolate the highest 4 bits */
        unsigned n = highest - 4;
        unsigned nibble = (vertex_count >> n) & 0xF;

        /* Great, we have the nibble. Now we can just try possibilities. Note
         * that we don't care about the bottom most bit in most cases, and we
         * know the top bit must be 1 */

        unsigned middle_two = (nibble >> 1) & 0x3;

        switch (middle_two) {
        case 0x0: /* nibble = 100x */
                if (!(nibble & 1))
                        return (1u << n) * 9;
                else
                        return (1u << (n + 1)) * 5;
        case 0x1: /* nibble = 101x */
                return (1u << (n + 2)) * 3;
        case 0x2: /* nibble = 110x */
                return (1u << (n + 1)) * 7;
        case 0x3: /* nibble = 111x */
                /* Rounds up to the next power of two, which must still be
                 * representable in 32 bits */
                assert(n + 4 < 32);
                return (1u << (n + 4));
        default:
                return 0; /* unreachable */
        }
}
81 
/* Compute the padded vertex count for a draw of vertex_count vertices.
 * Counts below 20 take the small-count path; everything else takes the
 * large-count path. */
unsigned
panfrost_padded_vertex_count(unsigned vertex_count)
{
        if (vertex_count >= 20)
                return panfrost_large_padded_vertex_count(vertex_count);

        return panfrost_small_padded_vertex_count(vertex_count);
}
90 
91 /* The much, much more irritating case -- instancing is enabled. See
92  * panfrost_job.h for notes on how this works */
93 
/* Compute the "magic" multiplier that lets a division by a non-power-of-two
 * divisor be performed as a multiply by the inverse followed by a shift.
 *
 * hw_divisor:  the divisor d; expected to be non-zero and not a power of two.
 * o_shift:     receives floor(log2(d)).
 * extra_flags: set to 1 when the round-down variant is selected. It is left
 *              untouched otherwise, so the caller must initialize it.
 *
 * Returns the multiplier with bit 31 cleared; that bit is asserted to be set
 * and is therefore implicit.
 *
 * Everything is computed with exact 64-bit integer arithmetic: going through
 * double for the ceiling division loses the fractional part for large
 * divisors and yields a multiplier that is off by one. */
unsigned
panfrost_compute_magic_divisor(unsigned hw_divisor, unsigned *o_shift, unsigned *extra_flags)
{
        /* We have a NPOT divisor. Here's the fun one (multiplying by
         * the inverse and shifting) */

        /* floor(log2(d)) */
        unsigned shift = util_logbase2(hw_divisor);

        /* t = 2^(32 + shift). Built from an unsigned 64-bit one so that a
         * shift count of up to 63 stays well-defined. */
        uint64_t t = UINT64_C(1) << (32 + shift);

        /* e = 2^(shift + 32) % d */
        uint64_t e = t % hw_divisor;

        /* m = ceil(2^(32 + shift) / d) */
        uint64_t m = (t / hw_divisor) + (e != 0);

        /* Default case */
        uint32_t magic_divisor = (uint32_t) m;

        /* Apply round-down algorithm? e <= 2^shift?. XXX: The blob
         * seems to use a different condition */
        if (e <= (UINT64_C(1) << shift)) {
                magic_divisor = (uint32_t) (m - 1);
                *extra_flags = 1;
        }

        /* Top flag implicitly set */
        assert(magic_divisor & (1u << 31));
        magic_divisor &= ~(1u << 31);
        *o_shift = shift;

        return magic_divisor;
}
131 
132 /* Records for gl_VertexID and gl_InstanceID use a slightly special encoding,
133  * but the idea is the same */
134 
135 void
panfrost_vertex_id(unsigned padded_count,struct mali_attribute_buffer_packed * attr,bool instanced)136 panfrost_vertex_id(
137         unsigned padded_count,
138         struct mali_attribute_buffer_packed *attr,
139         bool instanced)
140 {
141         /* We factor the padded count as shift/odd and that's it */
142         pan_pack(attr, ATTRIBUTE_BUFFER, cfg) {
143                 cfg.special = MALI_ATTRIBUTE_SPECIAL_VERTEX_ID;
144                 cfg.type = 0;
145 
146                 if (instanced) {
147                         cfg.divisor_r = __builtin_ctz(padded_count);
148                         cfg.divisor_p = padded_count >> (cfg.divisor_r + 1);
149                 } else {
150                         /* Match the blob... */
151                         cfg.divisor_r = 0x1F;
152                         cfg.divisor_p = 0x4;
153                 }
154         }
155 }
156 
157 void
panfrost_instance_id(unsigned padded_count,struct mali_attribute_buffer_packed * attr,bool instanced)158 panfrost_instance_id(
159         unsigned padded_count,
160         struct mali_attribute_buffer_packed *attr,
161         bool instanced)
162 {
163         pan_pack(attr, ATTRIBUTE_BUFFER, cfg) {
164                 cfg.special = MALI_ATTRIBUTE_SPECIAL_INSTANCE_ID;
165                 cfg.type = 0;
166 
167                 /* POT records have just a shift directly with an off-by-one for
168                  * unclear reasons. NPOT records have a magic divisor smushed into the
169                  * stride field (which is unused for these special records) */
170 
171                 if (!instanced || padded_count <= 1) {
172                         /* Match the blob... */
173                         cfg.stride = ((1u << 31) - 1);
174                         cfg.divisor_r = 0x1F;
175                         cfg.divisor_e = 0x1;
176                 } else if(util_is_power_of_two_or_zero(padded_count)) {
177                         /* By above, padded_count > 1 => padded_count >= 2 so
178                          * since we're a power of two, ctz(padded_count) =
179                          * log2(padded_count) >= log2(2) = 1, so
180                          * ctz(padded_count) - 1 >= 0, so this can't underflow
181                          * */
182 
183                         cfg.divisor_r = __builtin_ctz(padded_count) - 1;
184                 } else {
185                         cfg.stride = panfrost_compute_magic_divisor(padded_count,
186                                         &cfg.divisor_r, &cfg.divisor_e);
187                 }
188         }
189 }
190 
191