1 /*
2  * Copyright © 2017 Broadcom
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 /** @file vc4_tiling_lt.c
25  *
26  * Helper functions from vc4_tiling.c that will be compiled for using NEON
27  * assembly or not.
28  *
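 * If VC4_BUILD_NEON is set, then the functions will be suffixed with _neon;
 * otherwise they will be suffixed with _base.  The 32-bit NEON assembly is
 * only used if PIPE_ARCH_ARM is also set, to keep the x86 sim build working.
 * The PIPE_ARCH_AARCH64 assembly does not depend on VC4_BUILD_NEON.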
32  */
33 
34 #include <string.h>
35 #include "pipe/p_state.h"
36 #include "vc4_tiling.h"
37 
/* NEON_TAG() appends a build-specific suffix to the name it is given: _neon
 * when VC4_BUILD_NEON is defined, _base otherwise.  It is applied to the
 * non-static functions defined in this file.
 */
#ifdef VC4_BUILD_NEON
#define NEON_TAG(x) x ## _neon
#else
#define NEON_TAG(x) x ## _base
#endif
43 
/**
 * Returns the length in bytes of one row of a 64-byte microtile.
 *
 * @param cpp  bytes per pixel; 1 gives 8-byte rows, while 2, 4 and 8 give
 *             16-byte rows.  Any other value reaches unreachable().
 */
static uint32_t
vc4_utile_stride(int cpp)
{
        if (cpp == 1) {
                return 8;
        }

        if (cpp == 2 || cpp == 4 || cpp == 8) {
                return 16;
        }

        unreachable("bad cpp");
}
59 
/**
 * Copies one 64-byte microtile from its packed GPU-side layout to a strided
 * CPU-side destination.
 *
 * The utile is read as 64 contiguous bytes starting at @p gpu and written
 * out row by row, each row being vc4_utile_stride(cpp) bytes long and
 * consecutive rows landing @p cpu_stride bytes apart at @p cpu.
 *
 * @param cpu         where the first row of the utile is written
 * @param gpu         start of the 64-byte utile to read
 * @param cpu_stride  distance in bytes between rows at the destination
 * @param cpp         bytes per pixel, forwarded to vc4_utile_stride()
 */
static void
vc4_load_utile(void *cpu, void *gpu, uint32_t cpu_stride, uint32_t cpp)
{
        uint32_t gpu_stride = vc4_utile_stride(cpp);
#if defined(VC4_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        /* The stores below use post-indexed addressing, which writes the
         * updated address back into the pointer register.  Those pointers
         * are therefore read-write ("+r") operands: the compiler must not
         * assume they still hold their original value after the asm.  The
         * "memory" clobber tells the compiler that the asm reads and writes
         * memory that is not described by its operands.
         */
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %[gpu], {q0, q1, q2, q3}\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d1, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d3, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d5, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d6, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d7, [%[cpu]]\n"
                        : [cpu]         "+r"(cpu)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3", "memory");
        } else {
                assert(gpu_stride == 16);
                /* Second write pointer, for the upper 8 bytes of each row. */
                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %[gpu], {q0, q1, q2, q3}\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination: the low half through cpu and the high
                         * half through cpu2, one d-register per store.
                         */
                        "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d1, [%[cpu2]], %[cpu_stride]\n"
                        "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d3, [%[cpu2]], %[cpu_stride]\n"
                        "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d5, [%[cpu2]], %[cpu_stride]\n"
                        "vst1.8 d6, [%[cpu]]\n"
                        "vst1.8 d7, [%[cpu2]]\n"
                        : [cpu]         "+r"(cpu),
                          [cpu2]        "+r"(cpu2)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3", "memory");
        }
#elif defined (PIPE_ARCH_AARCH64)
        /* The post-index register of st1 is a 64-bit X register, so widen
         * the stride explicitly instead of handing a 32-bit value to the asm
         * and relying on the upper half of its register being zero.
         */
        size_t cpu_stride64 = cpu_stride;

        /* As in the 32-bit path: post-indexed stores modify the pointer
         * registers, so they are "+r" operands, and "memory" covers the
         * accesses made through them.
         */
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * v0-v3.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v3.D}[1], [%[cpu]]\n"
                        : [cpu]         "+r"(cpu)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride64)
                        : "v0", "v1", "v2", "v3", "memory");
        } else {
                assert(gpu_stride == 16);
                /* Second write pointer, for the upper 8 bytes of each row. */
                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * v0-v3.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination: the low half through cpu and the high
                         * half through cpu2, one 64-bit lane per store.
                         */
                        "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v0.D}[1], [%[cpu2]], %[cpu_stride]\n"
                        "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[1], [%[cpu2]], %[cpu_stride]\n"
                        "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[1], [%[cpu2]], %[cpu_stride]\n"
                        "st1 {v3.D}[0], [%[cpu]]\n"
                        "st1 {v3.D}[1], [%[cpu2]]\n"
                        : [cpu]         "+r"(cpu),
                          [cpu2]        "+r"(cpu2)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride64)
                        : "v0", "v1", "v2", "v3", "memory");
        }
#else
        /* Portable path: copy one utile row per iteration. */
        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(cpu, gpu + gpu_offset, gpu_stride);
                cpu += cpu_stride;
        }
#endif
}
159 
/**
 * Copies one 64-byte microtile from a strided CPU-side source into its
 * packed GPU-side layout.
 *
 * Rows of vc4_utile_stride(cpp) bytes are read from @p cpu, consecutive rows
 * being @p cpu_stride bytes apart, and written as 64 contiguous bytes
 * starting at @p gpu.
 *
 * @param gpu         start of the 64-byte utile to write
 * @param cpu         where the first row of the utile is read from
 * @param cpu_stride  distance in bytes between rows at the source
 * @param cpp         bytes per pixel, forwarded to vc4_utile_stride()
 */
static void
vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp)
{
        uint32_t gpu_stride = vc4_utile_stride(cpp);

#if defined(VC4_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        /* The loads below use post-indexed addressing, which writes the
         * updated address back into the pointer register.  Those pointers
         * are therefore read-write ("+r") operands: the compiler must not
         * assume they still hold their original value after the asm.  The
         * "memory" clobber tells the compiler that the asm reads and writes
         * memory that is not described by its operands.
         */
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d1, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d3, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d5, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d6, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d7, [%[cpu]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "vstm %[gpu], {q0, q1, q2, q3}\n"
                        : [cpu]         "+r"(cpu)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3", "memory");
        } else {
                assert(gpu_stride == 16);
                /* Second read pointer, for the upper 8 bytes of each row. */
                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * source: the low half through cpu and the high half
                         * through cpu2, one d-register per load.
                         */
                        "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d1, [%[cpu2]], %[cpu_stride]\n"
                        "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d3, [%[cpu2]], %[cpu_stride]\n"
                        "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d5, [%[cpu2]], %[cpu_stride]\n"
                        "vld1.8 d6, [%[cpu]]\n"
                        "vld1.8 d7, [%[cpu2]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "vstm %[gpu], {q0, q1, q2, q3}\n"
                        : [cpu]         "+r"(cpu),
                          [cpu2]        "+r"(cpu2)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3", "memory");
        }
#elif defined (PIPE_ARCH_AARCH64)
        /* The post-index register of ld1 is a 64-bit X register, so widen
         * the stride explicitly instead of handing a 32-bit value to the asm
         * and relying on the upper half of its register being zero.
         */
        size_t cpu_stride64 = cpu_stride;

        /* As in the 32-bit path: post-indexed loads modify the pointer
         * registers, so they are "+r" operands, and "memory" covers the
         * accesses made through them.
         */
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v3.D}[1], [%[cpu]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        : [cpu]         "+r"(cpu)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride64)
                        : "v0", "v1", "v2", "v3", "memory");
        } else {
                assert(gpu_stride == 16);
                /* Second read pointer, for the upper 8 bytes of each row. */
                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * source: the low half through cpu and the high half
                         * through cpu2, one 64-bit lane per load.
                         */
                        "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v0.D}[1], [%[cpu2]], %[cpu_stride]\n"
                        "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[1], [%[cpu2]], %[cpu_stride]\n"
                        "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[1], [%[cpu2]], %[cpu_stride]\n"
                        "ld1 {v3.D}[0], [%[cpu]]\n"
                        "ld1 {v3.D}[1], [%[cpu2]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        : [cpu]         "+r"(cpu),
                          [cpu2]        "+r"(cpu2)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride64)
                        : "v0", "v1", "v2", "v3", "memory");
        }
#else
        /* Portable path: copy one utile row per iteration. */
        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(gpu + gpu_offset, cpu, gpu_stride);
                cpu += cpu_stride;
        }
#endif
}
255 
256 void
NEON_TAG(vc4_load_lt_image)257 NEON_TAG(vc4_load_lt_image)(void *dst, uint32_t dst_stride,
258                             void *src, uint32_t src_stride,
259                             int cpp, const struct pipe_box *box)
260 {
261         uint32_t utile_w = vc4_utile_width(cpp);
262         uint32_t utile_h = vc4_utile_height(cpp);
263         uint32_t xstart = box->x;
264         uint32_t ystart = box->y;
265 
266         for (uint32_t y = 0; y < box->height; y += utile_h) {
267                 for (int x = 0; x < box->width; x += utile_w) {
268                         vc4_load_utile(dst + (dst_stride * y +
269                                               x * cpp),
270                                        src + ((ystart + y) * src_stride +
271                                               (xstart + x) * 64 / utile_w),
272                                        dst_stride, cpp);
273                 }
274         }
275 }
276 
277 void
NEON_TAG(vc4_store_lt_image)278 NEON_TAG(vc4_store_lt_image)(void *dst, uint32_t dst_stride,
279                              void *src, uint32_t src_stride,
280                              int cpp, const struct pipe_box *box)
281 {
282         uint32_t utile_w = vc4_utile_width(cpp);
283         uint32_t utile_h = vc4_utile_height(cpp);
284         uint32_t xstart = box->x;
285         uint32_t ystart = box->y;
286 
287         for (uint32_t y = 0; y < box->height; y += utile_h) {
288                 for (int x = 0; x < box->width; x += utile_w) {
289                         vc4_store_utile(dst + ((ystart + y) * dst_stride +
290                                                (xstart + x) * 64 / utile_w),
291                                         src + (src_stride * y +
292                                                x * cpp),
293                                         src_stride, cpp);
294                 }
295         }
296 }
297