/*
 * Copyright © 2017 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file v3d_cpu_tiling.h
 *
 * Contains utile load/store functions common to both v3d and vc4.  The
 * layout of pixels within a utile stayed the same across generations;
 * what changed is how the utiles themselves are arranged within a surface.
 */
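
/* As the fallback loops below show, a utile is always 64 bytes of GPU
 * memory: either 8 rows of 8 bytes (gpu_stride == 8) or 4 rows of 16 bytes
 * (gpu_stride == 16), depending on the format's bytes per pixel.
 */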

static inline void
v3d_load_utile(void *cpu, uint32_t cpu_stride,
               void *gpu, uint32_t gpu_stride)
{
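        /* cpu/cpu_stride describe the untiled CPU-side destination; gpu is
         * the start of one 64-byte utile and gpu_stride is the length in
         * bytes of each of its rows.
         */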
#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %[gpu], {q0, q1, q2, q3}\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d1, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d3, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d5, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d6, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d7, [%[cpu]]\n"
                        : [cpu]         "+r"(cpu)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %[gpu], {q0, q1, q2, q3}\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination.  (vst1 can only store one d-register
                         * at a time).
                         */
                        "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d1, [%[cpu2]],%[cpu_stride]\n"
                        "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d3, [%[cpu2]],%[cpu_stride]\n"
                        "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d5, [%[cpu2]],%[cpu_stride]\n"
                        "vst1.8 d6, [%[cpu]]\n"
                        "vst1.8 d7, [%[cpu2]]\n"
                        : [cpu]         "+r"(cpu),
                          [cpu2]        "+r"(cpu2)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
                return;
        }
#elif defined(PIPE_ARCH_AARCH64)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * v0-v3.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v3.D}[1], [%[cpu]]\n"
                        : [cpu]         "+r"(cpu)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * v0-v3.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination, one 8-byte lane per st1.
                         */
                        "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "st1 {v3.D}[0], [%[cpu]]\n"
                        "st1 {v3.D}[1], [%[cpu2]]\n"
                        : [cpu]         "+r"(cpu),
                          [cpu2]        "+r"(cpu2)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
                return;
        }
#endif

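        /* Fall back to a row-at-a-time copy: this is the path on non-NEON
         * builds and for any gpu_stride the fast paths above don't handle.
         */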
        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(cpu, gpu + gpu_offset, gpu_stride);
                cpu += cpu_stride;
        }
}
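
/* A minimal usage sketch: detile a whole 32bpp image by calling
 * v3d_load_utile() once per 4x4-pixel utile.  Purely for illustration it
 * assumes the utiles are stored consecutively in raster order; the real
 * utile ordering is generation-specific and lives in the vc4/v3d tiling
 * code, not in this header.
 */
static inline void
example_load_utiles_raster_order(void *dst, uint32_t dst_stride,
                                 void *src, uint32_t width, uint32_t height)
{
        /* width/height are in pixels and assumed to be multiples of 4. */
        for (uint32_t y = 0; y < height; y += 4) {
                for (uint32_t x = 0; x < width; x += 4) {
                        /* Each utile is 4 rows of 16 bytes at 32bpp. */
                        v3d_load_utile(dst + y * dst_stride + x * 4,
                                       dst_stride, src, 16);
                        src += 64;
                }
        }
}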

static inline void
v3d_store_utile(void *gpu, uint32_t gpu_stride,
                void *cpu, uint32_t cpu_stride)
{
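        /* The mirror of v3d_load_utile(): gpu/gpu_stride describe the
         * 64-byte destination utile, cpu/cpu_stride the untiled source.
         */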
#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d1, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d3, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d5, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d6, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d7, [%[cpu]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "vstm %[gpu], {q0, q1, q2, q3}\n"
                        : [cpu]         "+r"(cpu)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * source.  (vld1 can only load one d-register
                         * at a time).
                         */
                        "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d1, [%[cpu2]],%[cpu_stride]\n"
                        "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d3, [%[cpu2]],%[cpu_stride]\n"
                        "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d5, [%[cpu2]],%[cpu_stride]\n"
                        "vld1.8 d6, [%[cpu]]\n"
                        "vld1.8 d7, [%[cpu2]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "vstm %[gpu], {q0, q1, q2, q3}\n"
                        : [cpu]         "+r"(cpu),
                          [cpu2]        "+r"(cpu2)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
                return;
        }
#elif defined(PIPE_ARCH_AARCH64)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v3.D}[1], [%[cpu]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        : [cpu]         "+r"(cpu)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * source, one 8-byte lane per ld1.
                         */
                        "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "ld1 {v3.D}[0], [%[cpu]]\n"
                        "ld1 {v3.D}[1], [%[cpu2]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        : [cpu]         "+r"(cpu),
                          [cpu2]        "+r"(cpu2)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
                return;
        }
#endif

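        /* Fall back to a row-at-a-time copy: this is the path on non-NEON
         * builds and for any gpu_stride the fast paths above don't handle.
         */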
        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(gpu + gpu_offset, cpu, gpu_stride);
                cpu += cpu_stride;
        }
}