1 /*
2 
3 Copyright (c) 2013 STMicroelectronics
4 Written by Christophe Lyon
5 
6 Permission is hereby granted, free of charge, to any person obtaining a copy
7 of this software and associated documentation files (the "Software"), to deal
8 in the Software without restriction, including without limitation the rights
9 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 copies of the Software, and to permit persons to whom the Software is
11 furnished to do so, subject to the following conditions:
12 
13 The above copyright notice and this permission notice shall be included in
14 all copies or substantial portions of the Software.
15 
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 THE SOFTWARE.
23 
24 */
25 
26 #if defined(__arm__) || defined(__aarch64__)
27 #include <arm_neon.h>
28 #else
29 #include "stm-arm-neon.h"
30 #endif
31 #include "stm-arm-neon-ref.h"
32 
/* Initialization helpers.  Each MY_INIT_TABx macro selects one of the
   four initializer slices (INIT_TAB, INIT_TAB2, INIT_TAB3, INIT_TAB4)
   for the requested element count and instantiates it with the
   element type <BASE><BITS>_t.  Four slices exist because vld2, vld3
   and vld4 need up to 4 of them.  */
#define MY_INIT_TAB(BASE, BITS, LANES)  xNAME(INIT_TAB, LANES)(BASE##BITS##_t)
#define MY_INIT_TAB2(BASE, BITS, LANES) xNAME(INIT_TAB2, LANES)(BASE##BITS##_t)
#define MY_INIT_TAB3(BASE, BITS, LANES) xNAME(INIT_TAB3, LANES)(BASE##BITS##_t)
#define MY_INIT_TAB4(BASE, BITS, LANES) xNAME(INIT_TAB4, LANES)(BASE##BITS##_t)
39 
/* Initialized input buffers.

   Expands to the definition of an unsized array, declared through
   VECT_VAR_DECL(V,T,W,N) and filled with the INIT_TAB_<N> slice for
   element type T<W>_t.  The expansion does not end with a semicolon:
   the caller writes it, so that `VECT_VAR_DECL_INIT(...);' yields
   exactly one declaration and no stray empty declaration at file
   scope.  */
#define VECT_VAR_DECL_INIT(V, T, W, N)			\
  VECT_VAR_DECL(V,T,W,N) [] = { MY_INIT_TAB(T,W,N) }

/* Specialized initializer with 4 entries, as used by vldX_dup and
   vdup tests, which iterate 4 times on input buffers.  The variable is
   still named after N, but its contents always come from the 4-entry
   table.  As above, the caller supplies the terminating semicolon.  */
#define VECT_VAR_DECL_INIT4(V, T, W, N)			\
  VECT_VAR_DECL(V,T,W,N) [] = { MY_INIT_TAB(T,W,4) }
48 
/* Initializers for arrays of vectors.

   VECT_ARRAY_INIT<L> defines an unsized array of T<W>_t, named by
   VECT_ARRAY_VAR(V,T,W,N,<L>), holding the first <L> initializer
   slices for element count N back to back.  Each slice macro already
   ends with a comma, so the slices are simply juxtaposed.  The
   expansions do not end with a semicolon: the caller writes it, which
   avoids a stray empty declaration at file scope.  */
#define VECT_ARRAY_INIT2(V, T, W, N)		\
  T##W##_t VECT_ARRAY_VAR(V,T,W,N,2)[] =	\
  { MY_INIT_TAB(T,W,N)				\
    MY_INIT_TAB2(T,W,N) }

#define VECT_ARRAY_INIT3(V, T, W, N)					\
  T##W##_t VECT_ARRAY_VAR(V,T,W,N,3)[] =				\
  { MY_INIT_TAB(T,W,N)							\
    MY_INIT_TAB2(T,W,N)							\
    MY_INIT_TAB3(T,W,N) }

#define VECT_ARRAY_INIT4(V, T, W, N)					\
  T##W##_t VECT_ARRAY_VAR(V,T,W,N,4)[] =				\
  { MY_INIT_TAB(T,W,N)							\
    MY_INIT_TAB2(T,W,N)							\
    MY_INIT_TAB3(T,W,N)							\
    MY_INIT_TAB4(T,W,N) }
67 
/* Sample initialization vectors.  Every table expands to a
   comma-terminated run of consecutive integers, each cast to ELT, so
   that successive slices can be placed one after another inside a
   single brace-enclosed initializer.  */

/* One element per slice.  */
#define INIT_TAB_1(ELT)   (ELT)-16,
#define INIT_TAB2_1(ELT)  (ELT)-15,
#define INIT_TAB3_1(ELT)  (ELT)-14,
#define INIT_TAB4_1(ELT)  (ELT)-13,

/* Two elements per slice.  */
#define INIT_TAB_2(ELT)   (ELT)-16, (ELT)-15,
#define INIT_TAB2_2(ELT)  (ELT)-14, (ELT)-13,
#define INIT_TAB3_2(ELT)  (ELT)-12, (ELT)-11,
#define INIT_TAB4_2(ELT)  (ELT)-10, (ELT)-9,
86 
/* Three elements: initializer for vld3_lane tests (single slice
   only).  */
#define INIT_TAB_3(ELT)   (ELT)-16, (ELT)-15, (ELT)-14,

/* Four elements per slice.  */
#define INIT_TAB_4(ELT)   (ELT)-16, (ELT)-15, (ELT)-14, (ELT)-13,
#define INIT_TAB2_4(ELT)  (ELT)-12, (ELT)-11, (ELT)-10, (ELT)-9,
#define INIT_TAB3_4(ELT)  (ELT)-8, (ELT)-7, (ELT)-6, (ELT)-5,
#define INIT_TAB4_4(ELT)  (ELT)-4, (ELT)-3, (ELT)-2, (ELT)-1,
99 
/* Eight elements per slice: the four slices together cover -16 .. 15.  */
#define INIT_TAB_8(ELT)							\
  (ELT)-16, (ELT)-15, (ELT)-14, (ELT)-13,				\
  (ELT)-12, (ELT)-11, (ELT)-10, (ELT)-9,
#define INIT_TAB2_8(ELT)						\
  (ELT)-8, (ELT)-7, (ELT)-6, (ELT)-5,					\
  (ELT)-4, (ELT)-3, (ELT)-2, (ELT)-1,
#define INIT_TAB3_8(ELT)						\
  (ELT)0, (ELT)1, (ELT)2, (ELT)3,					\
  (ELT)4, (ELT)5, (ELT)6, (ELT)7,
#define INIT_TAB4_8(ELT)						\
  (ELT)8, (ELT)9, (ELT)10, (ELT)11,					\
  (ELT)12, (ELT)13, (ELT)14, (ELT)15,

/* Sixteen elements per slice: the four slices together cover
   -16 .. 47.  */
#define INIT_TAB_16(ELT)						\
  (ELT)-16, (ELT)-15, (ELT)-14, (ELT)-13,				\
  (ELT)-12, (ELT)-11, (ELT)-10, (ELT)-9,				\
  (ELT)-8, (ELT)-7, (ELT)-6, (ELT)-5,					\
  (ELT)-4, (ELT)-3, (ELT)-2, (ELT)-1,
#define INIT_TAB2_16(ELT)						\
  (ELT)0, (ELT)1, (ELT)2, (ELT)3,					\
  (ELT)4, (ELT)5, (ELT)6, (ELT)7,					\
  (ELT)8, (ELT)9, (ELT)10, (ELT)11,					\
  (ELT)12, (ELT)13, (ELT)14, (ELT)15,
#define INIT_TAB3_16(ELT)						\
  (ELT)16, (ELT)17, (ELT)18, (ELT)19,					\
  (ELT)20, (ELT)21, (ELT)22, (ELT)23,					\
  (ELT)24, (ELT)25, (ELT)26, (ELT)27,					\
  (ELT)28, (ELT)29, (ELT)30, (ELT)31,
#define INIT_TAB4_16(ELT)						\
  (ELT)32, (ELT)33, (ELT)34, (ELT)35,					\
  (ELT)36, (ELT)37, (ELT)38, (ELT)39,					\
  (ELT)40, (ELT)41, (ELT)42, (ELT)43,					\
  (ELT)44, (ELT)45, (ELT)46, (ELT)47,
121 
/* Input buffers, one of each size.  */
/* Insert some padding to try to exhibit out of bounds accesses.  */
/* First group: shapes whose total width (W * N) is 64 bits.  Each
   buffer definition is immediately followed by its PAD companion,
   which is the padding mentioned above.  PAD is not defined in this
   file; presumably it comes from stm-arm-neon-ref.h.  */
VECT_VAR_DECL_INIT(buffer, int, 8, 8);
PAD(buffer_pad, int, 8, 8);
VECT_VAR_DECL_INIT(buffer, int, 16, 4);
PAD(buffer_pad, int, 16, 4);
VECT_VAR_DECL_INIT(buffer, int, 32, 2);
PAD(buffer_pad, int, 32, 2);
VECT_VAR_DECL_INIT(buffer, int, 64, 1);
PAD(buffer_pad, int, 64, 1);
VECT_VAR_DECL_INIT(buffer, uint, 8, 8);
PAD(buffer_pad, uint, 8, 8);
VECT_VAR_DECL_INIT(buffer, poly, 8, 8);
PAD(buffer_pad, poly, 8, 8);
VECT_VAR_DECL_INIT(buffer, poly, 16, 4);
PAD(buffer_pad, poly, 16, 4);
VECT_VAR_DECL_INIT(buffer, uint, 16, 4);
PAD(buffer_pad, uint, 16, 4);
VECT_VAR_DECL_INIT(buffer, uint, 32, 2);
PAD(buffer_pad, uint, 32, 2);
VECT_VAR_DECL_INIT(buffer, uint, 64, 1);
PAD(buffer_pad, uint, 64, 1);
VECT_VAR_DECL_INIT(buffer, float, 32, 2);
PAD(buffer_pad, float, 32, 2);
/* Half-precision buffer: only built when __ARM_FP16_FORMAT_IEEE is
   defined and either bit 1 of __ARM_FP or bit 0 of
   __ARM_NEON_FP16_INTRINSICS is set.  */
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
/* We need a different initialization for ARMCC, because the compiler
   performs the conversion to half-precision internal
   representation.  */
#ifdef __ARMCC_VERSION
__fp16 buffer_float16x4[4] = {-16, -15, -14, -13};
#else
/* Other compilers: the initializers are written as IEEE
   half-precision bit patterns; the value each one encodes is given in
   the adjacent comment.  */
VECT_VAR_DECL(buffer, float, 16, 4) [] = {0xcc00 /* -16 */, 0xcb80 /* -15 */,
					  0xcb00 /* -14 */, 0xca80 /* -13 */};
#endif
PAD(buffer_pad, float, 16, 4);
#endif
/* Second group of plain input buffers: shapes whose total width
   (W * N) is 128 bits.  As for the 64-bit shapes, every buffer is
   followed by its PAD companion.  */
VECT_VAR_DECL_INIT(buffer, int, 8, 16);
PAD(buffer_pad, int, 8, 16);
VECT_VAR_DECL_INIT(buffer, int, 16, 8);
PAD(buffer_pad, int, 16, 8);
VECT_VAR_DECL_INIT(buffer, int, 32, 4);
PAD(buffer_pad, int, 32, 4);
VECT_VAR_DECL_INIT(buffer, int, 64, 2);
PAD(buffer_pad, int, 64, 2);
VECT_VAR_DECL_INIT(buffer, uint, 8, 16);
PAD(buffer_pad, uint, 8, 16);
VECT_VAR_DECL_INIT(buffer, uint, 16, 8);
PAD(buffer_pad, uint, 16, 8);
VECT_VAR_DECL_INIT(buffer, uint, 32, 4);
PAD(buffer_pad, uint, 32, 4);
VECT_VAR_DECL_INIT(buffer, uint, 64, 2);
PAD(buffer_pad, uint, 64, 2);
VECT_VAR_DECL_INIT(buffer, poly, 8, 16);
PAD(buffer_pad, poly, 8, 16);
VECT_VAR_DECL_INIT(buffer, poly, 16, 8);
PAD(buffer_pad, poly, 16, 8);
VECT_VAR_DECL_INIT(buffer, float, 32, 4);
PAD(buffer_pad, float, 32, 4);
/* Half-precision buffer: only built when __ARM_FP16_FORMAT_IEEE is
   defined and either bit 1 of __ARM_FP or bit 0 of
   __ARM_NEON_FP16_INTRINSICS is set.  */
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
#ifdef __ARMCC_VERSION
/* ARMCC: plain decimal initializers for the __fp16 array.  */
__fp16 buffer_float16x8[8] = {-16, -15, -14, -13, -12, -11, -10, -9};
#else
/* Other compilers: IEEE half-precision bit patterns, with the encoded
   value in the adjacent comment.  */
VECT_VAR_DECL(buffer, float, 16, 8) [] = {0xcc00 /* -16 */, 0xcb80 /* -15 */,
					  0xcb00 /* -14 */, 0xca80 /* -13 */,
					  0xca00 /* -12 */, 0xc980 /* -11 */,
					  0xc900 /* -10 */, 0xc880 /* -9 */};
#endif
PAD(buffer_pad, float, 16, 8);
#endif
191 
/* The tests for vld1_dup and vdup expect at least 4 entries in the
   input buffer, so force 1- and 2-elements initializers to have 4
   entries.  */
/* 64-bit shapes.  The 2- and 1-element shapes go through
   VECT_VAR_DECL_INIT4 so that they still receive 4 entries; the others
   use the regular N-entry initializer.
   NOTE(review): the *_pad companions of the integer, poly and float32
   buffers are declared with VECT_VAR_DECL here, whereas the float16
   one below uses PAD like the other buffer groups in this file --
   confirm that this difference is intended.  */
VECT_VAR_DECL_INIT(buffer_dup, int, 8, 8);
VECT_VAR_DECL(buffer_dup_pad, int, 8, 8);
VECT_VAR_DECL_INIT(buffer_dup, int, 16, 4);
VECT_VAR_DECL(buffer_dup_pad, int, 16, 4);
VECT_VAR_DECL_INIT4(buffer_dup, int, 32, 2);
VECT_VAR_DECL(buffer_dup_pad, int, 32, 2);
VECT_VAR_DECL_INIT4(buffer_dup, int, 64, 1);
VECT_VAR_DECL(buffer_dup_pad, int, 64, 1);
VECT_VAR_DECL_INIT(buffer_dup, uint, 8, 8);
VECT_VAR_DECL(buffer_dup_pad, uint, 8, 8);
VECT_VAR_DECL_INIT(buffer_dup, uint, 16, 4);
VECT_VAR_DECL(buffer_dup_pad, uint, 16, 4);
VECT_VAR_DECL_INIT4(buffer_dup, uint, 32, 2);
VECT_VAR_DECL(buffer_dup_pad, uint, 32, 2);
VECT_VAR_DECL_INIT4(buffer_dup, uint, 64, 1);
VECT_VAR_DECL(buffer_dup_pad, uint, 64, 1);
VECT_VAR_DECL_INIT(buffer_dup, poly, 8, 8);
VECT_VAR_DECL(buffer_dup_pad, poly, 8, 8);
VECT_VAR_DECL_INIT(buffer_dup, poly, 16, 4);
VECT_VAR_DECL(buffer_dup_pad, poly, 16, 4);
VECT_VAR_DECL_INIT4(buffer_dup, float, 32, 2);
VECT_VAR_DECL(buffer_dup_pad, float, 32, 2);
/* Half-precision buffer: only built when __ARM_FP16_FORMAT_IEEE is
   defined and either bit 1 of __ARM_FP or bit 0 of
   __ARM_NEON_FP16_INTRINSICS is set.  */
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
#ifdef __ARMCC_VERSION
/* ARMCC: plain decimal initializers for the __fp16 array.  */
__fp16 buffer_dup_float16x4[4] = {-16, -15, -14, -13};
#else
/* Other compilers: IEEE half-precision bit patterns, with the encoded
   value in the adjacent comment.  */
VECT_VAR_DECL(buffer_dup, float, 16, 4)[] = {0xcc00 /* -16 */, 0xcb80 /* -15 */,
					     0xcb00 /* -14 */, 0xca80 /* -13 */};
#endif
PAD(buffer_dup_pad, float, 16, 4);
#endif
/* 128-bit shapes of the vld1_dup / vdup input buffers.  Only the
   2-element (64x2) shapes need VECT_VAR_DECL_INIT4 to reach 4 entries;
   all the others already have at least 4.
   NOTE(review): the *_pad companions of the integer, poly and float32
   buffers are declared with VECT_VAR_DECL here, whereas the float16
   one below uses PAD like the other buffer groups in this file --
   confirm that this difference is intended.  */
VECT_VAR_DECL_INIT(buffer_dup, int, 8, 16);
VECT_VAR_DECL(buffer_dup_pad, int, 8, 16);
VECT_VAR_DECL_INIT(buffer_dup, int, 16, 8);
VECT_VAR_DECL(buffer_dup_pad, int, 16, 8);
VECT_VAR_DECL_INIT(buffer_dup, int, 32, 4);
VECT_VAR_DECL(buffer_dup_pad, int, 32, 4);
VECT_VAR_DECL_INIT4(buffer_dup, int, 64, 2);
VECT_VAR_DECL(buffer_dup_pad, int, 64, 2);
VECT_VAR_DECL_INIT(buffer_dup, uint, 8, 16);
VECT_VAR_DECL(buffer_dup_pad, uint, 8, 16);
VECT_VAR_DECL_INIT(buffer_dup, uint, 16, 8);
VECT_VAR_DECL(buffer_dup_pad, uint, 16, 8);
VECT_VAR_DECL_INIT(buffer_dup, uint, 32, 4);
VECT_VAR_DECL(buffer_dup_pad, uint, 32, 4);
VECT_VAR_DECL_INIT4(buffer_dup, uint, 64, 2);
VECT_VAR_DECL(buffer_dup_pad, uint, 64, 2);
VECT_VAR_DECL_INIT(buffer_dup, poly, 8, 16);
VECT_VAR_DECL(buffer_dup_pad, poly, 8, 16);
VECT_VAR_DECL_INIT(buffer_dup, poly, 16, 8);
VECT_VAR_DECL(buffer_dup_pad, poly, 16, 8);
VECT_VAR_DECL_INIT(buffer_dup, float, 32, 4);
VECT_VAR_DECL(buffer_dup_pad, float, 32, 4);
/* Half-precision buffer: only built when __ARM_FP16_FORMAT_IEEE is
   defined and either bit 1 of __ARM_FP or bit 0 of
   __ARM_NEON_FP16_INTRINSICS is set.  */
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
#ifdef __ARMCC_VERSION
/* ARMCC: plain decimal initializers for the __fp16 array.  */
__fp16 buffer_dup_float16x8[8] = {-16, -15, -14, -13, -12, -11, -10, -9};
#else
/* Other compilers: IEEE half-precision bit patterns, with the encoded
   value in the adjacent comment.  */
VECT_VAR_DECL(buffer_dup, float, 16, 8)[] = {0xcc00 /* -16 */, 0xcb80 /* -15 */,
					     0xcb00 /* -14 */, 0xca80 /* -13 */,
					     0xca00 /* -12 */, 0xc980 /* -11 */,
					     0xc900 /* -10 */, 0xc880 /* -9 */};
#endif
PAD(buffer_dup_pad, float, 16, 8);
#endif
259 
/* Input buffers for vld2, 1 of each size */
/* 64-bit shapes.  VECT_ARRAY_INIT2 fills each buffer with the first
   two initializer slices for N, i.e. 2 * N consecutive values starting
   at -16.  Each buffer is followed by its PAD companion.  */
VECT_ARRAY_INIT2(buffer_vld2, int, 8, 8);
PAD(buffer_vld2_pad, int, 8, 8);
VECT_ARRAY_INIT2(buffer_vld2, int, 16, 4);
PAD(buffer_vld2_pad, int, 16, 4);
VECT_ARRAY_INIT2(buffer_vld2, int, 32, 2);
PAD(buffer_vld2_pad, int, 32, 2);
VECT_ARRAY_INIT2(buffer_vld2, int, 64, 1);
PAD(buffer_vld2_pad, int, 64, 1);
VECT_ARRAY_INIT2(buffer_vld2, uint, 8, 8);
PAD(buffer_vld2_pad, uint, 8, 8);
VECT_ARRAY_INIT2(buffer_vld2, uint, 16, 4);
PAD(buffer_vld2_pad, uint, 16, 4);
VECT_ARRAY_INIT2(buffer_vld2, uint, 32, 2);
PAD(buffer_vld2_pad, uint, 32, 2);
VECT_ARRAY_INIT2(buffer_vld2, uint, 64, 1);
PAD(buffer_vld2_pad, uint, 64, 1);
VECT_ARRAY_INIT2(buffer_vld2, poly, 8, 8);
PAD(buffer_vld2_pad, poly, 8, 8);
VECT_ARRAY_INIT2(buffer_vld2, poly, 16, 4);
PAD(buffer_vld2_pad, poly, 16, 4);
VECT_ARRAY_INIT2(buffer_vld2, float, 32, 2);
PAD(buffer_vld2_pad, float, 32, 2);
/* Half-precision buffer: only built when __ARM_FP16_FORMAT_IEEE is
   defined and either bit 1 of __ARM_FP or bit 0 of
   __ARM_NEON_FP16_INTRINSICS is set.  Its name and size are spelled
   out by hand instead of going through VECT_ARRAY_INIT2.  */
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
#ifdef __ARMCC_VERSION
/* ARMCC: plain decimal initializers for the __fp16 array.  */
__fp16 buffer_vld2_float16x4x2[4*2] = {-16, -15, -14, -13, -12, -11, -10, -9};
#else
/* Other compilers: IEEE half-precision bit patterns, with the encoded
   value in the adjacent comment.  */
float16_t buffer_vld2_float16x4x2[4*2] = {0xcc00 /* -16 */, 0xcb80 /* -15 */,
					  0xcb00 /* -14 */, 0xca80 /* -13 */,
					  0xca00 /* -12 */, 0xc980 /* -11 */,
					  0xc900 /* -10 */, 0xc880 /* -9 */};
#endif
PAD(buffer_vld2_pad, float, 16, 4);
#endif
/* 128-bit shapes of the vld2 input buffers: two initializer slices
   (2 * N values) per buffer, each followed by its PAD companion.  */
VECT_ARRAY_INIT2(buffer_vld2, int, 8, 16);
PAD(buffer_vld2_pad, int, 8, 16);
VECT_ARRAY_INIT2(buffer_vld2, int, 16, 8);
PAD(buffer_vld2_pad, int, 16, 8);
VECT_ARRAY_INIT2(buffer_vld2, int, 32, 4);
PAD(buffer_vld2_pad, int, 32, 4);
VECT_ARRAY_INIT2(buffer_vld2, int, 64, 2);
PAD(buffer_vld2_pad, int, 64, 2);
VECT_ARRAY_INIT2(buffer_vld2, uint, 8, 16);
PAD(buffer_vld2_pad, uint, 8, 16);
VECT_ARRAY_INIT2(buffer_vld2, uint, 16, 8);
PAD(buffer_vld2_pad, uint, 16, 8);
VECT_ARRAY_INIT2(buffer_vld2, uint, 32, 4);
PAD(buffer_vld2_pad, uint, 32, 4);
VECT_ARRAY_INIT2(buffer_vld2, uint, 64, 2);
PAD(buffer_vld2_pad, uint, 64, 2);
VECT_ARRAY_INIT2(buffer_vld2, poly, 8, 16);
PAD(buffer_vld2_pad, poly, 8, 16);
VECT_ARRAY_INIT2(buffer_vld2, poly, 16, 8);
PAD(buffer_vld2_pad, poly, 16, 8);
VECT_ARRAY_INIT2(buffer_vld2, float, 32, 4);
PAD(buffer_vld2_pad, float, 32, 4);
/* Half-precision buffer: only built when __ARM_FP16_FORMAT_IEEE is
   defined and either bit 1 of __ARM_FP or bit 0 of
   __ARM_NEON_FP16_INTRINSICS is set.  Its name and size are spelled
   out by hand instead of going through VECT_ARRAY_INIT2.  */
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
#ifdef __ARMCC_VERSION
/* ARMCC: plain decimal initializers for the __fp16 array.  */
__fp16 buffer_vld2_float16x8x2[8*2] = {-16, -15, -14, -13, -12, -11, -10, -9,
				       -8, -7, -6, -5, -4, -3, -2, -1};
#else
/* Other compilers: IEEE half-precision bit patterns, with the encoded
   value in the adjacent comment.  */
float16_t buffer_vld2_float16x8x2[8*2] = {0xcc00 /* -16 */, 0xcb80 /* -15 */,
					  0xcb00 /* -14 */, 0xca80 /* -13 */,
					  0xca00 /* -12 */, 0xc980 /* -11 */,
					  0xc900 /* -10 */, 0xc880 /* -9 */,
					  0xc800 /* -8 */, 0xc700 /* -7 */,
					  0xc600 /* -6 */, 0xc500 /* -5 */,
					  0xc400 /* -4 */, 0xc200 /* -3 */,
					  0xc000 /* -2 */, 0xbc00 /* -1 */};
#endif
PAD(buffer_vld2_pad, float, 16, 8);
#endif
332 
/* Input buffers for vld3, 1 of each size */
/* 64-bit shapes.  VECT_ARRAY_INIT3 fills each buffer with the first
   three initializer slices for N, i.e. 3 * N consecutive values
   starting at -16.  Each buffer is followed by its PAD companion.  */
VECT_ARRAY_INIT3(buffer_vld3, int, 8, 8);
PAD(buffer_vld3_pad, int, 8, 8);
VECT_ARRAY_INIT3(buffer_vld3, int, 16, 4);
PAD(buffer_vld3_pad, int, 16, 4);
VECT_ARRAY_INIT3(buffer_vld3, int, 32, 2);
PAD(buffer_vld3_pad, int, 32, 2);
VECT_ARRAY_INIT3(buffer_vld3, int, 64, 1);
PAD(buffer_vld3_pad, int, 64, 1);
VECT_ARRAY_INIT3(buffer_vld3, uint, 8, 8);
PAD(buffer_vld3_pad, uint, 8, 8);
VECT_ARRAY_INIT3(buffer_vld3, uint, 16, 4);
PAD(buffer_vld3_pad, uint, 16, 4);
VECT_ARRAY_INIT3(buffer_vld3, uint, 32, 2);
PAD(buffer_vld3_pad, uint, 32, 2);
VECT_ARRAY_INIT3(buffer_vld3, uint, 64, 1);
PAD(buffer_vld3_pad, uint, 64, 1);
VECT_ARRAY_INIT3(buffer_vld3, poly, 8, 8);
PAD(buffer_vld3_pad, poly, 8, 8);
VECT_ARRAY_INIT3(buffer_vld3, poly, 16, 4);
PAD(buffer_vld3_pad, poly, 16, 4);
VECT_ARRAY_INIT3(buffer_vld3, float, 32, 2);
PAD(buffer_vld3_pad, float, 32, 2);
/* Half-precision buffer: only built when __ARM_FP16_FORMAT_IEEE is
   defined and either bit 1 of __ARM_FP or bit 0 of
   __ARM_NEON_FP16_INTRINSICS is set.  Its name and size are spelled
   out by hand instead of going through VECT_ARRAY_INIT3.  */
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
#ifdef __ARMCC_VERSION
/* ARMCC: plain decimal initializers for the __fp16 array.  */
__fp16 buffer_vld3_float16x4x3[4*3] = {-16, -15, -14, -13, -12, -11, -10, -9,
				       -8, -7, -6, -5};
#else
/* Other compilers: IEEE half-precision bit patterns, with the encoded
   value in the adjacent comment.  */
float16_t buffer_vld3_float16x4x3[4*3] = {0xcc00 /* -16 */, 0xcb80 /* -15 */,
					  0xcb00 /* -14 */, 0xca80 /* -13 */,
					  0xca00 /* -12 */, 0xc980 /* -11 */,
					  0xc900 /* -10 */, 0xc880 /* -9 */,
					  0xc800 /* -8 */, 0xc700 /* -7 */,
					  0xc600 /* -6 */, 0xc500 /* -5 */};
#endif
PAD(buffer_vld3_pad, float, 16, 4);
#endif
/* 128-bit shapes of the vld3 input buffers: three initializer slices
   (3 * N values) per buffer, each followed by its PAD companion.  */
VECT_ARRAY_INIT3(buffer_vld3, int, 8, 16);
PAD(buffer_vld3_pad, int, 8, 16);
VECT_ARRAY_INIT3(buffer_vld3, int, 16, 8);
PAD(buffer_vld3_pad, int, 16, 8);
VECT_ARRAY_INIT3(buffer_vld3, int, 32, 4);
PAD(buffer_vld3_pad, int, 32, 4);
VECT_ARRAY_INIT3(buffer_vld3, int, 64, 2);
PAD(buffer_vld3_pad, int, 64, 2);
VECT_ARRAY_INIT3(buffer_vld3, uint, 8, 16);
PAD(buffer_vld3_pad, uint, 8, 16);
VECT_ARRAY_INIT3(buffer_vld3, uint, 16, 8);
PAD(buffer_vld3_pad, uint, 16, 8);
VECT_ARRAY_INIT3(buffer_vld3, uint, 32, 4);
PAD(buffer_vld3_pad, uint, 32, 4);
VECT_ARRAY_INIT3(buffer_vld3, uint, 64, 2);
PAD(buffer_vld3_pad, uint, 64, 2);
VECT_ARRAY_INIT3(buffer_vld3, poly, 8, 16);
PAD(buffer_vld3_pad, poly, 8, 16);
VECT_ARRAY_INIT3(buffer_vld3, poly, 16, 8);
PAD(buffer_vld3_pad, poly, 16, 8);
VECT_ARRAY_INIT3(buffer_vld3, float, 32, 4);
PAD(buffer_vld3_pad, float, 32, 4);
/* Half-precision buffer: only built when __ARM_FP16_FORMAT_IEEE is
   defined and either bit 1 of __ARM_FP or bit 0 of
   __ARM_NEON_FP16_INTRINSICS is set.  Its name and size are spelled
   out by hand instead of going through VECT_ARRAY_INIT3.  */
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
#ifdef __ARMCC_VERSION
/* ARMCC: plain decimal initializers for the __fp16 array.  */
__fp16 buffer_vld3_float16x8x3[8*3] = {-16, -15, -14, -13, -12, -11, -10, -9,
				       -8, -7, -6, -5, -4, -3, -2, -1,
				       0, 1, 2, 3, 4, 5, 6, 7};
#else
/* Other compilers: IEEE half-precision bit patterns for -16 .. 7,
   with the encoded value in the adjacent comment.  */
float16_t buffer_vld3_float16x8x3[8*3] = {0xcc00 /* -16 */, 0xcb80 /* -15 */,
					  0xcb00 /* -14 */, 0xca80 /* -13 */,
					  0xca00 /* -12 */, 0xc980 /* -11 */,
					  0xc900 /* -10 */, 0xc880 /* -9 */,
					  0xc800 /* -8 */, 0xc700 /* -7 */,
					  0xc600 /* -6 */, 0xc500 /* -5 */,
					  0xc400 /* -4 */, 0xc200 /* -3 */,
					  0xc000 /* -2 */, 0xbc00 /* -1 */,
					  0, 0x3c00 /* 1 */,
					  0x4000 /* 2 */, 0x4200 /* 3 */,
					  0x4400 /* 4 */, 0x4500 /* 5 */,
					  0x4600 /* 6 */, 0x4700 /* 7 */};
#endif
PAD(buffer_vld3_pad, float, 16, 8);
#endif
413 
/* Input buffers for vld4, 1 of each size */
/* 64-bit shapes.  VECT_ARRAY_INIT4 fills each buffer with all four
   initializer slices for N, i.e. 4 * N consecutive values starting at
   -16.  Each buffer is followed by its PAD companion.  */
VECT_ARRAY_INIT4(buffer_vld4, int, 8, 8);
PAD(buffer_vld4_pad, int, 8, 8);
VECT_ARRAY_INIT4(buffer_vld4, int, 16, 4);
PAD(buffer_vld4_pad, int, 16, 4);
VECT_ARRAY_INIT4(buffer_vld4, int, 32, 2);
PAD(buffer_vld4_pad, int, 32, 2);
VECT_ARRAY_INIT4(buffer_vld4, int, 64, 1);
PAD(buffer_vld4_pad, int, 64, 1);
VECT_ARRAY_INIT4(buffer_vld4, uint, 8, 8);
PAD(buffer_vld4_pad, uint, 8, 8);
VECT_ARRAY_INIT4(buffer_vld4, uint, 16, 4);
PAD(buffer_vld4_pad, uint, 16, 4);
VECT_ARRAY_INIT4(buffer_vld4, uint, 32, 2);
PAD(buffer_vld4_pad, uint, 32, 2);
VECT_ARRAY_INIT4(buffer_vld4, uint, 64, 1);
PAD(buffer_vld4_pad, uint, 64, 1);
VECT_ARRAY_INIT4(buffer_vld4, poly, 8, 8);
PAD(buffer_vld4_pad, poly, 8, 8);
VECT_ARRAY_INIT4(buffer_vld4, poly, 16, 4);
PAD(buffer_vld4_pad, poly, 16, 4);
VECT_ARRAY_INIT4(buffer_vld4, float, 32, 2);
PAD(buffer_vld4_pad, float, 32, 2);
/* Half-precision buffer: only built when __ARM_FP16_FORMAT_IEEE is
   defined and either bit 1 of __ARM_FP or bit 0 of
   __ARM_NEON_FP16_INTRINSICS is set.  Its name and size are spelled
   out by hand instead of going through VECT_ARRAY_INIT4.  */
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
#ifdef __ARMCC_VERSION
/* ARMCC: plain decimal initializers for the __fp16 array.  */
__fp16 buffer_vld4_float16x4x4[4*4] = {-16, -15, -14, -13, -12, -11, -10, -9,
				       -8, -7, -6, -5, -4, -3, -2, -1};
#else
/* Other compilers: IEEE half-precision bit patterns, with the encoded
   value in the adjacent comment.  */
float16_t buffer_vld4_float16x4x4[4*4] = {0xcc00 /* -16 */, 0xcb80 /* -15 */,
					  0xcb00 /* -14 */, 0xca80 /* -13 */,
					  0xca00 /* -12 */, 0xc980 /* -11 */,
					  0xc900 /* -10 */, 0xc880 /* -9 */,
					  0xc800 /* -8 */, 0xc700 /* -7 */,
					  0xc600 /* -6 */, 0xc500 /* -5 */,
					  0xc400 /* -4 */, 0xc200 /* -3 */,
					  0xc000 /* -2 */, 0xbc00 /* -1 */};
#endif
PAD(buffer_vld4_pad, float, 16, 4);
#endif
/* 128-bit shapes of the vld4 input buffers: all four initializer
   slices (4 * N values) per buffer, each followed by its PAD
   companion.  */
VECT_ARRAY_INIT4(buffer_vld4, int, 8, 16);
PAD(buffer_vld4_pad, int, 8, 16);
VECT_ARRAY_INIT4(buffer_vld4, int, 16, 8);
PAD(buffer_vld4_pad, int, 16, 8);
VECT_ARRAY_INIT4(buffer_vld4, int, 32, 4);
PAD(buffer_vld4_pad, int, 32, 4);
VECT_ARRAY_INIT4(buffer_vld4, int, 64, 2);
PAD(buffer_vld4_pad, int, 64, 2);
VECT_ARRAY_INIT4(buffer_vld4, uint, 8, 16);
PAD(buffer_vld4_pad, uint, 8, 16);
VECT_ARRAY_INIT4(buffer_vld4, uint, 16, 8);
PAD(buffer_vld4_pad, uint, 16, 8);
VECT_ARRAY_INIT4(buffer_vld4, uint, 32, 4);
PAD(buffer_vld4_pad, uint, 32, 4);
VECT_ARRAY_INIT4(buffer_vld4, uint, 64, 2);
PAD(buffer_vld4_pad, uint, 64, 2);
VECT_ARRAY_INIT4(buffer_vld4, poly, 8, 16);
PAD(buffer_vld4_pad, poly, 8, 16);
VECT_ARRAY_INIT4(buffer_vld4, poly, 16, 8);
PAD(buffer_vld4_pad, poly, 16, 8);
VECT_ARRAY_INIT4(buffer_vld4, float, 32, 4);
PAD(buffer_vld4_pad, float, 32, 4);
/* Half-precision 8x4 input buffer for vld4: only built when
   __ARM_FP16_FORMAT_IEEE is defined and either bit 1 of __ARM_FP or
   bit 0 of __ARM_NEON_FP16_INTRINSICS is set.  */
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
#ifdef __ARMCC_VERSION
/* ARMCC: plain decimal initializers for the __fp16 array.  */
__fp16 buffer_vld4_float16x8x4[8*4] = {-16, -15, -14, -13, -12, -11, -10, -9,
				       -8, -7, -6, -5, -4, -3, -2, -1,
				       0, 1, 2, 3, 4, 5, 6, 7,
				       8, 9, 10, 11, 12, 13, 14, 15};
#else
/* Other compilers: IEEE half-precision bit patterns for -16 .. 15,
   with the encoded value in the adjacent comment.  */
float16_t buffer_vld4_float16x8x4[8*4] = {0xcc00 /* -16 */, 0xcb80 /* -15 */,
					  0xcb00 /* -14 */, 0xca80 /* -13 */,
					  0xca00 /* -12 */, 0xc980 /* -11 */,
					  0xc900 /* -10 */, 0xc880 /* -9 */,
					  0xc800 /* -8 */, 0xc700 /* -7 */,
					  0xc600 /* -6 */, 0xc500 /* -5 */,
					  0xc400 /* -4 */, 0xc200 /* -3 */,
					  0xc000 /* -2 */, 0xbc00 /* -1 */,
					  0, 0x3c00 /* 1 */,
					  0x4000 /* 2 */, 0x4200 /* 3 */,
					  0x4400 /* 4 */, 0x4500 /* 5 */,
					  0x4600 /* 6 */, 0x4700 /* 7 */,
					  0x4800 /* 8 */, 0x4880 /* 9 */,
					  0x4900 /* 10 */, 0x4980 /* 11 */,
					  0x4a00 /* 12 */, 0x4a80 /* 13 */,
					  0x4b00 /* 14 */, 0x4b80 /* 15 */};
#endif
PAD(buffer_vld4_pad, float, 16, 8);
#endif
501 
/* Input buffers for vld2_lane */
/* Here the last macro argument is the number of elements in the
   buffer (2) and is the same for every element width; each buffer
   therefore holds -16, -15.  No PAD companion is defined for these
   buffers.  */
VECT_VAR_DECL_INIT(buffer_vld2_lane, int, 8, 2);
VECT_VAR_DECL_INIT(buffer_vld2_lane, int, 16, 2);
VECT_VAR_DECL_INIT(buffer_vld2_lane, int, 32, 2);
VECT_VAR_DECL_INIT(buffer_vld2_lane, int, 64, 2);
VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 8, 2);
VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 16, 2);
VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 32, 2);
VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 64, 2);
VECT_VAR_DECL_INIT(buffer_vld2_lane, poly, 8, 2);
VECT_VAR_DECL_INIT(buffer_vld2_lane, poly, 16, 2);
VECT_VAR_DECL_INIT(buffer_vld2_lane, float, 32, 2);
/* Half-precision buffer: only built when __ARM_FP16_FORMAT_IEEE is
   defined and either bit 1 of __ARM_FP or bit 0 of
   __ARM_NEON_FP16_INTRINSICS is set.  */
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
#ifdef __ARMCC_VERSION
/* ARMCC: plain decimal initializers for the __fp16 array.  */
__fp16 buffer_vld2_lane_float16x2[2] = {-16, -15};
#else
/* Other compilers: IEEE half-precision bit patterns, with the encoded
   value in the adjacent comment.  */
VECT_VAR_DECL(buffer_vld2_lane, float, 16, 2) [] = {0xcc00 /* -16 */,
						    0xcb80 /* -15 */};
#endif
#endif
522 
/* Input buffers for vld3_lane */
/* Here the last macro argument is the number of elements in the
   buffer (3) and is the same for every element width; each buffer
   therefore holds -16, -15, -14 (from INIT_TAB_3).  No PAD companion
   is defined for these buffers.  */
VECT_VAR_DECL_INIT(buffer_vld3_lane, int, 8, 3);
VECT_VAR_DECL_INIT(buffer_vld3_lane, int, 16, 3);
VECT_VAR_DECL_INIT(buffer_vld3_lane, int, 32, 3);
VECT_VAR_DECL_INIT(buffer_vld3_lane, int, 64, 3);
VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 8, 3);
VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 16, 3);
VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 32, 3);
VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 64, 3);
VECT_VAR_DECL_INIT(buffer_vld3_lane, poly, 8, 3);
VECT_VAR_DECL_INIT(buffer_vld3_lane, poly, 16, 3);
VECT_VAR_DECL_INIT(buffer_vld3_lane, float, 32, 3);
/* Half-precision buffer: only built when __ARM_FP16_FORMAT_IEEE is
   defined and either bit 1 of __ARM_FP or bit 0 of
   __ARM_NEON_FP16_INTRINSICS is set.  */
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
#ifdef __ARMCC_VERSION
/* ARMCC: plain decimal initializers for the __fp16 array.  */
__fp16 buffer_vld3_lane_float16x3[3] = {-16, -15, -14};
#else
/* Other compilers: IEEE half-precision bit patterns, with the encoded
   value in the adjacent comment.  */
VECT_VAR_DECL(buffer_vld3_lane, float, 16, 3) [] = {0xcc00 /* -16 */,
						    0xcb80 /* -15 */,
						    0xcb00 /* -14 */};
#endif
#endif
544 
/* Input buffers for vld4_lane */
/* Here the last macro argument is the number of elements in the
   buffer (4) and is the same for every element width; each buffer
   therefore holds -16, -15, -14, -13.  No PAD companion is defined for
   these buffers.  */
VECT_VAR_DECL_INIT(buffer_vld4_lane, int, 8, 4);
VECT_VAR_DECL_INIT(buffer_vld4_lane, int, 16, 4);
VECT_VAR_DECL_INIT(buffer_vld4_lane, int, 32, 4);
VECT_VAR_DECL_INIT(buffer_vld4_lane, int, 64, 4);
VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 8, 4);
VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 16, 4);
VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 32, 4);
VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 64, 4);
VECT_VAR_DECL_INIT(buffer_vld4_lane, poly, 8, 4);
VECT_VAR_DECL_INIT(buffer_vld4_lane, poly, 16, 4);
VECT_VAR_DECL_INIT(buffer_vld4_lane, float, 32, 4);
/* Half-precision buffer: only built when __ARM_FP16_FORMAT_IEEE is
   defined and either bit 1 of __ARM_FP or bit 0 of
   __ARM_NEON_FP16_INTRINSICS is set.  */
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
#ifdef __ARMCC_VERSION
/* ARMCC: plain decimal initializers for the __fp16 array.  */
__fp16 buffer_vld4_lane_float16x4[4] = {-16, -15, -14, -13};
#else
/* Other compilers: IEEE half-precision bit patterns, with the encoded
   value in the adjacent comment.  */
VECT_VAR_DECL(buffer_vld4_lane, float, 16, 4) [] = {0xcc00 /* -16 */,
						    0xcb80 /* -15 */,
						    0xcb00 /* -14 */,
						    0xca80 /* -13 */};
#endif
#endif
567