1 /*
2 
3 Copyright (c) 2009, 2010, 2011, 2012, 2013 STMicroelectronics
4 Written by Christophe Lyon
5 
6 Permission is hereby granted, free of charge, to any person obtaining a copy
7 of this software and associated documentation files (the "Software"), to deal
8 in the Software without restriction, including without limitation the rights
9 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 copies of the Software, and to permit persons to whom the Software is
11 furnished to do so, subject to the following conditions:
12 
13 The above copyright notice and this permission notice shall be included in
14 all copies or substantial portions of the Software.
15 
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 THE SOFTWARE.
23 
24 */
25 
26 #ifndef _STM_ARM_NEON_REF_H_
27 #define _STM_ARM_NEON_REF_H_
28 
29 #if defined(__cplusplus)
30 #include <cstdio>
31 #include <cinttypes>
32 #include <cstring>
33 #else
34 #include <stdio.h>
35 #if defined(_MSC_VER)
36 #include "msinttypes.h"
37 #include <float.h> /* for isnan() ... */
38 static int32_t _ptrNan[]={0x7fc00000L};
39 #define NAN (*(float*)_ptrNan)
40 static int32_t _ptrInf[]={0x7f800000L};
41 #define INFINITY (*(float*)_ptrInf)
42 #define HUGE_VALF INFINITY
43 #else
44 #include <inttypes.h>
45 #endif
46 #include <string.h>
47 #endif
48 
/* Two-level stringification: STR(X) macro-expands X first, then xSTR
   turns the expansion into a string literal.  */
#define xSTR(X) #X
#define STR(X) xSTR(X)

/* Two-level token pasting: xNAME(V,T) macro-expands both arguments
   before xNAME1 glues them together as V_T.  */
#define xNAME1(V,T) V ## _ ##  T
#define xNAME(V,T) xNAME1(V,T)

/* VAR(v,int,8) names the scalar variable v_int8; VAR_DECL declares it
   with type int8_t.  */
#define VAR(V,T,W) xNAME(V,T##W)
#define VAR_DECL(V, T, W) T##W##_t VAR(V,T,W)

/* Type-name helpers: VECT_NAME(int,8,8) is int8x8, VECT_TYPE appends
   _t (int8x8_t); the _ARRAY_ forms add the xL suffix (int8x8x2,
   int8x8x2_t).  */
#define VECT_NAME(T, W, N) T##W##x##N
#define VECT_ARRAY_NAME(T, W, N, L) T##W##x##N##x##L
#define VECT_TYPE(T, W, N) xNAME(VECT_NAME(T,W,N),t)
#define VECT_ARRAY_TYPE(T, W, N, L) xNAME(VECT_ARRAY_NAME(T,W,N,L),t)

/* VECT_VAR(v,int,8,8) names v_int8x8.  VECT_VAR_DECL declares that
   name with the *element* type (int8_t), so it is normally followed by
   an array suffix.  */
#define VECT_VAR(V,T,W,N) xNAME(V,VECT_NAME(T,W,N))
#define VECT_VAR_DECL(V, T, W, N) T##W##_t VECT_VAR(V,T,W,N)

/* This one is used for padding between input buffers.  It defines a
   single char initialized to 42; note that the expansion already ends
   with a semicolon.  */
#define PAD(V, T, W, N) char VECT_VAR(V,T,W,N)=42;

/* Array declarations: ARRAY has N elements, ARRAY4 always has 4
   elements whatever N is.  */
#define ARRAY(V, T, W, N) VECT_VAR_DECL(V,T,W,N)[N]
#define ARRAY4(V, T, W, N) VECT_VAR_DECL(V,T,W,N)[4]

/* Arrays of vectors: a flat array of N*L elements named
   V_<T><W>x<N>x<L>.  */
#define VECT_ARRAY_VAR(V,T,W,N,L) xNAME(V,VECT_ARRAY_NAME(T,W,N,L))
#define VECT_ARRAY(V, T, W, N, L) T##W##_t VECT_ARRAY_VAR(V,T,W,N,L)[N*L]
76 
/* Running index printed at the start of each line written to ref_file
   by the DUMP* macros and by dump_neon_cumulative_sat();
   clean_results() resets it to 0.  */
static int result_idx = 0;

/* Write the N elements of result_<T><W>x<N> to ref_file using the
   printf conversion FMT, then emit the matching "expected" initializer
   to gcc_tests_file through DUMP4GCC.
   Relies on a variable named `i' being declared in the enclosing
   scope, and expands to several statements rather than a single
   one.  */
#define DUMP(MSG,T,W,N,FMT)						\
  fprintf(ref_file, "%s:%d:%s [] = { ", MSG, result_idx++,		\
	  STR(VECT_VAR(result, T, W, N)));				\
  for(i=0; i<N ; i++)							\
    {									\
      fprintf(ref_file, "%" FMT ", ", VECT_VAR(result, T, W, N)[i]);	\
    }									\
  fprintf(ref_file, " }\n");						\
  DUMP4GCC(MSG,T,W,N,FMT);

/* Same as DUMP, but each element is cast to uint<W>_t before being
   printed: use casts to remove sign bits.  */
#define DUMP_POLY(MSG,T,W,N,FMT)					\
  fprintf(ref_file, "%s:%d:%s [] = { ", MSG, result_idx++,		\
	  STR(VECT_VAR(result, T, W, N)));				\
  for(i=0; i<N ; i++)							\
    {									\
      fprintf(ref_file, "%" FMT ", ",					\
	      (uint##W##_t)VECT_VAR(result, T, W, N)[i]);		\
    }									\
  fprintf(ref_file, " }\n");						\
  DUMP4GCC(MSG,T,W,N,FMT);

/* Floating-point variant of DUMP: each element is copied through a
   union of uint<W>_t and float<W>_t and the integer member is printed,
   so the output shows the raw bit pattern of the value.  The
   gcc_tests_file part is produced by DUMP4GCC_FP.  */
#define DUMP_FP(MSG,T,W,N,FMT)						\
  fprintf(ref_file, "%s:%d:%s [] = { ", MSG, result_idx++,		\
	  STR(VECT_VAR(result, T, W, N)));				\
  for(i=0; i<N ; i++)							\
    {									\
      union fp_operand {						\
	uint##W##_t i;							\
	float##W##_t f;							\
      } tmp;								\
      tmp.f = VECT_VAR(result, T, W, N)[i];				\
      fprintf(ref_file, "%" FMT ", ", tmp.i);				\
    }									\
  fprintf(ref_file, " }\n");						\
  DUMP4GCC_FP(MSG,T,W,N,FMT);
114 
/* Write to gcc_tests_file a line of the form
     VECT_VAR_DECL(expected,<T>,<W>,<N>) [] = { 0x.., 0x.., ... };
   holding the elements of result_<T><W>x<N>.  Every element is
   prefixed with "0x" whatever FMT is.  The loop handles the first N-1
   elements (followed by ", "); the last one is printed after the loop,
   where i == N-1, without a trailing separator.
   For W < 32 the element is first cast to uint<W>_t and then stored in
   a uint32_t before being printed.  Both branches of the `if' are
   compiled for every W; only one is executed.
   MSG is unused.  Relies on `i' from the enclosing scope.  */
#define DUMP4GCC(MSG,T,W,N,FMT)						\
  fprintf(gcc_tests_file, "VECT_VAR_DECL(expected,%s,%d,%d) [] = { ",	\
	  STR(T), W, N);						\
  for(i=0; i<(N-1) ; i++)						\
    {									\
      if (W < 32) {							\
	uint32_t tmp = (uint##W##_t) VECT_VAR(result, T, W, N)[i];	\
	fprintf(gcc_tests_file, "0x%" FMT ", ", tmp);			\
      } else {								\
	fprintf(gcc_tests_file, "0x%" FMT ", ", VECT_VAR(result, T, W, N)[i]); \
      }									\
    }									\
  if (W < 32) {								\
    uint32_t tmp = (uint##W##_t) VECT_VAR(result, T, W, N)[i];		\
    fprintf(gcc_tests_file, "0x%" FMT, tmp);				\
  } else {								\
    fprintf(gcc_tests_file, "0x%" FMT, VECT_VAR(result, T, W, N)[i]);	\
  }									\
  fprintf(gcc_tests_file, " };\n");

/* Floating-point counterpart of DUMP4GCC.  The type name written to
   gcc_tests_file is always "hfloat" (T is only used to locate the
   result buffer), and each element is printed as the integer member of
   a uint<W>_t/float<W>_t union, i.e. as its raw bit pattern.
   MSG is unused.  Relies on `i' from the enclosing scope.  */
#define DUMP4GCC_FP(MSG,T,W,N,FMT)					\
  {									\
    union fp_operand {							\
      uint##W##_t i;							\
      float##W##_t f;							\
    } tmp;								\
    fprintf(gcc_tests_file, "VECT_VAR_DECL(expected,%s,%d,%d) [] = { ",	\
	    "hfloat", W, N);						\
    for(i=0; i<(N-1) ; i++)						\
      {									\
	tmp.f = VECT_VAR(result, T, W, N)[i];				\
	fprintf(gcc_tests_file, "0x%" FMT ", ", tmp.i);			\
      }									\
    tmp.f = VECT_VAR(result, T, W, N)[i];				\
    fprintf(gcc_tests_file, "0x%" FMT, tmp.i);				\
    fprintf(gcc_tests_file, " };\n");					\
  }
152 
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
/* Half-precision element type used by the float16 buffers.  */
#define float16_t __fp16

/* Half-precision variant of DUMP_FP: write the N elements of
   result_<T><W>x<N> to ref_file as raw bit patterns, then emit the
   matching initializer to gcc_tests_file through DUMP4GCC_FP16.
   Each element goes through a uint<W>_t/float<W>_t union, exactly as
   DUMP_FP does for float32.  A plain (uint<W>_t) cast would instead
   convert the *value* of the half-float to an integer (dropping the
   fraction, and undefined for negative values).
   Relies on `i' from the enclosing scope and expands to several
   statements.  */
#define DUMP_FP16(MSG,T,W,N,FMT)					\
  fprintf(ref_file, "%s:%d:%s [] = { ", MSG, result_idx++,		\
	  STR(VECT_VAR(result, T, W, N)));				\
  for(i=0; i<N ; i++)							\
    {									\
      union fp16_operand {						\
	uint##W##_t i;							\
	float##W##_t f;							\
      } tmp;								\
      tmp.f = VECT_VAR(result, T, W, N)[i];				\
      fprintf(ref_file, "%" FMT ", ", tmp.i);				\
    }									\
  fprintf(ref_file, " }\n");						\
  DUMP4GCC_FP16(MSG,T,W,N,FMT);

/* Half-precision counterpart of DUMP4GCC_FP: the type name written to
   gcc_tests_file is always "hfloat" and every element is printed,
   prefixed with "0x", as the integer member of the union (its bit
   pattern).  The last element is printed after the loop, where
   i == N-1, without a trailing separator.  MSG is unused.  */
#define DUMP4GCC_FP16(MSG,T,W,N,FMT)					\
  {									\
    union fp16_operand {						\
      uint##W##_t i;							\
      float##W##_t f;							\
    } tmp;								\
    fprintf(gcc_tests_file, "VECT_VAR_DECL(expected,%s,%d,%d) [] = { ", \
	    "hfloat", W, N);						\
    for(i=0; i<(N-1) ; i++)						\
      {									\
	tmp.f = VECT_VAR(result, T, W, N)[i];				\
	fprintf(gcc_tests_file, "0x%" FMT ", ", tmp.i);			\
      }									\
    tmp.f = VECT_VAR(result, T, W, N)[i];				\
    fprintf(gcc_tests_file, "0x%" FMT, tmp.i);				\
    fprintf(gcc_tests_file, " };\n");					\
  }
#endif
183 
/* Fill patterns for "clean" buffers, one per element width.  Each is
   the byte 0x33 repeated, which is what a W-bit element holds after
   CLEAN has run.  */
#define CLEAN_PATTERN_8  0x33
#define CLEAN_PATTERN_16 0x3333
#define CLEAN_PATTERN_32 0x33333333
#define CLEAN_PATTERN_64 0x3333333333333333

/* Fill the whole array VAR_<T><W>x<N> with the byte CLEAN_PATTERN_8.
   sizeof is applied to the variable itself, so it must be a real array
   (not a pointer).  The expansion already ends with a semicolon.  */
#define CLEAN(VAR,T,W,N)						\
  memset(VECT_VAR(VAR, T, W, N),					\
	 CLEAN_PATTERN_8,						\
	 sizeof(VECT_VAR(VAR, T, W, N)));
193 
/* Store vector VAR_<T1><W>x<N> into a local array with
   vst1<Q>_<T2><W> and print one line per lane on stdout.
   The comparison against CLEAN_PATTERN_<W> is commented out, so as
   written the "unintialized" message is printed for every lane,
   whatever its value.
   NOTE(review): presumably the comparison was disabled for debugging -
   confirm before re-enabling it.
   NOTE(review): the element is printed with %#x, which does not look
   right for 64-bit or floating-point elements - verify against the
   callers.  */
#define CHECK_INIT(VAR,Q,T1,T2,W,N)					\
  {									\
    ARRAY(check_result, T1, W, N);					\
    int i;								\
									\
    vst1##Q##_##T2##W(VECT_VAR(check_result, T1, W, N),			\
		      VECT_VAR(VAR, T1, W, N));				\
    for(i=0; i<N ; i++)							\
      {									\
	/*if (VECT_VAR(check_result, T1, W, N)[i] == CLEAN_PATTERN_##W)*/ { \
	  fprintf(stdout, "%s:%d: %s[%d] unintialized! %#x\n",		\
		  __FUNCTION__,	__LINE__,				\
		  STR(VECT_VAR(VAR, T1, W, N)), i,			\
		  VECT_VAR(check_result, T1, W, N)[i]);			\
	}								\
      }									\
  }
211 
212 /* Generic declarations: */
213 extern FILE* log_file;
214 extern FILE* ref_file;
215 extern FILE* gcc_tests_file;
216 
217 /* Input buffers, one of each size */
218 extern ARRAY(buffer, int, 8, 8);
219 extern ARRAY(buffer, int, 16, 4);
220 extern ARRAY(buffer, int, 32, 2);
221 extern ARRAY(buffer, int, 64, 1);
222 extern ARRAY(buffer, uint, 8, 8);
223 extern ARRAY(buffer, uint, 16, 4);
224 extern ARRAY(buffer, uint, 32, 2);
225 extern ARRAY(buffer, uint, 64, 1);
226 extern ARRAY(buffer, poly, 8, 8);
227 extern ARRAY(buffer, poly, 16, 4);
228 extern ARRAY(buffer, float, 32, 2);
229 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
230 extern ARRAY(buffer, float, 16, 4);
231 #endif
232 extern ARRAY(buffer, int, 8, 16);
233 extern ARRAY(buffer, int, 16, 8);
234 extern ARRAY(buffer, int, 32, 4);
235 extern ARRAY(buffer, int, 64, 2);
236 extern ARRAY(buffer, uint, 8, 16);
237 extern ARRAY(buffer, uint, 16, 8);
238 extern ARRAY(buffer, uint, 32, 4);
239 extern ARRAY(buffer, uint, 64, 2);
240 extern ARRAY(buffer, poly, 8, 16);
241 extern ARRAY(buffer, poly, 16, 8);
242 extern ARRAY(buffer, float, 32, 4);
243 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
244 extern ARRAY(buffer, float, 16, 8);
245 #endif
246 
247 /* The tests for vld1_dup and vdup expect at least 4 entries in the
248    input buffer, so force 1- and 2-elements initializers to have 4
249    entries.  */
250 extern ARRAY(buffer_dup, int, 8, 8);
251 extern ARRAY(buffer_dup, int, 16, 4);
252 extern ARRAY4(buffer_dup, int, 32, 2);
253 extern ARRAY4(buffer_dup, int, 64, 1);
254 extern ARRAY(buffer_dup, uint, 8, 8);
255 extern ARRAY(buffer_dup, uint, 16, 4);
256 extern ARRAY4(buffer_dup, uint, 32, 2);
257 extern ARRAY4(buffer_dup, uint, 64, 1);
258 extern ARRAY(buffer_dup, poly, 8, 8);
259 extern ARRAY(buffer_dup, poly, 16, 4);
260 extern ARRAY4(buffer_dup, float, 32, 2);
261 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
262 extern ARRAY4(buffer_dup, float, 16, 4);
263 #endif
264 extern ARRAY(buffer_dup, int, 8, 16);
265 extern ARRAY(buffer_dup, int, 16, 8);
266 extern ARRAY(buffer_dup, int, 32, 4);
267 extern ARRAY4(buffer_dup, int, 64, 2);
268 extern ARRAY(buffer_dup, uint, 8, 16);
269 extern ARRAY(buffer_dup, uint, 16, 8);
270 extern ARRAY(buffer_dup, uint, 32, 4);
271 extern ARRAY4(buffer_dup, uint, 64, 2);
272 extern ARRAY(buffer_dup, poly, 8, 16);
273 extern ARRAY(buffer_dup, poly, 16, 8);
274 extern ARRAY(buffer_dup, float, 32, 4);
275 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
276 extern ARRAY(buffer_dup, float, 16, 8);
277 #endif
278 
279 /* Input buffers for vld2, one of each size */
280 extern VECT_ARRAY(buffer_vld2, int, 8, 8, 2);
281 extern VECT_ARRAY(buffer_vld2, int, 16, 4, 2);
282 extern VECT_ARRAY(buffer_vld2, int, 32, 2, 2);
283 extern VECT_ARRAY(buffer_vld2, int, 64, 1, 2);
284 extern VECT_ARRAY(buffer_vld2, uint, 8, 8, 2);
285 extern VECT_ARRAY(buffer_vld2, uint, 16, 4, 2);
286 extern VECT_ARRAY(buffer_vld2, uint, 32, 2, 2);
287 extern VECT_ARRAY(buffer_vld2, uint, 64, 1, 2);
288 extern VECT_ARRAY(buffer_vld2, poly, 8, 8, 2);
289 extern VECT_ARRAY(buffer_vld2, poly, 16, 4, 2);
290 extern VECT_ARRAY(buffer_vld2, float, 32, 2, 2);
291 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
292 extern VECT_ARRAY(buffer_vld2, float, 16, 4, 2);
293 #endif
294 extern VECT_ARRAY(buffer_vld2, int, 8, 16, 2);
295 extern VECT_ARRAY(buffer_vld2, int, 16, 8, 2);
296 extern VECT_ARRAY(buffer_vld2, int, 32, 4, 2);
297 extern VECT_ARRAY(buffer_vld2, int, 64, 2, 2);
298 extern VECT_ARRAY(buffer_vld2, uint, 8, 16, 2);
299 extern VECT_ARRAY(buffer_vld2, uint, 16, 8, 2);
300 extern VECT_ARRAY(buffer_vld2, uint, 32, 4, 2);
301 extern VECT_ARRAY(buffer_vld2, uint, 64, 2, 2);
302 extern VECT_ARRAY(buffer_vld2, poly, 8, 16, 2);
303 extern VECT_ARRAY(buffer_vld2, poly, 16, 8, 2);
304 extern VECT_ARRAY(buffer_vld2, float, 32, 4, 2);
305 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
306 extern VECT_ARRAY(buffer_vld2, float, 16, 8, 2);
307 #endif
308 
309 /* Input buffers for vld3, one of each size */
310 extern VECT_ARRAY(buffer_vld3, int, 8, 8, 3);
311 extern VECT_ARRAY(buffer_vld3, int, 16, 4, 3);
312 extern VECT_ARRAY(buffer_vld3, int, 32, 2, 3);
313 extern VECT_ARRAY(buffer_vld3, int, 64, 1, 3);
314 extern VECT_ARRAY(buffer_vld3, uint, 8, 8, 3);
315 extern VECT_ARRAY(buffer_vld3, uint, 16, 4, 3);
316 extern VECT_ARRAY(buffer_vld3, uint, 32, 2, 3);
317 extern VECT_ARRAY(buffer_vld3, uint, 64, 1, 3);
318 extern VECT_ARRAY(buffer_vld3, poly, 8, 8, 3);
319 extern VECT_ARRAY(buffer_vld3, poly, 16, 4, 3);
320 extern VECT_ARRAY(buffer_vld3, float, 32, 2, 3);
321 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
322 extern VECT_ARRAY(buffer_vld3, float, 16, 4, 3);
323 #endif
324 extern VECT_ARRAY(buffer_vld3, int, 8, 16, 3);
325 extern VECT_ARRAY(buffer_vld3, int, 16, 8, 3);
326 extern VECT_ARRAY(buffer_vld3, int, 32, 4, 3);
327 extern VECT_ARRAY(buffer_vld3, int, 64, 2, 3);
328 extern VECT_ARRAY(buffer_vld3, uint, 8, 16, 3);
329 extern VECT_ARRAY(buffer_vld3, uint, 16, 8, 3);
330 extern VECT_ARRAY(buffer_vld3, uint, 32, 4, 3);
331 extern VECT_ARRAY(buffer_vld3, uint, 64, 2, 3);
332 extern VECT_ARRAY(buffer_vld3, poly, 8, 16, 3);
333 extern VECT_ARRAY(buffer_vld3, poly, 16, 8, 3);
334 extern VECT_ARRAY(buffer_vld3, float, 32, 4, 3);
335 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
336 extern VECT_ARRAY(buffer_vld3, float, 16, 8, 3);
337 #endif
338 
339 /* Input buffers for vld4, one of each size */
340 extern VECT_ARRAY(buffer_vld4, int, 8, 8, 4);
341 extern VECT_ARRAY(buffer_vld4, int, 16, 4, 4);
342 extern VECT_ARRAY(buffer_vld4, int, 32, 2, 4);
343 extern VECT_ARRAY(buffer_vld4, int, 64, 1, 4);
344 extern VECT_ARRAY(buffer_vld4, uint, 8, 8, 4);
345 extern VECT_ARRAY(buffer_vld4, uint, 16, 4, 4);
346 extern VECT_ARRAY(buffer_vld4, uint, 32, 2, 4);
347 extern VECT_ARRAY(buffer_vld4, uint, 64, 1, 4);
348 extern VECT_ARRAY(buffer_vld4, poly, 8, 8, 4);
349 extern VECT_ARRAY(buffer_vld4, poly, 16, 4, 4);
350 extern VECT_ARRAY(buffer_vld4, float, 32, 2, 4);
351 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
352 extern VECT_ARRAY(buffer_vld4, float, 16, 4, 4);
353 #endif
354 extern VECT_ARRAY(buffer_vld4, int, 8, 16, 4);
355 extern VECT_ARRAY(buffer_vld4, int, 16, 8, 4);
356 extern VECT_ARRAY(buffer_vld4, int, 32, 4, 4);
357 extern VECT_ARRAY(buffer_vld4, int, 64, 2, 4);
358 extern VECT_ARRAY(buffer_vld4, uint, 8, 16, 4);
359 extern VECT_ARRAY(buffer_vld4, uint, 16, 8, 4);
360 extern VECT_ARRAY(buffer_vld4, uint, 32, 4, 4);
361 extern VECT_ARRAY(buffer_vld4, uint, 64, 2, 4);
362 extern VECT_ARRAY(buffer_vld4, poly, 8, 16, 4);
363 extern VECT_ARRAY(buffer_vld4, poly, 16, 8, 4);
364 extern VECT_ARRAY(buffer_vld4, float, 32, 4, 4);
365 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
366 extern VECT_ARRAY(buffer_vld4, float, 16, 8, 4);
367 #endif
368 
369 /* Input buffers for vld2_lane */
370 extern VECT_VAR_DECL(buffer_vld2_lane, int, 8, 2)[2];
371 extern VECT_VAR_DECL(buffer_vld2_lane, int, 16, 2)[2];
372 extern VECT_VAR_DECL(buffer_vld2_lane, int, 32, 2)[2];
373 extern VECT_VAR_DECL(buffer_vld2_lane, int, 64, 2)[2];
374 extern VECT_VAR_DECL(buffer_vld2_lane, uint, 8, 2)[2];
375 extern VECT_VAR_DECL(buffer_vld2_lane, uint, 16, 2)[2];
376 extern VECT_VAR_DECL(buffer_vld2_lane, uint, 32, 2)[2];
377 extern VECT_VAR_DECL(buffer_vld2_lane, uint, 64, 2)[2];
378 extern VECT_VAR_DECL(buffer_vld2_lane, poly, 8, 2)[2];
379 extern VECT_VAR_DECL(buffer_vld2_lane, poly, 16, 2)[2];
380 extern VECT_VAR_DECL(buffer_vld2_lane, float, 32, 2)[2];
381 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
382 extern VECT_VAR_DECL(buffer_vld2_lane, float, 16, 2)[2];
383 #endif
384 
385 /* Input buffers for vld3_lane */
386 extern VECT_VAR_DECL(buffer_vld3_lane, int, 8, 3)[3];
387 extern VECT_VAR_DECL(buffer_vld3_lane, int, 16, 3)[3];
388 extern VECT_VAR_DECL(buffer_vld3_lane, int, 32, 3)[3];
389 extern VECT_VAR_DECL(buffer_vld3_lane, int, 64, 3)[3];
390 extern VECT_VAR_DECL(buffer_vld3_lane, uint, 8, 3)[3];
391 extern VECT_VAR_DECL(buffer_vld3_lane, uint, 16, 3)[3];
392 extern VECT_VAR_DECL(buffer_vld3_lane, uint, 32, 3)[3];
393 extern VECT_VAR_DECL(buffer_vld3_lane, uint, 64, 3)[3];
394 extern VECT_VAR_DECL(buffer_vld3_lane, poly, 8, 3)[3];
395 extern VECT_VAR_DECL(buffer_vld3_lane, poly, 16, 3)[3];
396 extern VECT_VAR_DECL(buffer_vld3_lane, float, 32, 3)[3];
397 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
398 extern VECT_VAR_DECL(buffer_vld3_lane, float, 16, 3)[3];
399 #endif
400 
401 /* Input buffers for vld4_lane */
402 extern VECT_VAR_DECL(buffer_vld4_lane, int, 8, 4)[4];
403 extern VECT_VAR_DECL(buffer_vld4_lane, int, 16, 4)[4];
404 extern VECT_VAR_DECL(buffer_vld4_lane, int, 32, 4)[4];
405 extern VECT_VAR_DECL(buffer_vld4_lane, int, 64, 4)[4];
406 extern VECT_VAR_DECL(buffer_vld4_lane, uint, 8, 4)[4];
407 extern VECT_VAR_DECL(buffer_vld4_lane, uint, 16, 4)[4];
408 extern VECT_VAR_DECL(buffer_vld4_lane, uint, 32, 4)[4];
409 extern VECT_VAR_DECL(buffer_vld4_lane, uint, 64, 4)[4];
410 extern VECT_VAR_DECL(buffer_vld4_lane, poly, 8, 4)[4];
411 extern VECT_VAR_DECL(buffer_vld4_lane, poly, 16, 4)[4];
412 extern VECT_VAR_DECL(buffer_vld4_lane, float, 32, 4)[4];
413 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
414 extern VECT_VAR_DECL(buffer_vld4_lane, float, 16, 4)[4];
415 #endif
416 
/* Output buffers, one of each size.
   These are `static' definitions in a header, so every translation
   unit that includes it gets its own private set.  They are the
   buffers read by the DUMP* macros and filled by clean_results().
   The float16 buffers only exist when the FP16 support test below
   holds.  */
/* 64-bit vectors.  */
static ARRAY(result, int, 8, 8);
static ARRAY(result, int, 16, 4);
static ARRAY(result, int, 32, 2);
static ARRAY(result, int, 64, 1);
static ARRAY(result, uint, 8, 8);
static ARRAY(result, uint, 16, 4);
static ARRAY(result, uint, 32, 2);
static ARRAY(result, uint, 64, 1);
static ARRAY(result, poly, 8, 8);
static ARRAY(result, poly, 16, 4);
static ARRAY(result, float, 32, 2);
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
static ARRAY(result, float, 16, 4);
#endif
/* 128-bit vectors.  */
static ARRAY(result, int, 8, 16);
static ARRAY(result, int, 16, 8);
static ARRAY(result, int, 32, 4);
static ARRAY(result, int, 64, 2);
static ARRAY(result, uint, 8, 16);
static ARRAY(result, uint, 16, 8);
static ARRAY(result, uint, 32, 4);
static ARRAY(result, uint, 64, 2);
static ARRAY(result, poly, 8, 16);
static ARRAY(result, poly, 16, 8);
static ARRAY(result, float, 32, 4);
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
static ARRAY(result, float, 16, 8);
#endif
446 
447 /* Dump results (generic function) */
dump_results(char * test_name)448 static void dump_results (char *test_name)
449 {
450   int i;
451 
452   fprintf(ref_file, "\n%s output:\n", test_name);
453   fprintf(gcc_tests_file, "\n%s output:\n", test_name);
454 
455   DUMP(test_name, int, 8, 8, PRId8);
456   DUMP(test_name, int, 16, 4, PRId16);
457   DUMP(test_name, int, 32, 2, PRId32);
458   DUMP(test_name, int, 64, 1, PRId64);
459   DUMP(test_name, uint, 8, 8, PRIu8);
460   DUMP(test_name, uint, 16, 4, PRIu16);
461   DUMP(test_name, uint, 32, 2, PRIu32);
462   DUMP(test_name, uint, 64, 1, PRIu64);
463   DUMP_POLY(test_name, poly, 8, 8, PRIu8);
464   DUMP_POLY(test_name, poly, 16, 4, PRIu16);
465   DUMP_FP(test_name, float, 32, 2, PRIx32);
466 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
467   DUMP_FP16(test_name, float, 16, 4, PRIu16);
468 #endif
469 
470   DUMP(test_name, int, 8, 16, PRId8);
471   DUMP(test_name, int, 16, 8, PRId16);
472   DUMP(test_name, int, 32, 4, PRId32);
473   DUMP(test_name, int, 64, 2, PRId64);
474   DUMP(test_name, uint, 8, 16, PRIu8);
475   DUMP(test_name, uint, 16, 8, PRIu16);
476   DUMP(test_name, uint, 32, 4, PRIu32);
477   DUMP(test_name, uint, 64, 2, PRIu64);
478   DUMP_POLY(test_name, poly, 8, 16, PRIu8);
479   DUMP_POLY(test_name, poly, 16, 8, PRIu16);
480   DUMP_FP(test_name, float, 32, 4, PRIx32);
481 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
482   DUMP_FP16(test_name, float, 16, 8, PRIu16);
483 #endif
484 }
485 
/* Dump results in hex (generic function).

   Writes a "<test_name><comment> output:" header to both ref_file and
   gcc_tests_file, then dumps every result buffer using hexadecimal
   conversions.  `comment' is appended verbatim right after the test
   name in the header; the individual result lines only carry
   test_name.  The order of the DUMP* calls determines both the order
   of the output lines and the result_idx value printed on each.  */
static void dump_results_hex2 (const char *test_name, const char* comment)
{
  int i; /* Loop index used by the DUMP* macros.  */

  fprintf(ref_file, "\n%s%s output:\n", test_name, comment);
  fprintf(gcc_tests_file, "\n%s%s output:\n", test_name, comment);

  /* 64-bit vectors.  */
  DUMP(test_name, int, 8, 8, PRIx8);
  DUMP(test_name, int, 16, 4, PRIx16);
  DUMP(test_name, int, 32, 2, PRIx32);
  DUMP(test_name, int, 64, 1, PRIx64);
  DUMP(test_name, uint, 8, 8, PRIx8);
  DUMP(test_name, uint, 16, 4, PRIx16);
  DUMP(test_name, uint, 32, 2, PRIx32);
  DUMP(test_name, uint, 64, 1, PRIx64);
  DUMP_POLY(test_name, poly, 8, 8, PRIx8);
  DUMP_POLY(test_name, poly, 16, 4, PRIx16);
  DUMP_FP(test_name, float, 32, 2, PRIx32);
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
  DUMP_FP16(test_name, float, 16, 4, PRIx16);
#endif

  /* 128-bit vectors.  */
  DUMP(test_name, int, 8, 16, PRIx8);
  DUMP(test_name, int, 16, 8, PRIx16);
  DUMP(test_name, int, 32, 4, PRIx32);
  DUMP(test_name, int, 64, 2, PRIx64);
  DUMP(test_name, uint, 8, 16, PRIx8);
  DUMP(test_name, uint, 16, 8, PRIx16);
  DUMP(test_name, uint, 32, 4, PRIx32);
  DUMP(test_name, uint, 64, 2, PRIx64);
  DUMP_POLY(test_name, poly, 8, 16, PRIx8);
  DUMP_POLY(test_name, poly, 16, 8, PRIx16);
  DUMP_FP(test_name, float, 32, 4, PRIx32);
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
  DUMP_FP16(test_name, float, 16, 8, PRIx16);
#endif
}
524 
/* Hexadecimal dump of all result buffers with no extra comment in the
   header line: forwards to dump_results_hex2 with an empty comment.  */
static void dump_results_hex (const char *test_name)
{
  static const char no_comment[] = "";

  dump_results_hex2(test_name, no_comment);
}
529 
530 #ifndef STM_ARM_NEON_MODELS
531 
/* This hack copes with compilers/libcs which may not provide
   endian.h, and with cross-compilers such as LLVM which would
   include the host's endian.h.  */
535 #ifndef __arm__
536 #include <endian.h>
537 #define THIS_ENDIAN __BYTE_ORDER
538 #else /* __arm__ */
539 #ifdef __ARMEL__
540 #define THIS_ENDIAN __LITTLE_ENDIAN
541 #else /* __ARMEL__ */
542 #define THIS_ENDIAN __BIG_ENDIAN
543 #endif
544 #endif /* __arm__ */
545 
/* View of the floating-point status word, either as a whole (`word')
   or as bit-fields (`b'): 27 unnamed-purpose bits, the cumulative
   saturation flag QC, and the V, C, Z, N flags.  The two variants
   below list the same fields in opposite order depending on
   THIS_ENDIAN.
   NOTE(review): this presumably relies on the compiler allocating
   bit-fields from the least significant bit on little-endian targets
   and from the most significant bit on big-endian ones - confirm
   against the target ABI.  */
#if THIS_ENDIAN == __LITTLE_ENDIAN

typedef union {
  struct {
    int _xxx:27;
    unsigned int QC:1;
    int V:1;
    int C:1;
    int Z:1;
    int N:1;
  } b;
  unsigned int word;
} _ARM_FPSCR;

#else /* __BIG_ENDIAN */

typedef union {
  struct {
    int N:1;
    int Z:1;
    int C:1;
    int V:1;
    unsigned int QC:1;
    int _dnm:27;
  } b;
  unsigned int word;
} _ARM_FPSCR;

#endif /* __BIG_ENDIAN */
575 
576 #ifdef __ARMCC_VERSION
577 register _ARM_FPSCR _afpscr_for_qc __asm("fpscr");
578 # define Neon_Cumulative_Sat _afpscr_for_qc.b.QC
579 # define Set_Neon_Cumulative_Sat(x, depend)  {Neon_Cumulative_Sat = (x);}
580 #else
581 /* GCC/ARM does not know this register */
582 # define Neon_Cumulative_Sat  __read_neon_cumulative_sat()
583 /* We need a fake dependency to ensure correct ordering of asm
584    statements to preset the QC flag value, and Neon operators writing
585    to QC. */
586 #define Set_Neon_Cumulative_Sat(x, depend)	\
587   __set_neon_cumulative_sat((x), (depend))
588 
589 # if defined(__aarch64__)
__read_neon_cumulative_sat(void)590 static volatile int __read_neon_cumulative_sat (void) {
591     _ARM_FPSCR _afpscr_for_qc;
592     asm volatile ("mrs %0,fpsr" : "=r" (_afpscr_for_qc));
593     return _afpscr_for_qc.b.QC;
594 }
595 
596 #define __set_neon_cumulative_sat(x, depend) {				\
597     _ARM_FPSCR _afpscr_for_qc;						\
598     asm volatile ("mrs %0,fpsr" : "=r" (_afpscr_for_qc));		\
599     _afpscr_for_qc.b.QC = x;						\
600     asm volatile ("msr fpsr,%1" : "=X" (depend) : "r" (_afpscr_for_qc)); \
601   }
602 
603 # else
__read_neon_cumulative_sat(void)604 static volatile int __read_neon_cumulative_sat (void) {
605     _ARM_FPSCR _afpscr_for_qc;
606     asm volatile ("vmrs %0,fpscr" : "=r" (_afpscr_for_qc));
607     return _afpscr_for_qc.b.QC;
608 }
609 
610 #define __set_neon_cumulative_sat(x, depend) {				\
611     _ARM_FPSCR _afpscr_for_qc;						\
612     asm volatile ("vmrs %0,fpscr" : "=r" (_afpscr_for_qc));		\
613     _afpscr_for_qc.b.QC = x;						\
614     asm volatile ("vmsr fpscr,%1" : "=X" (depend) : "r" (_afpscr_for_qc)); \
615   }
616 
617 # endif
618 #endif
619 
620 #endif /* STM_ARM_NEON_MODELS */
621 
dump_neon_cumulative_sat(const char * msg,const char * name,const char * t1,int w,int n)622 static void dump_neon_cumulative_sat(const char* msg, const char *name,
623 				     const char* t1, int w, int n)
624 {
625   fprintf(ref_file, "%s:%d:%s Neon cumulative saturation %d\n", msg, result_idx++,
626 	  name, Neon_Cumulative_Sat);
627   fprintf(gcc_tests_file,
628 	  "int VECT_VAR(expected_cumulative_sat,%s,%d,%d) = %d;\n",
629 	  t1, w, n, Neon_Cumulative_Sat);
630 }
631 
632 /* Clean output buffers before execution */
clean_results(void)633 static void clean_results (void)
634 {
635   result_idx = 0;
636   CLEAN(result, int, 8, 8);
637   CLEAN(result, int, 16, 4);
638   CLEAN(result, int, 32, 2);
639   CLEAN(result, int, 64, 1);
640   CLEAN(result, uint, 8, 8);
641   CLEAN(result, uint, 16, 4);
642   CLEAN(result, uint, 32, 2);
643   CLEAN(result, uint, 64, 1);
644   CLEAN(result, poly, 8, 8);
645   CLEAN(result, poly, 16, 4);
646   CLEAN(result, float, 32, 2);
647 
648   CLEAN(result, int, 8, 16);
649   CLEAN(result, int, 16, 8);
650   CLEAN(result, int, 32, 4);
651   CLEAN(result, int, 64, 2);
652   CLEAN(result, uint, 8, 16);
653   CLEAN(result, uint, 16, 8);
654   CLEAN(result, uint, 32, 4);
655   CLEAN(result, uint, 64, 2);
656   CLEAN(result, poly, 8, 16);
657   CLEAN(result, poly, 16, 8);
658   CLEAN(result, float, 32, 4);
659 }
660 
661 
/* Helpers to declare variables of various types  */
/* DECL_VARIABLE(v, int, 8, 8) expands to `volatile int8x8_t v_int8x8'.
   None of the macros below ends with a semicolon: the caller supplies
   the final one.  */
#define DECL_VARIABLE(VAR, T1, W, N)		\
  volatile VECT_TYPE(T1, W, N) VECT_VAR(VAR, T1, W, N)

/* The four signed integer 64-bit vector variables.  */
#define DECL_VARIABLE_64BITS_SIGNED_VARIANTS(VAR)	\
  DECL_VARIABLE(VAR, int, 8, 8);			\
  DECL_VARIABLE(VAR, int, 16, 4);			\
  DECL_VARIABLE(VAR, int, 32, 2);			\
  DECL_VARIABLE(VAR, int, 64, 1)

/* The four unsigned integer 64-bit vector variables.  */
#define DECL_VARIABLE_64BITS_UNSIGNED_VARIANTS(VAR)	\
  DECL_VARIABLE(VAR, uint, 8, 8);			\
  DECL_VARIABLE(VAR, uint, 16, 4);			\
  DECL_VARIABLE(VAR, uint, 32, 2);			\
  DECL_VARIABLE(VAR, uint, 64, 1)

/* The four signed integer 128-bit vector variables.  */
#define DECL_VARIABLE_128BITS_SIGNED_VARIANTS(VAR)	\
  DECL_VARIABLE(VAR, int, 8, 16);			\
  DECL_VARIABLE(VAR, int, 16, 8);			\
  DECL_VARIABLE(VAR, int, 32, 4);			\
  DECL_VARIABLE(VAR, int, 64, 2)

/* The four unsigned integer 128-bit vector variables.  */
#define DECL_VARIABLE_128BITS_UNSIGNED_VARIANTS(VAR)	\
  DECL_VARIABLE(VAR, uint, 8, 16);			\
  DECL_VARIABLE(VAR, uint, 16, 8);			\
  DECL_VARIABLE(VAR, uint, 32, 4);			\
  DECL_VARIABLE(VAR, uint, 64, 2)

/* All 64-bit variants: signed, unsigned, poly8/poly16 and float32
   (float16 is not included).  */
#define DECL_VARIABLE_64BITS_VARIANTS(VAR)	\
  DECL_VARIABLE_64BITS_SIGNED_VARIANTS(VAR);	\
  DECL_VARIABLE_64BITS_UNSIGNED_VARIANTS(VAR);	\
  DECL_VARIABLE(VAR, poly, 8, 8);		\
  DECL_VARIABLE(VAR, poly, 16, 4);		\
  DECL_VARIABLE(VAR, float, 32, 2)

/* All 128-bit variants: signed, unsigned, poly8/poly16 and float32
   (float16 is not included).  */
#define DECL_VARIABLE_128BITS_VARIANTS(VAR)	\
  DECL_VARIABLE_128BITS_SIGNED_VARIANTS(VAR);	\
  DECL_VARIABLE_128BITS_UNSIGNED_VARIANTS(VAR);	\
  DECL_VARIABLE(VAR, poly, 8, 16);		\
  DECL_VARIABLE(VAR, poly, 16, 8);		\
  DECL_VARIABLE(VAR, float, 32, 4)

/* Every 64-bit and 128-bit variant.  */
#define DECL_VARIABLE_ALL_VARIANTS(VAR)		\
  DECL_VARIABLE_64BITS_VARIANTS(VAR);		\
  DECL_VARIABLE_128BITS_VARIANTS(VAR)

/* Signed integer variants only, 64-bit then 128-bit.  */
#define DECL_VARIABLE_SIGNED_VARIANTS(VAR)	\
  DECL_VARIABLE_64BITS_SIGNED_VARIANTS(VAR);	\
  DECL_VARIABLE_128BITS_SIGNED_VARIANTS(VAR)

/* Unsigned integer variants only, 64-bit then 128-bit.  */
#define DECL_VARIABLE_UNSIGNED_VARIANTS(VAR)	\
  DECL_VARIABLE_64BITS_UNSIGNED_VARIANTS(VAR);	\
  DECL_VARIABLE_128BITS_UNSIGNED_VARIANTS(VAR)
715 
/* Helpers to initialize vectors */
/* In the macros below, Q is empty or `q' (it is pasted right after the
   intrinsic base name), T1 is the type name used in the variable name
   (int, uint, ...) and T2 the type suffix of the intrinsic (s, u,
   ...).  None of them ends with a semicolon.  */

/* Assign vdup<Q>_n_<T2><W>(V) to VAR_<T1><W>x<N>.  */
#define VDUP(VAR, Q, T1, T2, W, N, V)		\
  VECT_VAR(VAR, T1, W, N) = vdup##Q##_n_##T2##W(V)

/* Replace VAR_<T1><W>x<N> by the result of
   vset<Q>_lane_<T2><W>(V, VAR_<T1><W>x<N>, L).  */
#define TEST_VSET_LANE(VAR, Q, T1, T2, W, N, L, V)			\
  VECT_VAR(VAR, T1, W, N) = vset##Q##_lane_##T2##W(V,			\
						   VECT_VAR(VAR, T1, W, N), \
						   L)

/* We need to load initial values first, so rely on VLD1 */
/* Assigns vld1<Q>_<T2><W>(BUF_<T1><W>x<N>) to VAR_<T1><W>x<N>.  */
#define VLOAD(VAR, BUF, Q, T1, T2, W, N)				\
  VECT_VAR(VAR, T1, W, N) = vld1##Q##_##T2##W(VECT_VAR(BUF, T1, W, N))
728 
/* Helpers for macros with 1 constant and 5 variable arguments */
/* Each helper invokes MACRO(VAR, Q, T1, T2, W, N) once per listed
   variant: VAR is passed through unchanged, Q is empty for 64-bit
   vectors and `q' for 128-bit ones.  Only signed and unsigned integer
   variants are covered here (no poly, no float).  The expansions do
   not end with a semicolon.  */
#define TEST_MACRO_64BITS_SIGNED_VARIANTS_1_5(MACRO, VAR)	\
  MACRO(VAR, , int, s, 8, 8);					\
  MACRO(VAR, , int, s, 16, 4);					\
  MACRO(VAR, , int, s, 32, 2);					\
  MACRO(VAR, , int, s, 64, 1)

#define TEST_MACRO_64BITS_UNSIGNED_VARIANTS_1_5(MACRO, VAR)	\
  MACRO(VAR, , uint, u, 8, 8);					\
  MACRO(VAR, , uint, u, 16, 4);					\
  MACRO(VAR, , uint, u, 32, 2);					\
  MACRO(VAR, , uint, u, 64, 1)

#define TEST_MACRO_128BITS_SIGNED_VARIANTS_1_5(MACRO, VAR)	\
  MACRO(VAR, q, int, s, 8, 16);					\
  MACRO(VAR, q, int, s, 16, 8);					\
  MACRO(VAR, q, int, s, 32, 4);					\
  MACRO(VAR, q, int, s, 64, 2)

#define TEST_MACRO_128BITS_UNSIGNED_VARIANTS_1_5(MACRO,VAR)	\
  MACRO(VAR, q, uint, u, 8, 16);				\
  MACRO(VAR, q, uint, u, 16, 8);				\
  MACRO(VAR, q, uint, u, 32, 4);				\
  MACRO(VAR, q, uint, u, 64, 2)

/* Signed then unsigned 64-bit variants.  */
#define TEST_MACRO_64BITS_VARIANTS_1_5(MACRO, VAR)	\
  TEST_MACRO_64BITS_SIGNED_VARIANTS_1_5(MACRO, VAR);	\
  TEST_MACRO_64BITS_UNSIGNED_VARIANTS_1_5(MACRO, VAR)

/* Signed then unsigned 128-bit variants.  */
#define TEST_MACRO_128BITS_VARIANTS_1_5(MACRO, VAR)	\
  TEST_MACRO_128BITS_SIGNED_VARIANTS_1_5(MACRO, VAR);	\
  TEST_MACRO_128BITS_UNSIGNED_VARIANTS_1_5(MACRO, VAR)

/* All 64-bit variants followed by all 128-bit variants.  */
#define TEST_MACRO_ALL_VARIANTS_1_5(MACRO, VAR)	\
  TEST_MACRO_64BITS_VARIANTS_1_5(MACRO, VAR);	\
  TEST_MACRO_128BITS_VARIANTS_1_5(MACRO, VAR)

/* Signed variants only, 64-bit then 128-bit.  */
#define TEST_MACRO_SIGNED_VARIANTS_1_5(MACRO, VAR)	\
  TEST_MACRO_64BITS_SIGNED_VARIANTS_1_5(MACRO, VAR);	\
  TEST_MACRO_128BITS_SIGNED_VARIANTS_1_5(MACRO, VAR)
769 
/* Helpers for macros with 2 constant and 5 variable arguments */
/* Each helper invokes MACRO(VAR1, VAR2, Q, T1, T2, W, N) once per
   listed variant: VAR1 and VAR2 are passed through unchanged, Q is
   empty for 64-bit vectors and `q' for 128-bit ones.  Unlike the _1_5
   family, the 64BITS/128BITS/ALL helpers here also cover the poly8 and
   poly16 variants.  The expansions do not end with a semicolon.  */
#define TEST_MACRO_64BITS_SIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2)	\
  MACRO(VAR1, VAR2, , int, s, 8, 8);					\
  MACRO(VAR1, VAR2, , int, s, 16, 4);					\
  MACRO(VAR1, VAR2, , int, s, 32, 2);					\
  MACRO(VAR1, VAR2 , , int, s, 64, 1)

#define TEST_MACRO_64BITS_UNSIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2)	\
  MACRO(VAR1, VAR2, , uint, u, 8, 8);					\
  MACRO(VAR1, VAR2, , uint, u, 16, 4);					\
  MACRO(VAR1, VAR2, , uint, u, 32, 2);					\
  MACRO(VAR1, VAR2, , uint, u, 64, 1)

#define TEST_MACRO_128BITS_SIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2)	\
  MACRO(VAR1, VAR2, q, int, s, 8, 16);					\
  MACRO(VAR1, VAR2, q, int, s, 16, 8);					\
  MACRO(VAR1, VAR2, q, int, s, 32, 4);					\
  MACRO(VAR1, VAR2, q, int, s, 64, 2)

#define TEST_MACRO_128BITS_UNSIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2)	\
  MACRO(VAR1, VAR2, q, uint, u, 8, 16);					\
  MACRO(VAR1, VAR2, q, uint, u, 16, 8);					\
  MACRO(VAR1, VAR2, q, uint, u, 32, 4);					\
  MACRO(VAR1, VAR2, q, uint, u, 64, 2)

/* Signed, unsigned and poly 64-bit variants.  */
#define TEST_MACRO_64BITS_VARIANTS_2_5(MACRO, VAR1, VAR2)	\
  TEST_MACRO_64BITS_SIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2);	\
  TEST_MACRO_64BITS_UNSIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2);	\
  MACRO(VAR1, VAR2, , poly, p, 8, 8);				\
  MACRO(VAR1, VAR2, , poly, p, 16, 4)

/* Signed, unsigned and poly 128-bit variants.  */
#define TEST_MACRO_128BITS_VARIANTS_2_5(MACRO, VAR1, VAR2)	\
  TEST_MACRO_128BITS_SIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2);	\
  TEST_MACRO_128BITS_UNSIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2);	\
  MACRO(VAR1, VAR2, q, poly, p, 8, 16);				\
  MACRO(VAR1, VAR2, q, poly, p, 16, 8)

/* All 64-bit variants followed by all 128-bit variants.  */
#define TEST_MACRO_ALL_VARIANTS_2_5(MACRO, VAR1, VAR2)	\
  TEST_MACRO_64BITS_VARIANTS_2_5(MACRO, VAR1, VAR2);	\
  TEST_MACRO_128BITS_VARIANTS_2_5(MACRO, VAR1, VAR2)

/* Signed variants only, 64-bit then 128-bit.  */
#define TEST_MACRO_SIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2)	\
  TEST_MACRO_64BITS_SIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2);	\
  TEST_MACRO_128BITS_SIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2)
814 
815 #endif /* _STM_ARM_NEON_REF_H_ */
816