/*

Copyright (c) 2009, 2010, 2011, 2012, 2013 STMicroelectronics
Written by Christophe Lyon

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

*/

#ifndef _STM_ARM_NEON_REF_H_
#define _STM_ARM_NEON_REF_H_

#if defined(__cplusplus)
#include <cstdio>
#include <cinttypes>
#include <cstring>
#else
#include <stdio.h>
#if defined(_MSC_VER)
#include "msinttypes.h"
#include <float.h> /* for isnan() ... */
static int32_t _ptrNan[]={0x7fc00000L};
#define NAN (*(float*)_ptrNan)
static int32_t _ptrInf[]={0x7f800000L};
#define INFINITY (*(float*)_ptrInf)
#define HUGE_VALF INFINITY
#else
#include <inttypes.h>
#endif
#include <string.h>
#endif

#define xSTR(X) #X
#define STR(X) xSTR(X)

#define xNAME1(V,T) V ## _ ## T
#define xNAME(V,T) xNAME1(V,T)

#define VAR(V,T,W) xNAME(V,T##W)
#define VAR_DECL(V, T, W) T##W##_t VAR(V,T,W)

#define VECT_NAME(T, W, N) T##W##x##N
#define VECT_ARRAY_NAME(T, W, N, L) T##W##x##N##x##L
#define VECT_TYPE(T, W, N) xNAME(VECT_NAME(T,W,N),t)
#define VECT_ARRAY_TYPE(T, W, N, L) xNAME(VECT_ARRAY_NAME(T,W,N,L),t)

#define VECT_VAR(V,T,W,N) xNAME(V,VECT_NAME(T,W,N))
#define VECT_VAR_DECL(V, T, W, N) T##W##_t VECT_VAR(V,T,W,N)

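/* For illustration (this comment is not part of the original header),
   the naming macros above expand as follows:
     VECT_NAME(int, 8, 8)             -> int8x8
     VECT_TYPE(int, 8, 8)             -> int8x8_t   (the Neon vector type)
     VECT_VAR(result, int, 8, 8)      -> result_int8x8
     VECT_VAR_DECL(buffer, int, 8, 8) -> int8_t buffer_int8x8
   so every variable name encodes its element type, width and length.  */
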
/* This one is used for padding between input buffers.  */
#define PAD(V, T, W, N) char VECT_VAR(V,T,W,N)=42;

/* Array declarations.  */
#define ARRAY(V, T, W, N) VECT_VAR_DECL(V,T,W,N)[N]
#define ARRAY4(V, T, W, N) VECT_VAR_DECL(V,T,W,N)[4]

/* Arrays of vectors.  */
#define VECT_ARRAY_VAR(V,T,W,N,L) xNAME(V,VECT_ARRAY_NAME(T,W,N,L))
#define VECT_ARRAY(V, T, W, N, L) T##W##_t VECT_ARRAY_VAR(V,T,W,N,L)[N*L]

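/* Illustrative expansions (not part of the original header):
     ARRAY(buffer, int, 8, 8)              -> int8_t buffer_int8x8[8]
     ARRAY4(buffer_dup, int, 32, 2)        -> int32_t buffer_dup_int32x2[4]
     VECT_ARRAY(buffer_vld2, int, 8, 8, 2) -> int8_t buffer_vld2_int8x8x2[8*2]
   i.e. plain C arrays sized to hold N (or N*L) elements.  */
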
static int result_idx = 0;
#define DUMP(MSG,T,W,N,FMT)						\
  fprintf(ref_file, "%s:%d:%s [] = { ", MSG, result_idx++,		\
	  STR(VECT_VAR(result, T, W, N)));				\
  for(i=0; i<N ; i++)							\
    {									\
      fprintf(ref_file, "%" FMT ", ", VECT_VAR(result, T, W, N)[i]);	\
    }									\
  fprintf(ref_file, " }\n");						\
  DUMP4GCC(MSG,T,W,N,FMT);

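/* For illustration (not part of the original header), a call such as
   DUMP("vadd", int, 8, 8, PRId8) writes one reference line to ref_file,
   roughly of the form
     vadd:0:result_int8x8 [] = { 1, 2, 3, 4, 5, 6, 7, 8,  }
   while DUMP4GCC mirrors the same values into gcc_tests_file as a
   VECT_VAR_DECL(expected,...) initializer for the GCC testsuite.  */
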
/* Use casts to remove sign bits.  */
#define DUMP_POLY(MSG,T,W,N,FMT)					\
  fprintf(ref_file, "%s:%d:%s [] = { ", MSG, result_idx++,		\
	  STR(VECT_VAR(result, T, W, N)));				\
  for(i=0; i<N ; i++)							\
    {									\
      fprintf(ref_file, "%" FMT ", ",					\
	      (uint##W##_t)VECT_VAR(result, T, W, N)[i]);		\
    }									\
  fprintf(ref_file, " }\n");						\
  DUMP4GCC(MSG,T,W,N,FMT);

#define DUMP_FP(MSG,T,W,N,FMT)						\
  fprintf(ref_file, "%s:%d:%s [] = { ", MSG, result_idx++,		\
	  STR(VECT_VAR(result, T, W, N)));				\
  for(i=0; i<N ; i++)							\
    {									\
      union fp_operand {						\
	uint##W##_t i;							\
	float##W##_t f;							\
      } tmp;								\
      tmp.f = VECT_VAR(result, T, W, N)[i];				\
      fprintf(ref_file, "%" FMT ", ", tmp.i);				\
    }									\
  fprintf(ref_file, " }\n");						\
  DUMP4GCC_FP(MSG,T,W,N,FMT);

#define DUMP4GCC(MSG,T,W,N,FMT)						\
  fprintf(gcc_tests_file, "VECT_VAR_DECL(expected,%s,%d,%d) [] = { ",	\
	  STR(T), W, N);						\
  for(i=0; i<(N-1) ; i++)						\
    {									\
      if (W < 32) {							\
	uint32_t tmp = (uint##W##_t) VECT_VAR(result, T, W, N)[i];	\
	fprintf(gcc_tests_file, "0x%" FMT ", ", tmp);			\
      } else {								\
	fprintf(gcc_tests_file, "0x%" FMT ", ", VECT_VAR(result, T, W, N)[i]); \
      }									\
    }									\
  if (W < 32) {								\
    uint32_t tmp = (uint##W##_t) VECT_VAR(result, T, W, N)[i];		\
    fprintf(gcc_tests_file, "0x%" FMT, tmp);				\
  } else {								\
    fprintf(gcc_tests_file, "0x%" FMT, VECT_VAR(result, T, W, N)[i]);	\
  }									\
  fprintf(gcc_tests_file, " };\n");

#define DUMP4GCC_FP(MSG,T,W,N,FMT)					\
  {									\
    union fp_operand {							\
      uint##W##_t i;							\
      float##W##_t f;							\
    } tmp;								\
    fprintf(gcc_tests_file, "VECT_VAR_DECL(expected,%s,%d,%d) [] = { ",	\
	    "hfloat", W, N);						\
    for(i=0; i<(N-1) ; i++)						\
      {									\
	tmp.f = VECT_VAR(result, T, W, N)[i];				\
	fprintf(gcc_tests_file, "0x%" FMT ", ", tmp.i);			\
      }									\
    tmp.f = VECT_VAR(result, T, W, N)[i];				\
    fprintf(gcc_tests_file, "0x%" FMT, tmp.i);				\
    fprintf(gcc_tests_file, " };\n");					\
  }

#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
#define float16_t __fp16

#define DUMP_FP16(MSG,T,W,N,FMT)					\
  fprintf(ref_file, "%s:%d:%s [] = { ", MSG, result_idx++,		\
	  STR(VECT_VAR(result, T, W, N)));				\
  for(i=0; i<N ; i++)							\
    {									\
      uint##W##_t tmp;							\
      tmp = (uint##W##_t)VECT_VAR(result, T, W, N)[i];			\
      fprintf(ref_file, "%" FMT ", ", tmp);				\
    }									\
  fprintf(ref_file, " }\n");						\
  DUMP4GCC_FP16(MSG,T,W,N,FMT);

#define DUMP4GCC_FP16(MSG,T,W,N,FMT)					\
  {									\
    uint##W##_t tmp;							\
    fprintf(gcc_tests_file, "VECT_VAR_DECL(expected,%s,%d,%d) [] = { ", \
	    "hfloat", W, N);						\
    for(i=0; i<(N-1) ; i++)						\
      {									\
	tmp = (uint##W##_t)VECT_VAR(result, T, W, N)[i];		\
	fprintf(gcc_tests_file, "0x%" FMT ", ", tmp);			\
      }									\
    tmp = (uint##W##_t)VECT_VAR(result, T, W, N)[i];			\
    fprintf(gcc_tests_file, "0x%" FMT, tmp);				\
    fprintf(gcc_tests_file, " };\n");					\
  }
#endif

#define CLEAN_PATTERN_8  0x33
#define CLEAN_PATTERN_16 0x3333
#define CLEAN_PATTERN_32 0x33333333
#define CLEAN_PATTERN_64 0x3333333333333333

#define CLEAN(VAR,T,W,N)						\
  memset(VECT_VAR(VAR, T, W, N),					\
	 CLEAN_PATTERN_8,						\
	 sizeof(VECT_VAR(VAR, T, W, N)));

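/* Illustrative expansion (not part of the original header):
   CLEAN(result, int, 8, 8) becomes
     memset(result_int8x8, 0x33, sizeof(result_int8x8));
   so output buffers start filled with the 0x33 "clean" pattern, which
   makes bytes that a test never wrote easy to spot in the dumps.  */
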
#define CHECK_INIT(VAR,Q,T1,T2,W,N)					\
  {									\
    ARRAY(check_result, T1, W, N);					\
    int i;								\
									\
    vst1##Q##_##T2##W(VECT_VAR(check_result, T1, W, N),			\
		      VECT_VAR(VAR, T1, W, N));				\
    for(i=0; i<N ; i++)							\
      {									\
	/*if (VECT_VAR(check_result, T1, W, N)[i] == CLEAN_PATTERN_##W)*/ { \
	  fprintf(stdout, "%s:%d: %s[%d] uninitialized! %#x\n",		\
		  __FUNCTION__, __LINE__,				\
		  STR(VECT_VAR(VAR, T1, W, N)), i,			\
		  VECT_VAR(check_result, T1, W, N)[i]);			\
	}								\
      }									\
  }

/* Generic declarations: */
extern FILE* log_file;
extern FILE* ref_file;
extern FILE* gcc_tests_file;

/* Input buffers, one of each size */
extern ARRAY(buffer, int, 8, 8);
extern ARRAY(buffer, int, 16, 4);
extern ARRAY(buffer, int, 32, 2);
extern ARRAY(buffer, int, 64, 1);
extern ARRAY(buffer, uint, 8, 8);
extern ARRAY(buffer, uint, 16, 4);
extern ARRAY(buffer, uint, 32, 2);
extern ARRAY(buffer, uint, 64, 1);
extern ARRAY(buffer, poly, 8, 8);
extern ARRAY(buffer, poly, 16, 4);
extern ARRAY(buffer, float, 32, 2);
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
extern ARRAY(buffer, float, 16, 4);
#endif
extern ARRAY(buffer, int, 8, 16);
extern ARRAY(buffer, int, 16, 8);
extern ARRAY(buffer, int, 32, 4);
extern ARRAY(buffer, int, 64, 2);
extern ARRAY(buffer, uint, 8, 16);
extern ARRAY(buffer, uint, 16, 8);
extern ARRAY(buffer, uint, 32, 4);
extern ARRAY(buffer, uint, 64, 2);
extern ARRAY(buffer, poly, 8, 16);
extern ARRAY(buffer, poly, 16, 8);
extern ARRAY(buffer, float, 32, 4);
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
extern ARRAY(buffer, float, 16, 8);
#endif

/* The tests for vld1_dup and vdup expect at least 4 entries in the
   input buffer, so force 1- and 2-element initializers to have 4
   entries.  */
extern ARRAY(buffer_dup, int, 8, 8);
extern ARRAY(buffer_dup, int, 16, 4);
extern ARRAY4(buffer_dup, int, 32, 2);
extern ARRAY4(buffer_dup, int, 64, 1);
extern ARRAY(buffer_dup, uint, 8, 8);
extern ARRAY(buffer_dup, uint, 16, 4);
extern ARRAY4(buffer_dup, uint, 32, 2);
extern ARRAY4(buffer_dup, uint, 64, 1);
extern ARRAY(buffer_dup, poly, 8, 8);
extern ARRAY(buffer_dup, poly, 16, 4);
extern ARRAY4(buffer_dup, float, 32, 2);
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
extern ARRAY4(buffer_dup, float, 16, 4);
#endif
extern ARRAY(buffer_dup, int, 8, 16);
extern ARRAY(buffer_dup, int, 16, 8);
extern ARRAY(buffer_dup, int, 32, 4);
extern ARRAY4(buffer_dup, int, 64, 2);
extern ARRAY(buffer_dup, uint, 8, 16);
extern ARRAY(buffer_dup, uint, 16, 8);
extern ARRAY(buffer_dup, uint, 32, 4);
extern ARRAY4(buffer_dup, uint, 64, 2);
extern ARRAY(buffer_dup, poly, 8, 16);
extern ARRAY(buffer_dup, poly, 16, 8);
extern ARRAY(buffer_dup, float, 32, 4);
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
extern ARRAY(buffer_dup, float, 16, 8);
#endif

/* Input buffers for vld2, one of each size */
extern VECT_ARRAY(buffer_vld2, int, 8, 8, 2);
extern VECT_ARRAY(buffer_vld2, int, 16, 4, 2);
extern VECT_ARRAY(buffer_vld2, int, 32, 2, 2);
extern VECT_ARRAY(buffer_vld2, int, 64, 1, 2);
extern VECT_ARRAY(buffer_vld2, uint, 8, 8, 2);
extern VECT_ARRAY(buffer_vld2, uint, 16, 4, 2);
extern VECT_ARRAY(buffer_vld2, uint, 32, 2, 2);
extern VECT_ARRAY(buffer_vld2, uint, 64, 1, 2);
extern VECT_ARRAY(buffer_vld2, poly, 8, 8, 2);
extern VECT_ARRAY(buffer_vld2, poly, 16, 4, 2);
extern VECT_ARRAY(buffer_vld2, float, 32, 2, 2);
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
extern VECT_ARRAY(buffer_vld2, float, 16, 4, 2);
#endif
extern VECT_ARRAY(buffer_vld2, int, 8, 16, 2);
extern VECT_ARRAY(buffer_vld2, int, 16, 8, 2);
extern VECT_ARRAY(buffer_vld2, int, 32, 4, 2);
extern VECT_ARRAY(buffer_vld2, int, 64, 2, 2);
extern VECT_ARRAY(buffer_vld2, uint, 8, 16, 2);
extern VECT_ARRAY(buffer_vld2, uint, 16, 8, 2);
extern VECT_ARRAY(buffer_vld2, uint, 32, 4, 2);
extern VECT_ARRAY(buffer_vld2, uint, 64, 2, 2);
extern VECT_ARRAY(buffer_vld2, poly, 8, 16, 2);
extern VECT_ARRAY(buffer_vld2, poly, 16, 8, 2);
extern VECT_ARRAY(buffer_vld2, float, 32, 4, 2);
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
extern VECT_ARRAY(buffer_vld2, float, 16, 8, 2);
#endif

/* Input buffers for vld3, one of each size */
extern VECT_ARRAY(buffer_vld3, int, 8, 8, 3);
extern VECT_ARRAY(buffer_vld3, int, 16, 4, 3);
extern VECT_ARRAY(buffer_vld3, int, 32, 2, 3);
extern VECT_ARRAY(buffer_vld3, int, 64, 1, 3);
extern VECT_ARRAY(buffer_vld3, uint, 8, 8, 3);
extern VECT_ARRAY(buffer_vld3, uint, 16, 4, 3);
extern VECT_ARRAY(buffer_vld3, uint, 32, 2, 3);
extern VECT_ARRAY(buffer_vld3, uint, 64, 1, 3);
extern VECT_ARRAY(buffer_vld3, poly, 8, 8, 3);
extern VECT_ARRAY(buffer_vld3, poly, 16, 4, 3);
extern VECT_ARRAY(buffer_vld3, float, 32, 2, 3);
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
extern VECT_ARRAY(buffer_vld3, float, 16, 4, 3);
#endif
extern VECT_ARRAY(buffer_vld3, int, 8, 16, 3);
extern VECT_ARRAY(buffer_vld3, int, 16, 8, 3);
extern VECT_ARRAY(buffer_vld3, int, 32, 4, 3);
extern VECT_ARRAY(buffer_vld3, int, 64, 2, 3);
extern VECT_ARRAY(buffer_vld3, uint, 8, 16, 3);
extern VECT_ARRAY(buffer_vld3, uint, 16, 8, 3);
extern VECT_ARRAY(buffer_vld3, uint, 32, 4, 3);
extern VECT_ARRAY(buffer_vld3, uint, 64, 2, 3);
extern VECT_ARRAY(buffer_vld3, poly, 8, 16, 3);
extern VECT_ARRAY(buffer_vld3, poly, 16, 8, 3);
extern VECT_ARRAY(buffer_vld3, float, 32, 4, 3);
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
extern VECT_ARRAY(buffer_vld3, float, 16, 8, 3);
#endif

/* Input buffers for vld4, one of each size */
extern VECT_ARRAY(buffer_vld4, int, 8, 8, 4);
extern VECT_ARRAY(buffer_vld4, int, 16, 4, 4);
extern VECT_ARRAY(buffer_vld4, int, 32, 2, 4);
extern VECT_ARRAY(buffer_vld4, int, 64, 1, 4);
extern VECT_ARRAY(buffer_vld4, uint, 8, 8, 4);
extern VECT_ARRAY(buffer_vld4, uint, 16, 4, 4);
extern VECT_ARRAY(buffer_vld4, uint, 32, 2, 4);
extern VECT_ARRAY(buffer_vld4, uint, 64, 1, 4);
extern VECT_ARRAY(buffer_vld4, poly, 8, 8, 4);
extern VECT_ARRAY(buffer_vld4, poly, 16, 4, 4);
extern VECT_ARRAY(buffer_vld4, float, 32, 2, 4);
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
extern VECT_ARRAY(buffer_vld4, float, 16, 4, 4);
#endif
extern VECT_ARRAY(buffer_vld4, int, 8, 16, 4);
extern VECT_ARRAY(buffer_vld4, int, 16, 8, 4);
extern VECT_ARRAY(buffer_vld4, int, 32, 4, 4);
extern VECT_ARRAY(buffer_vld4, int, 64, 2, 4);
extern VECT_ARRAY(buffer_vld4, uint, 8, 16, 4);
extern VECT_ARRAY(buffer_vld4, uint, 16, 8, 4);
extern VECT_ARRAY(buffer_vld4, uint, 32, 4, 4);
extern VECT_ARRAY(buffer_vld4, uint, 64, 2, 4);
extern VECT_ARRAY(buffer_vld4, poly, 8, 16, 4);
extern VECT_ARRAY(buffer_vld4, poly, 16, 8, 4);
extern VECT_ARRAY(buffer_vld4, float, 32, 4, 4);
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
extern VECT_ARRAY(buffer_vld4, float, 16, 8, 4);
#endif

/* Input buffers for vld2_lane */
extern VECT_VAR_DECL(buffer_vld2_lane, int, 8, 2)[2];
extern VECT_VAR_DECL(buffer_vld2_lane, int, 16, 2)[2];
extern VECT_VAR_DECL(buffer_vld2_lane, int, 32, 2)[2];
extern VECT_VAR_DECL(buffer_vld2_lane, int, 64, 2)[2];
extern VECT_VAR_DECL(buffer_vld2_lane, uint, 8, 2)[2];
extern VECT_VAR_DECL(buffer_vld2_lane, uint, 16, 2)[2];
extern VECT_VAR_DECL(buffer_vld2_lane, uint, 32, 2)[2];
extern VECT_VAR_DECL(buffer_vld2_lane, uint, 64, 2)[2];
extern VECT_VAR_DECL(buffer_vld2_lane, poly, 8, 2)[2];
extern VECT_VAR_DECL(buffer_vld2_lane, poly, 16, 2)[2];
extern VECT_VAR_DECL(buffer_vld2_lane, float, 32, 2)[2];
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
extern VECT_VAR_DECL(buffer_vld2_lane, float, 16, 2)[2];
#endif

/* Input buffers for vld3_lane */
extern VECT_VAR_DECL(buffer_vld3_lane, int, 8, 3)[3];
extern VECT_VAR_DECL(buffer_vld3_lane, int, 16, 3)[3];
extern VECT_VAR_DECL(buffer_vld3_lane, int, 32, 3)[3];
extern VECT_VAR_DECL(buffer_vld3_lane, int, 64, 3)[3];
extern VECT_VAR_DECL(buffer_vld3_lane, uint, 8, 3)[3];
extern VECT_VAR_DECL(buffer_vld3_lane, uint, 16, 3)[3];
extern VECT_VAR_DECL(buffer_vld3_lane, uint, 32, 3)[3];
extern VECT_VAR_DECL(buffer_vld3_lane, uint, 64, 3)[3];
extern VECT_VAR_DECL(buffer_vld3_lane, poly, 8, 3)[3];
extern VECT_VAR_DECL(buffer_vld3_lane, poly, 16, 3)[3];
extern VECT_VAR_DECL(buffer_vld3_lane, float, 32, 3)[3];
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
extern VECT_VAR_DECL(buffer_vld3_lane, float, 16, 3)[3];
#endif

/* Input buffers for vld4_lane */
extern VECT_VAR_DECL(buffer_vld4_lane, int, 8, 4)[4];
extern VECT_VAR_DECL(buffer_vld4_lane, int, 16, 4)[4];
extern VECT_VAR_DECL(buffer_vld4_lane, int, 32, 4)[4];
extern VECT_VAR_DECL(buffer_vld4_lane, int, 64, 4)[4];
extern VECT_VAR_DECL(buffer_vld4_lane, uint, 8, 4)[4];
extern VECT_VAR_DECL(buffer_vld4_lane, uint, 16, 4)[4];
extern VECT_VAR_DECL(buffer_vld4_lane, uint, 32, 4)[4];
extern VECT_VAR_DECL(buffer_vld4_lane, uint, 64, 4)[4];
extern VECT_VAR_DECL(buffer_vld4_lane, poly, 8, 4)[4];
extern VECT_VAR_DECL(buffer_vld4_lane, poly, 16, 4)[4];
extern VECT_VAR_DECL(buffer_vld4_lane, float, 32, 4)[4];
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
extern VECT_VAR_DECL(buffer_vld4_lane, float, 16, 4)[4];
#endif

/* Output buffers, one of each size */
static ARRAY(result, int, 8, 8);
static ARRAY(result, int, 16, 4);
static ARRAY(result, int, 32, 2);
static ARRAY(result, int, 64, 1);
static ARRAY(result, uint, 8, 8);
static ARRAY(result, uint, 16, 4);
static ARRAY(result, uint, 32, 2);
static ARRAY(result, uint, 64, 1);
static ARRAY(result, poly, 8, 8);
static ARRAY(result, poly, 16, 4);
static ARRAY(result, float, 32, 2);
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
static ARRAY(result, float, 16, 4);
#endif
static ARRAY(result, int, 8, 16);
static ARRAY(result, int, 16, 8);
static ARRAY(result, int, 32, 4);
static ARRAY(result, int, 64, 2);
static ARRAY(result, uint, 8, 16);
static ARRAY(result, uint, 16, 8);
static ARRAY(result, uint, 32, 4);
static ARRAY(result, uint, 64, 2);
static ARRAY(result, poly, 8, 16);
static ARRAY(result, poly, 16, 8);
static ARRAY(result, float, 32, 4);
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
static ARRAY(result, float, 16, 8);
#endif

/* Dump results (generic function) */
static void dump_results (char *test_name)
{
  int i;

  fprintf(ref_file, "\n%s output:\n", test_name);
  fprintf(gcc_tests_file, "\n%s output:\n", test_name);

  DUMP(test_name, int, 8, 8, PRId8);
  DUMP(test_name, int, 16, 4, PRId16);
  DUMP(test_name, int, 32, 2, PRId32);
  DUMP(test_name, int, 64, 1, PRId64);
  DUMP(test_name, uint, 8, 8, PRIu8);
  DUMP(test_name, uint, 16, 4, PRIu16);
  DUMP(test_name, uint, 32, 2, PRIu32);
  DUMP(test_name, uint, 64, 1, PRIu64);
  DUMP_POLY(test_name, poly, 8, 8, PRIu8);
  DUMP_POLY(test_name, poly, 16, 4, PRIu16);
  DUMP_FP(test_name, float, 32, 2, PRIx32);
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
  DUMP_FP16(test_name, float, 16, 4, PRIu16);
#endif

  DUMP(test_name, int, 8, 16, PRId8);
  DUMP(test_name, int, 16, 8, PRId16);
  DUMP(test_name, int, 32, 4, PRId32);
  DUMP(test_name, int, 64, 2, PRId64);
  DUMP(test_name, uint, 8, 16, PRIu8);
  DUMP(test_name, uint, 16, 8, PRIu16);
  DUMP(test_name, uint, 32, 4, PRIu32);
  DUMP(test_name, uint, 64, 2, PRIu64);
  DUMP_POLY(test_name, poly, 8, 16, PRIu8);
  DUMP_POLY(test_name, poly, 16, 8, PRIu16);
  DUMP_FP(test_name, float, 32, 4, PRIx32);
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
  DUMP_FP16(test_name, float, 16, 8, PRIu16);
#endif
}

/* Dump results in hex (generic function) */
static void dump_results_hex2 (const char *test_name, const char* comment)
{
  int i;

  fprintf(ref_file, "\n%s%s output:\n", test_name, comment);
  fprintf(gcc_tests_file, "\n%s%s output:\n", test_name, comment);

  DUMP(test_name, int, 8, 8, PRIx8);
  DUMP(test_name, int, 16, 4, PRIx16);
  DUMP(test_name, int, 32, 2, PRIx32);
  DUMP(test_name, int, 64, 1, PRIx64);
  DUMP(test_name, uint, 8, 8, PRIx8);
  DUMP(test_name, uint, 16, 4, PRIx16);
  DUMP(test_name, uint, 32, 2, PRIx32);
  DUMP(test_name, uint, 64, 1, PRIx64);
  DUMP_POLY(test_name, poly, 8, 8, PRIx8);
  DUMP_POLY(test_name, poly, 16, 4, PRIx16);
  DUMP_FP(test_name, float, 32, 2, PRIx32);
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
  DUMP_FP16(test_name, float, 16, 4, PRIx16);
#endif

  DUMP(test_name, int, 8, 16, PRIx8);
  DUMP(test_name, int, 16, 8, PRIx16);
  DUMP(test_name, int, 32, 4, PRIx32);
  DUMP(test_name, int, 64, 2, PRIx64);
  DUMP(test_name, uint, 8, 16, PRIx8);
  DUMP(test_name, uint, 16, 8, PRIx16);
  DUMP(test_name, uint, 32, 4, PRIx32);
  DUMP(test_name, uint, 64, 2, PRIx64);
  DUMP_POLY(test_name, poly, 8, 16, PRIx8);
  DUMP_POLY(test_name, poly, 16, 8, PRIx16);
  DUMP_FP(test_name, float, 32, 4, PRIx32);
#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
  DUMP_FP16(test_name, float, 16, 8, PRIx16);
#endif
}

static void dump_results_hex (const char *test_name)
{
  dump_results_hex2(test_name, "");
}

#ifndef STM_ARM_NEON_MODELS

/* This hack copes with compilers/libcs which may not provide endian.h,
   and with cross-compilers such as llvm which include the host's
   endian.h.  */
#ifndef __arm__
#include <endian.h>
#define THIS_ENDIAN __BYTE_ORDER
#else /* __arm__ */
#ifdef __ARMEL__
#define THIS_ENDIAN __LITTLE_ENDIAN
#else /* __ARMEL__ */
#define THIS_ENDIAN __BIG_ENDIAN
#endif
#endif /* __arm__ */

#if THIS_ENDIAN == __LITTLE_ENDIAN

typedef union {
  struct {
    int _xxx:27;
    unsigned int QC:1;
    int V:1;
    int C:1;
    int Z:1;
    int N:1;
  } b;
  unsigned int word;
} _ARM_FPSCR;

#else /* __BIG_ENDIAN */

typedef union {
  struct {
    int N:1;
    int Z:1;
    int C:1;
    int V:1;
    unsigned int QC:1;
    int _dnm:27;
  } b;
  unsigned int word;
} _ARM_FPSCR;

#endif /* __BIG_ENDIAN */

#ifdef __ARMCC_VERSION
register _ARM_FPSCR _afpscr_for_qc __asm("fpscr");
# define Neon_Cumulative_Sat _afpscr_for_qc.b.QC
# define Set_Neon_Cumulative_Sat(x, depend)  {Neon_Cumulative_Sat = (x);}
#else
/* GCC/ARM does not know this register by name. */
# define Neon_Cumulative_Sat  __read_neon_cumulative_sat()
/* We need a fake dependency to ensure correct ordering between the asm
   statements that preset the QC flag value and the Neon operators
   writing to QC. */
#define Set_Neon_Cumulative_Sat(x, depend)	\
  __set_neon_cumulative_sat((x), (depend))

# if defined(__aarch64__)
static volatile int __read_neon_cumulative_sat (void) {
    _ARM_FPSCR _afpscr_for_qc;
    asm volatile ("mrs %0,fpsr" : "=r" (_afpscr_for_qc));
    return _afpscr_for_qc.b.QC;
}

#define __set_neon_cumulative_sat(x, depend) {				\
    _ARM_FPSCR _afpscr_for_qc;						\
    asm volatile ("mrs %0,fpsr" : "=r" (_afpscr_for_qc));		\
    _afpscr_for_qc.b.QC = x;						\
    asm volatile ("msr fpsr,%1" : "=X" (depend) : "r" (_afpscr_for_qc)); \
  }

# else
static volatile int __read_neon_cumulative_sat (void) {
    _ARM_FPSCR _afpscr_for_qc;
    asm volatile ("vmrs %0,fpscr" : "=r" (_afpscr_for_qc));
    return _afpscr_for_qc.b.QC;
}

#define __set_neon_cumulative_sat(x, depend) {				\
    _ARM_FPSCR _afpscr_for_qc;						\
    asm volatile ("vmrs %0,fpscr" : "=r" (_afpscr_for_qc));		\
    _afpscr_for_qc.b.QC = x;						\
    asm volatile ("vmsr fpscr,%1" : "=X" (depend) : "r" (_afpscr_for_qc)); \
  }

# endif
#endif

#endif /* STM_ARM_NEON_MODELS */

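/* Illustrative usage (not part of the original header): a saturating
   test typically clears QC before running the intrinsic and reads it
   back afterwards, e.g.
     Set_Neon_Cumulative_Sat(0, some_output_variable);
     ... execute a saturating intrinsic such as vqadd ...
     if (Neon_Cumulative_Sat) ... saturation occurred ...
   where some_output_variable is just an example name and only serves
   as the fake dependency described above.  */
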
static void dump_neon_cumulative_sat(const char* msg, const char *name,
				     const char* t1, int w, int n)
{
  fprintf(ref_file, "%s:%d:%s Neon cumulative saturation %d\n", msg, result_idx++,
	  name, Neon_Cumulative_Sat);
  fprintf(gcc_tests_file,
	  "int VECT_VAR(expected_cumulative_sat,%s,%d,%d) = %d;\n",
	  t1, w, n, Neon_Cumulative_Sat);
}

/* Clean output buffers before execution */
static void clean_results (void)
{
  result_idx = 0;
  CLEAN(result, int, 8, 8);
  CLEAN(result, int, 16, 4);
  CLEAN(result, int, 32, 2);
  CLEAN(result, int, 64, 1);
  CLEAN(result, uint, 8, 8);
  CLEAN(result, uint, 16, 4);
  CLEAN(result, uint, 32, 2);
  CLEAN(result, uint, 64, 1);
  CLEAN(result, poly, 8, 8);
  CLEAN(result, poly, 16, 4);
  CLEAN(result, float, 32, 2);

  CLEAN(result, int, 8, 16);
  CLEAN(result, int, 16, 8);
  CLEAN(result, int, 32, 4);
  CLEAN(result, int, 64, 2);
  CLEAN(result, uint, 8, 16);
  CLEAN(result, uint, 16, 8);
  CLEAN(result, uint, 32, 4);
  CLEAN(result, uint, 64, 2);
  CLEAN(result, poly, 8, 16);
  CLEAN(result, poly, 16, 8);
  CLEAN(result, float, 32, 4);
}


/* Helpers to declare variables of various types  */
#define DECL_VARIABLE(VAR, T1, W, N)		\
  volatile VECT_TYPE(T1, W, N) VECT_VAR(VAR, T1, W, N)

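/* Illustrative expansion (not part of the original header):
   DECL_VARIABLE(vector, int, 8, 8) becomes
     volatile int8x8_t vector_int8x8
   i.e. a Neon vector variable whose name encodes its type; 'vector' is
   just an example name.  */
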
#define DECL_VARIABLE_64BITS_SIGNED_VARIANTS(VAR)	\
  DECL_VARIABLE(VAR, int, 8, 8);			\
  DECL_VARIABLE(VAR, int, 16, 4);			\
  DECL_VARIABLE(VAR, int, 32, 2);			\
  DECL_VARIABLE(VAR, int, 64, 1)

#define DECL_VARIABLE_64BITS_UNSIGNED_VARIANTS(VAR)	\
  DECL_VARIABLE(VAR, uint, 8, 8);			\
  DECL_VARIABLE(VAR, uint, 16, 4);			\
  DECL_VARIABLE(VAR, uint, 32, 2);			\
  DECL_VARIABLE(VAR, uint, 64, 1)

#define DECL_VARIABLE_128BITS_SIGNED_VARIANTS(VAR)	\
  DECL_VARIABLE(VAR, int, 8, 16);			\
  DECL_VARIABLE(VAR, int, 16, 8);			\
  DECL_VARIABLE(VAR, int, 32, 4);			\
  DECL_VARIABLE(VAR, int, 64, 2)

#define DECL_VARIABLE_128BITS_UNSIGNED_VARIANTS(VAR)	\
  DECL_VARIABLE(VAR, uint, 8, 16);			\
  DECL_VARIABLE(VAR, uint, 16, 8);			\
  DECL_VARIABLE(VAR, uint, 32, 4);			\
  DECL_VARIABLE(VAR, uint, 64, 2)

#define DECL_VARIABLE_64BITS_VARIANTS(VAR)	\
  DECL_VARIABLE_64BITS_SIGNED_VARIANTS(VAR);	\
  DECL_VARIABLE_64BITS_UNSIGNED_VARIANTS(VAR);	\
  DECL_VARIABLE(VAR, poly, 8, 8);		\
  DECL_VARIABLE(VAR, poly, 16, 4);		\
  DECL_VARIABLE(VAR, float, 32, 2)

#define DECL_VARIABLE_128BITS_VARIANTS(VAR)	\
  DECL_VARIABLE_128BITS_SIGNED_VARIANTS(VAR);	\
  DECL_VARIABLE_128BITS_UNSIGNED_VARIANTS(VAR);	\
  DECL_VARIABLE(VAR, poly, 8, 16);		\
  DECL_VARIABLE(VAR, poly, 16, 8);		\
  DECL_VARIABLE(VAR, float, 32, 4)

#define DECL_VARIABLE_ALL_VARIANTS(VAR)		\
  DECL_VARIABLE_64BITS_VARIANTS(VAR);		\
  DECL_VARIABLE_128BITS_VARIANTS(VAR)

#define DECL_VARIABLE_SIGNED_VARIANTS(VAR)	\
  DECL_VARIABLE_64BITS_SIGNED_VARIANTS(VAR);	\
  DECL_VARIABLE_128BITS_SIGNED_VARIANTS(VAR)

#define DECL_VARIABLE_UNSIGNED_VARIANTS(VAR)	\
  DECL_VARIABLE_64BITS_UNSIGNED_VARIANTS(VAR);	\
  DECL_VARIABLE_128BITS_UNSIGNED_VARIANTS(VAR)

/* Helpers to initialize vectors */
#define VDUP(VAR, Q, T1, T2, W, N, V)		\
  VECT_VAR(VAR, T1, W, N) = vdup##Q##_n_##T2##W(V)

#define TEST_VSET_LANE(VAR, Q, T1, T2, W, N, L, V)			\
  VECT_VAR(VAR, T1, W, N) = vset##Q##_lane_##T2##W(V,			\
						   VECT_VAR(VAR, T1, W, N), \
						   L)

/* We need to load initial values first, so rely on VLD1 */
#define VLOAD(VAR, BUF, Q, T1, T2, W, N)				\
  VECT_VAR(VAR, T1, W, N) = vld1##Q##_##T2##W(VECT_VAR(BUF, T1, W, N))

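/* Illustrative expansions (not part of the original header; 'vector'
   and 'buffer' are example names):
     VDUP(vector, , int, s, 8, 8, 42)     -> vector_int8x8 = vdup_n_s8(42)
     VDUP(vector, q, uint, u, 32, 4, 3)   -> vector_uint32x4 = vdupq_n_u32(3)
     VLOAD(vector, buffer, , int, s, 8, 8) -> vector_int8x8 = vld1_s8(buffer_int8x8)
   The empty/q argument selects the 64-bit or 128-bit ("q") intrinsic.  */
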
/* Helpers for macros with 1 constant and 5 variable arguments */
#define TEST_MACRO_64BITS_SIGNED_VARIANTS_1_5(MACRO, VAR)	\
  MACRO(VAR, , int, s, 8, 8);					\
  MACRO(VAR, , int, s, 16, 4);					\
  MACRO(VAR, , int, s, 32, 2);					\
  MACRO(VAR, , int, s, 64, 1)

#define TEST_MACRO_64BITS_UNSIGNED_VARIANTS_1_5(MACRO, VAR)	\
  MACRO(VAR, , uint, u, 8, 8);					\
  MACRO(VAR, , uint, u, 16, 4);					\
  MACRO(VAR, , uint, u, 32, 2);					\
  MACRO(VAR, , uint, u, 64, 1)

#define TEST_MACRO_128BITS_SIGNED_VARIANTS_1_5(MACRO, VAR)	\
  MACRO(VAR, q, int, s, 8, 16);					\
  MACRO(VAR, q, int, s, 16, 8);					\
  MACRO(VAR, q, int, s, 32, 4);					\
  MACRO(VAR, q, int, s, 64, 2)

#define TEST_MACRO_128BITS_UNSIGNED_VARIANTS_1_5(MACRO,VAR)	\
  MACRO(VAR, q, uint, u, 8, 16);				\
  MACRO(VAR, q, uint, u, 16, 8);				\
  MACRO(VAR, q, uint, u, 32, 4);				\
  MACRO(VAR, q, uint, u, 64, 2)

#define TEST_MACRO_64BITS_VARIANTS_1_5(MACRO, VAR)	\
  TEST_MACRO_64BITS_SIGNED_VARIANTS_1_5(MACRO, VAR);	\
  TEST_MACRO_64BITS_UNSIGNED_VARIANTS_1_5(MACRO, VAR)

#define TEST_MACRO_128BITS_VARIANTS_1_5(MACRO, VAR)	\
  TEST_MACRO_128BITS_SIGNED_VARIANTS_1_5(MACRO, VAR);	\
  TEST_MACRO_128BITS_UNSIGNED_VARIANTS_1_5(MACRO, VAR)

#define TEST_MACRO_ALL_VARIANTS_1_5(MACRO, VAR)	\
  TEST_MACRO_64BITS_VARIANTS_1_5(MACRO, VAR);	\
  TEST_MACRO_128BITS_VARIANTS_1_5(MACRO, VAR)

#define TEST_MACRO_SIGNED_VARIANTS_1_5(MACRO, VAR)	\
  TEST_MACRO_64BITS_SIGNED_VARIANTS_1_5(MACRO, VAR);	\
  TEST_MACRO_128BITS_SIGNED_VARIANTS_1_5(MACRO, VAR)

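/* Illustrative usage (not part of the original header):
   TEST_MACRO_ALL_VARIANTS_1_5(CHECK_INIT, vector) expands to one
   CHECK_INIT(vector, <q or empty>, <type>, <suffix>, <width>, <len>)
   instance per integer variant, e.g.
     CHECK_INIT(vector, , int, s, 8, 8);
     ...
     CHECK_INIT(vector, q, uint, u, 64, 2);
   with 'vector' being just an example variable prefix.  */
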
/* Helpers for macros with 2 constant and 5 variable arguments */
#define TEST_MACRO_64BITS_SIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2)	\
  MACRO(VAR1, VAR2, , int, s, 8, 8);					\
  MACRO(VAR1, VAR2, , int, s, 16, 4);					\
  MACRO(VAR1, VAR2, , int, s, 32, 2);					\
  MACRO(VAR1, VAR2, , int, s, 64, 1)

#define TEST_MACRO_64BITS_UNSIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2)	\
  MACRO(VAR1, VAR2, , uint, u, 8, 8);					\
  MACRO(VAR1, VAR2, , uint, u, 16, 4);					\
  MACRO(VAR1, VAR2, , uint, u, 32, 2);					\
  MACRO(VAR1, VAR2, , uint, u, 64, 1)

#define TEST_MACRO_128BITS_SIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2)	\
  MACRO(VAR1, VAR2, q, int, s, 8, 16);					\
  MACRO(VAR1, VAR2, q, int, s, 16, 8);					\
  MACRO(VAR1, VAR2, q, int, s, 32, 4);					\
  MACRO(VAR1, VAR2, q, int, s, 64, 2)

#define TEST_MACRO_128BITS_UNSIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2)	\
  MACRO(VAR1, VAR2, q, uint, u, 8, 16);					\
  MACRO(VAR1, VAR2, q, uint, u, 16, 8);					\
  MACRO(VAR1, VAR2, q, uint, u, 32, 4);					\
  MACRO(VAR1, VAR2, q, uint, u, 64, 2)

#define TEST_MACRO_64BITS_VARIANTS_2_5(MACRO, VAR1, VAR2)	\
  TEST_MACRO_64BITS_SIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2);	\
  TEST_MACRO_64BITS_UNSIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2);	\
  MACRO(VAR1, VAR2, , poly, p, 8, 8);				\
  MACRO(VAR1, VAR2, , poly, p, 16, 4)

#define TEST_MACRO_128BITS_VARIANTS_2_5(MACRO, VAR1, VAR2)	\
  TEST_MACRO_128BITS_SIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2);	\
  TEST_MACRO_128BITS_UNSIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2);	\
  MACRO(VAR1, VAR2, q, poly, p, 8, 16);				\
  MACRO(VAR1, VAR2, q, poly, p, 16, 8)

#define TEST_MACRO_ALL_VARIANTS_2_5(MACRO, VAR1, VAR2)	\
  TEST_MACRO_64BITS_VARIANTS_2_5(MACRO, VAR1, VAR2);	\
  TEST_MACRO_128BITS_VARIANTS_2_5(MACRO, VAR1, VAR2)

#define TEST_MACRO_SIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2)	\
  TEST_MACRO_64BITS_SIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2);	\
  TEST_MACRO_128BITS_SIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2)

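/* Illustrative usage (not part of the original header; 'vector' and
   'buffer' are example names): TEST_MACRO_ALL_VARIANTS_2_5(VLOAD,
   vector, buffer) loads every integer and polynomial variant in one
   go, expanding to
     vector_int8x8 = vld1_s8(buffer_int8x8);
     ...
     vector_poly16x8 = vld1q_p16(buffer_poly16x8);  */
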
#endif /* _STM_ARM_NEON_REF_H_ */