1 /*
2  * Mesa 3-D graphics library
3  * Version:  6.1
4  *
5  * Copyright (C) 1999-2004  Brian Paul   All Rights Reserved.
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a
8  * copy of this software and associated documentation files (the "Software"),
9  * to deal in the Software without restriction, including without limitation
10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11  * and/or sell copies of the Software, and to permit persons to whom the
12  * Software is furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included
15  * in all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
20  * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
21  * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23  *
24  * Authors:
25  *    Gareth Hughes
26  */
27 
28 #ifndef __M_DEBUG_UTIL_H__
29 #define __M_DEBUG_UTIL_H__
30 
31 
32 #ifdef DEBUG_MATH  /* This code only used for debugging */
33 
34 
/* Cycle-counter benchmarking switch.
 *
 * RUN_DEBUG_BENCHMARK is turned on only for GCC builds that target i386
 * with USE_X86_ASM, or SPARC with USE_SPARC_ASM.  Comment the #define out
 * to deactivate the cycle counter.
 * NOTE: on x86 the counter is read with the 'rdtsc' instruction, which
 * only exists on 586-class or newer CPUs (hopefully nobody tries to debug
 * Mesa on a 386 ;)
 */
#if defined(__GNUC__)
#  if (defined(__i386__) && defined(USE_X86_ASM)) || \
      (defined(__sparc__) && defined(USE_SPARC_ASM))
#    define RUN_DEBUG_BENCHMARK
#  endif
#endif
44 
#define TEST_COUNT		128	/* size of the tested vector array   */

/* Precision thresholds.  MAX_PRECISION is the value significand_match()
 * reports when its two arguments are exactly equal.
 * NOTE(review): the remark "allow 4 bits to miss" does not obviously
 * follow from 24 - 10; confirm how callers use REQUIRED_PRECISION.
 */
#define REQUIRED_PRECISION	10	/* allow 4 bits to miss              */
#define MAX_PRECISION		24	/* max. precision possible           */
49 
50 
51 #ifdef  RUN_DEBUG_BENCHMARK
/* Overhead of the profiling counter, in cycles.  It is adjusted to the
 * machine at run time by INIT_COUNTER() - counter initialization should
 * give very consistent results - and END_RACE() subtracts it from each
 * measurement.  Declared here only; this header does not define it.
 */
extern long counter_overhead;

/* Value of the MESA_PROFILE environment variable.  It is used to decide
 * whether the functions are benchmarked in addition to having their
 * correctness verified.  Declared here only; this header does not
 * define it.
 */
extern char *mesa_profile;
63 
64 /* Modify the number of tests if you like.
65  * We take the minimum of all results, because every error should be
66  * positive (time used by other processes, task switches etc).
67  * It is assumed that all calculations are done in the cache.
68  */
69 
70 #if defined(__i386__)
71 
72 #if 1 /* PPro, PII, PIII version */
73 
74 /* Profiling on the P6 architecture requires a little more work, due to
75  * the internal out-of-order execution.  We must perform a serializing
76  * 'cpuid' instruction before and after the 'rdtsc' instructions to make
77  * sure no other uops are executed when we sample the timestamp counter.
78  */
/* Timing macros, P6 flavour.
 *
 * Every timestamp is taken as  cpuid ; rdtsc ; store ; cpuid  and only
 * the low 32 bits of the counter (%eax) are kept.  The temporaries are
 * unsigned so that the subtraction stays well defined when those 32 bits
 * wrap around between two samples.
 *
 * 'cpuid' overwrites %ebx, which the compiler may be relying on (it is
 * the GOT pointer in PIC code), so %ebx is parked in %esi - declared as a
 * clobber - while the instruction runs.  The stack pointer is
 * deliberately left alone inside the asm: the timestamps are stored
 * through "=m" operands, and the compiler is free to address such an
 * operand relative to %esp, so a push/pop pair around the store would
 * send it to the wrong stack slot.
 *
 * INIT_COUNTER()  stores in counter_overhead the smallest of 8
 *                 back-to-back timestamp differences.
 * BEGIN_RACE(x)   opens a loop of 10 runs and samples the start time.
 *                 The caller must provide an integer 'cycle_i' and must
 *                 close the loop with END_RACE(x).
 * END_RACE(x)     samples the end time, keeps the smallest difference in
 *                 x, closes the loop and subtracts counter_overhead.
 */
#define  INIT_COUNTER()							\
   do {									\
      int cycle_i;							\
      counter_overhead = LONG_MAX;					\
      for ( cycle_i = 0 ; cycle_i < 8 ; cycle_i++ ) {			\
	 unsigned long cycle_tmp1 = 0, cycle_tmp2 = 0;			\
	 __asm__ __volatile__ ( "mov %%ebx, %%esi \n"			\
				"xor %%eax, %%eax \n"			\
				"cpuid            \n"			\
				"rdtsc            \n"			\
				"mov %%eax, %0    \n"			\
				"xor %%eax, %%eax \n"			\
				"cpuid            \n"			\
				"mov %%esi, %%ebx \n"			\
				"mov %%ebx, %%esi \n"			\
				"xor %%eax, %%eax \n"			\
				"cpuid            \n"			\
				"rdtsc            \n"			\
				"mov %%eax, %1    \n"			\
				"xor %%eax, %%eax \n"			\
				"cpuid            \n"			\
				"mov %%esi, %%ebx \n"			\
				: "=m" (cycle_tmp1), "=m" (cycle_tmp2)	\
				: : "eax", "ecx", "edx", "esi" );	\
	 if ( counter_overhead > (cycle_tmp2 - cycle_tmp1) ) {		\
	    counter_overhead = cycle_tmp2 - cycle_tmp1;			\
	 }								\
      }									\
   } while (0)

#define  BEGIN_RACE(x)							\
   (x) = LONG_MAX;							\
   for ( cycle_i = 0 ; cycle_i < 10 ; cycle_i++ ) {			\
      unsigned long cycle_tmp1 = 0, cycle_tmp2 = 0;			\
      __asm__ __volatile__ ( "mov %%ebx, %%esi \n"			\
			     "xor %%eax, %%eax \n"			\
			     "cpuid            \n"			\
			     "rdtsc            \n"			\
			     "mov %%eax, %0    \n"			\
			     "xor %%eax, %%eax \n"			\
			     "cpuid            \n"			\
			     "mov %%esi, %%ebx \n"			\
			     : "=m" (cycle_tmp1)			\
			     : : "eax", "ecx", "edx", "esi" );

#define END_RACE(x)							\
      __asm__ __volatile__ ( "mov %%ebx, %%esi \n"			\
			     "xor %%eax, %%eax \n"			\
			     "cpuid            \n"			\
			     "rdtsc            \n"			\
			     "mov %%eax, %0    \n"			\
			     "xor %%eax, %%eax \n"			\
			     "cpuid            \n"			\
			     "mov %%esi, %%ebx \n"			\
			     : "=m" (cycle_tmp2)			\
			     : : "eax", "ecx", "edx", "esi" );		\
      if ( (x) > (cycle_tmp2 - cycle_tmp1) ) {				\
	 (x) = cycle_tmp2 - cycle_tmp1;					\
      }									\
   }									\
   (x) -= counter_overhead;
140 
141 #else /* PPlain, PMMX version */
142 
143 /* To ensure accurate results, we stall the pipelines with the
144  * non-pairable 'cdq' instruction.  This ensures all the code being
145  * profiled is complete when the 'rdtsc' instruction executes.
146  */
/* Timing macros, plain Pentium / Pentium MMX flavour.
 *
 * Each sample is a single volatile asm containing  cdq ; cdq ; rdtsc.
 * Keeping the three instructions in one statement guarantees they stay
 * adjacent, '__volatile__' stops the compiler from merging or hoisting
 * the otherwise identical 'rdtsc' statements, and the "=d" output covers
 * the fact that both 'cdq' and 'rdtsc' overwrite %edx.  Only the low
 * word of the counter (%eax) is used; the temporaries are unsigned so
 * the subtraction stays well defined if that word wraps.
 *
 * INIT_COUNTER(x) stores in x the smallest of 32 back-to-back timestamp
 *                 differences.
 *                 NOTE(review): the other variants in this file take no
 *                 argument and write counter_overhead directly; callers
 *                 would need adjusting before this variant is enabled.
 * BEGIN_RACE(x)   opens a loop of 16 runs and samples the start time.
 *                 The caller must provide an integer 'cycle_i' and must
 *                 close the loop with END_RACE(x).
 * END_RACE(x)     samples the end time, keeps the smallest difference in
 *                 x, closes the loop and subtracts counter_overhead.
 */
#define  INIT_COUNTER(x)						\
   do {									\
      int cycle_i;							\
      (x) = LONG_MAX;							\
      for ( cycle_i = 0 ; cycle_i < 32 ; cycle_i++ ) {			\
	 unsigned long cycle_tmp1 = 0, cycle_tmp2 = 0, dummy = 0;	\
	 __asm__ __volatile__ ( "cdq   \n"				\
				"cdq   \n"				\
				"rdtsc \n"				\
				: "=a" (cycle_tmp1), "=d" (dummy) );	\
	 __asm__ __volatile__ ( "cdq   \n"				\
				"cdq   \n"				\
				"rdtsc \n"				\
				: "=a" (cycle_tmp2), "=d" (dummy) );	\
	 (void) dummy;							\
	 if ( (x) > (cycle_tmp2 - cycle_tmp1) )				\
	    (x) = cycle_tmp2 - cycle_tmp1;				\
      }									\
   } while (0)

#define  BEGIN_RACE(x)							\
   (x) = LONG_MAX;							\
   for ( cycle_i = 0 ; cycle_i < 16 ; cycle_i++ ) {			\
      unsigned long cycle_tmp1 = 0, cycle_tmp2 = 0, dummy = 0;		\
      __asm__ __volatile__ ( "cdq   \n"					\
			     "cdq   \n"					\
			     "rdtsc \n"					\
			     : "=a" (cycle_tmp1), "=d" (dummy) );


#define END_RACE(x)							\
      __asm__ __volatile__ ( "cdq   \n"					\
			     "cdq   \n"					\
			     "rdtsc \n"					\
			     : "=a" (cycle_tmp2), "=d" (dummy) );	\
      (void) dummy;							\
      if ( (x) > (cycle_tmp2 - cycle_tmp1) )				\
	 (x) = cycle_tmp2 - cycle_tmp1;					\
   }									\
   (x) -= counter_overhead;
185 
186 #endif
187 
188 #elif defined(__x86_64__)
189 
/* rdtscll(val): read the time-stamp counter into the lvalue 'val'.
 *
 * 'rdtsc' returns the counter split across %eax (low half) and %edx
 * (high half).  The halves are joined in an unsigned long long so the
 * 32-bit shift is well defined even where 'long' is only 32 bits wide,
 * and the temporaries carry a suffix so that a caller variable called
 * 'a' or 'd' is not captured by the macro.
 */
#define rdtscll(val) do { \
     unsigned int rdtsc_lo_, rdtsc_hi_; \
     __asm__ __volatile__("rdtsc" : "=a" (rdtsc_lo_), "=d" (rdtsc_hi_)); \
     (val) = ((unsigned long long)rdtsc_lo_) | \
             (((unsigned long long)rdtsc_hi_) << 32); \
} while(0)

/* Modelled on the i386 P6 macros, but without the serializing 'cpuid'
 * around each counter read.
 *
 * INIT_COUNTER()  stores in counter_overhead the smallest of 16
 *                 back-to-back timestamp differences.
 * BEGIN_RACE(x)   opens a loop of 10 runs and samples the start time.
 *                 The caller must provide an integer 'cycle_i' and must
 *                 close the loop with END_RACE(x).
 * END_RACE(x)     samples the end time, keeps the smallest difference in
 *                 x, closes the loop and subtracts counter_overhead.
 */
#define  INIT_COUNTER()							\
   do {									\
      int cycle_i;							\
      counter_overhead = LONG_MAX;					\
      for ( cycle_i = 0 ; cycle_i < 16 ; cycle_i++ ) {			\
	 unsigned long cycle_tmp1, cycle_tmp2;				\
	 rdtscll(cycle_tmp1);						\
	 rdtscll(cycle_tmp2);						\
	 if ( counter_overhead > (cycle_tmp2 - cycle_tmp1) ) {		\
	    counter_overhead = cycle_tmp2 - cycle_tmp1;			\
	 }								\
      }									\
   } while (0)


#define  BEGIN_RACE(x)							\
   (x) = LONG_MAX;							\
   for ( cycle_i = 0 ; cycle_i < 10 ; cycle_i++ ) {			\
      unsigned long cycle_tmp1, cycle_tmp2;				\
      rdtscll(cycle_tmp1);

#define END_RACE(x)							\
      rdtscll(cycle_tmp2);						\
      if ( (x) > (cycle_tmp2 - cycle_tmp1) ) {				\
	 (x) = cycle_tmp2 - cycle_tmp1;					\
      }									\
   }									\
   (x) -= counter_overhead;
225 
226 #elif defined(__sparc__)
227 
/* Timing macros, SPARC flavour.
 *
 * INIT_COUNTER()  sets counter_overhead to the fixed value 5.
 * BEGIN_RACE(x)   opens a loop of 10 runs and reads the start time into a
 *                 variable pinned to %l0.  The caller must provide an
 *                 integer 'cycle_i' and close the loop with END_RACE(x).
 * END_RACE(x)     reads the end time into a variable pinned to %l1, keeps
 *                 the smallest difference in x, closes the loop and
 *                 subtracts counter_overhead.
 *
 * The counter reads are emitted as raw instruction words; the mnemonic
 * each word stands for is given next to it.
 */
#define  INIT_COUNTER()							\
   do {									\
      counter_overhead = 5;						\
   } while (0)

#define  BEGIN_RACE(x)							\
   x = LONG_MAX;							\
   for ( cycle_i = 0 ; cycle_i < 10 ; cycle_i++ ) {			\
      register long cycle_tmp1 __asm__("l0");				\
      register long cycle_tmp2 __asm__("l1");				\
      /* rd %tick, %l0 - save the starting timestamp */			\
      __asm__ __volatile__ (".word 0xa1410000" : "=r" (cycle_tmp1));

#define  END_RACE(x)							\
      /* rd %tick, %l1 - save the ending timestamp */			\
      __asm__ __volatile__ (".word 0xa3410000" : "=r" (cycle_tmp2));	\
      if ( x > (cycle_tmp2 - cycle_tmp1) ) {				\
	 x = cycle_tmp2 - cycle_tmp1;					\
      }									\
   }									\
   x -= counter_overhead;
245 
246 #else
247 #error Your processor is not supported for RUN_XFORM_BENCHMARK
248 #endif
249 
250 #else
251 
/* Benchmarking is disabled: both race markers expand to nothing, so the
 * code placed between them runs exactly once and x is left untouched.
 */
#define BEGIN_RACE(x)	/* no-op */
#define END_RACE(x)	/* no-op */
254 
255 #endif
256 
257 
258 /* =============================================================
259  * Helper functions
260  */
261 
rnd(void)262 static GLfloat rnd( void )
263 {
264    GLfloat f = (GLfloat)rand() / (GLfloat)RAND_MAX;
265    GLfloat gran = (GLfloat)(1 << 13);
266 
267    f = (GLfloat)(GLint)(f * gran) / gran;
268 
269    return f * 2.0 - 1.0;
270 }
271 
/**
 * Score how closely two floats agree.
 *
 * \param a  first value
 * \param b  second value
 * \return MAX_PRECISION when a and b are equal, 0 when exactly one of
 *         them is zero, otherwise the smaller of the two exponents
 *         minus the exponent of (a - b), all as reported by FREXPF.
 *
 * NOTE(review): FREXPF is defined outside this header; it is presumably
 * frexpf-like (stores the binary exponent through its second argument) -
 * confirm.
 */
static int significand_match( GLfloat a, GLfloat b )
{
   GLfloat diff = a - b;
   int exp_a, exp_b, exp_diff;

   /* Exact match */
   if ( diff == 0.0F )
      return MAX_PRECISION;

   /* It would probably be better to check whether the non-zero number
    * is denormalized and return the index of its highest set bit here.
    */
   if ( a == 0.0F || b == 0.0F )
      return 0;

   FREXPF( a, &exp_a );
   FREXPF( b, &exp_b );
   FREXPF( diff, &exp_diff );

   return ( exp_a < exp_b ? exp_a : exp_b ) - exp_diff;
}
299 
/* Symbolic element codes.
 * NOTE(review): presumably these tag template entries as zero, one,
 * minus one or "variable" - confirm against the code that uses them.
 */
enum {
   NEG = -1,
   NIL =  0,
   ONE =  1,
   VAR =  2
};
301 
/* ALIGN16(type, array): declare 'array' with the given 'type' (which may
 * include a storage class), 16-byte aligned where the compiler has a way
 * to express it.  The caller supplies the terminating semicolon.
 *
 * Every branch expands to a complete declaration, so an invocation still
 * compiles - merely without the alignment guarantee - on compilers that
 * cannot honour the request.
 */
#if defined(__GNUC__)
#  define ALIGN16(type, array)	type array __attribute__ ((aligned (16)))
#elif defined(_MSC_VER)
   /* __declspec(align(#)) has to come before the declarator. */
#  define ALIGN16(type, array)	__declspec(align(16)) type array
#elif defined(__WATCOMC__)
   /* Watcom does not support alignment requests: plain declaration. */
#  define ALIGN16(type, array)	type array
#elif defined(__xlC__)
#  define ALIGN16(type, array)	type __align (16) array
#else
#  warning "ALIGN16 will not 16-byte align!\n"
#  define ALIGN16(type, array)	type array
#endif
316 
317 
318 #endif /* DEBUG_MATH */
319 
320 #endif /* __M_DEBUG_UTIL_H__ */
321