1 /* longlong.h -- definitions for mixed size 32/64 bit arithmetic.
2    Copyright (C) 1991-2014 Free Software Foundation, Inc.
3 
4    This file is part of the GNU C Library.
5 
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10 
11    In addition to the permissions in the GNU Lesser General Public
12    License, the Free Software Foundation gives you unlimited
13    permission to link the compiled version of this file into
14    combinations with other programs, and to distribute those
15    combinations without any restriction coming from the use of this
16    file.  (The Lesser General Public License restrictions do apply in
17    other respects; for example, they cover modification of the file,
18    and distribution when not linked into a combine executable.)
19 
20    The GNU C Library is distributed in the hope that it will be useful,
21    but WITHOUT ANY WARRANTY; without even the implied warranty of
22    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
23    Lesser General Public License for more details.
24 
25    You should have received a copy of the GNU Lesser General Public
26    License along with the GNU C Library; if not, see
27    <http://www.gnu.org/licenses/>.  */
28 
29 /* You have to define the following before including this file:
30 
31    UWtype -- An unsigned type, default type for operations (typically a "word")
32    UHWtype -- An unsigned type, at least half the size of UWtype.
   UDWtype -- An unsigned type, at least twice as large as UWtype.
34    W_TYPE_SIZE -- size in bits of UWtype
35 
36    UQItype -- Unsigned 8 bit type.
37    SItype, USItype -- Signed and unsigned 32 bit types.
38    DItype, UDItype -- Signed and unsigned 64 bit types.
39 
40    On a 32 bit machine UWtype should typically be USItype;
41    on a 64 bit machine, UWtype should typically be UDItype.  */
42 
/* Fall back to a 32-bit word when the includer has not configured the
   word size.  The fallback names USItype and UDItype, so the includer
   must still provide those types.  */
#if !defined (W_TYPE_SIZE)
# define W_TYPE_SIZE	32
# define UWtype		USItype
# define UHWtype	USItype
# define UDWtype	UDItype
#endif

/* Helpers for working on half-words of a UWtype.  They are expanded
   lazily, so they pick up whatever W_TYPE_SIZE/UWtype are in effect at
   the point of use.
     __BITS4		a quarter of the word size, in bits
     __ll_B		the value 2^(W_TYPE_SIZE/2), as a UWtype
     __ll_lowpart(t)	the low half-word of T
     __ll_highpart(t)	the high half-word of T, shifted down  */
#define __BITS4			(W_TYPE_SIZE / 4)
#define __ll_B			((UWtype) 1 << (W_TYPE_SIZE / 2))
#define __ll_lowpart(t)		((UWtype) (t) & (__ll_B - 1))
#define __ll_highpart(t)	((UWtype) (t) >> (W_TYPE_SIZE / 2))
54 
55 /* Used in glibc only.  */
56 #ifndef attribute_hidden
57 #define attribute_hidden
58 #endif
59 
60 extern const UQItype __clz_tab[256] attribute_hidden;
61 
62 /* Define auxiliary asm macros.
63 
64    1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
65    UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
66    word product in HIGH_PROD and LOW_PROD.
67 
68    2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
69    UDWtype product.  This is just a variant of umul_ppmm.
70 
71    3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
72    denominator) divides a UDWtype, composed by the UWtype integers
73    HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
74    in QUOTIENT and the remainder in REMAINDER.  HIGH_NUMERATOR must be less
75    than DENOMINATOR for correct operation.  If, in addition, the most
76    significant bit of DENOMINATOR must be 1, then the pre-processor symbol
77    UDIV_NEEDS_NORMALIZATION is defined to 1.
78 
79    4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
80    denominator).  Like udiv_qrnnd but the numbers are signed.  The quotient
81    is rounded towards 0.
82 
83    5) count_leading_zeros(count, x) counts the number of zero-bits from the
84    msb to the first nonzero bit in the UWtype X.  This is the number of
85    steps X needs to be shifted left to set the msb.  Undefined for X == 0,
86    unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.
87 
88    6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
89    from the least significant end.
90 
   7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
   high_addend_2, low_addend_2) adds two two-word UWtype integers, composed
   by HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
   respectively.  The result is placed in HIGH_SUM and LOW_SUM.  Overflow
   (i.e. carry out) is not stored anywhere, and is lost.
96 
   8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
   high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
   composed by HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and
   LOW_SUBTRAHEND respectively.  The result is placed in HIGH_DIFFERENCE
   and LOW_DIFFERENCE.  Overflow (i.e. borrow out) is not stored anywhere,
   and is lost.
103 
104    If any of these macros are left undefined for a particular CPU,
105    C macros are used.  */
106 
107 /* The CPUs come in alphabetical order below.
108 
109    Please add support for more CPUs here, or improve the current support
110    for the CPUs below!
111    (E.g. WE32100, IBM360.)  */
112 
113 #if defined (__GNUC__) && !defined (NO_ASM)
114 
/* Condition-code clobbers for the asm statements below.  gcc1 does not
   understand a "cc" clobber, so both macros expand to nothing there; from
   gcc2 on they supply it.  __CLOBBER_CC opens the clobber section itself,
   __AND_CLOBBER_CC appends to a clobber list that already exists.  Going
   through cpp avoids duplicating every asm for the two compilers.  */
#if __GNUC__ >= 2
# define __CLOBBER_CC : "cc"
# define __AND_CLOBBER_CC , "cc"
#else /* __GNUC__ < 2 */
# define __CLOBBER_CC
# define __AND_CLOBBER_CC
#endif /* __GNUC__ >= 2 */
124 
#ifdef __aarch64__

/* AArch64: bit counting is left to the compiler builtins, picking the
   int or long long flavour according to the word size.
   COUNT_LEADING_ZEROS_0 is set to the word size in bits.  */
# if W_TYPE_SIZE == 32
#  define count_leading_zeros(COUNT, X)		((COUNT) = __builtin_clz (X))
#  define count_trailing_zeros(COUNT, X)	((COUNT) = __builtin_ctz (X))
#  define COUNT_LEADING_ZEROS_0			32
# elif W_TYPE_SIZE == 64
#  define count_leading_zeros(COUNT, X)		((COUNT) = __builtin_clzll (X))
#  define count_trailing_zeros(COUNT, X)	((COUNT) = __builtin_ctzll (X))
#  define COUNT_LEADING_ZEROS_0			64
# endif /* W_TYPE_SIZE */

#endif /* __aarch64__ */
140 
/* DEC Alpha with 64-bit words.  No inline asm is used in this section;
   everything goes through compiler builtins or plain C.  */
#if defined (__alpha) && W_TYPE_SIZE == 64
/* 64x64->128 multiply.  The high word comes from the umulh builtin, the
   low word from an ordinary truncating multiplication.  M0 and M1 are
   copied into locals first, so each is evaluated exactly once.  */
#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    UDItype __m0 = (m0), __m1 = (m1);					\
    (ph) = __builtin_alpha_umulh (__m0, __m1);				\
    (pl) = __m0 * __m1;							\
  } while (0)
#define UMUL_TIME 46
#ifndef LONGLONG_STANDALONE
/* Two-word by one-word division, delegated to the out-of-line helper
   __udiv_qrnnd: it returns the quotient and stores the remainder through
   its first argument.  Not provided when LONGLONG_STANDALONE is defined,
   since the helper lives outside this header.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UDItype __r;							\
    (q) = __udiv_qrnnd (&__r, (n1), (n0), (d));				\
    (r) = __r;								\
  } while (0)
extern UDItype __udiv_qrnnd (UDItype *, UDItype, UDItype, UDItype);
#define UDIV_TIME 220
#endif /* LONGLONG_STANDALONE */
#ifdef __alpha_cix__
/* With __alpha_cix__ defined, use the compiler's bit-count builtins.  */
#define count_leading_zeros(COUNT,X)	((COUNT) = __builtin_clzl (X))
#define count_trailing_zeros(COUNT,X)	((COUNT) = __builtin_ctzl (X))
#define COUNT_LEADING_ZEROS_0 64
#else
/* Table-driven fallback.  The cmpbge builtin result is turned into the
   index __a of the byte to inspect via __clz_tab, extbl extracts that
   byte, and a second __clz_tab lookup finishes the count inside the byte.
   NOTE(review): presumably cmpbge (0, x) yields a per-byte mask of the
   zero bytes of x -- confirm against the Alpha builtin documentation.  */
#define count_leading_zeros(COUNT,X) \
  do {									\
    UDItype __xr = (X), __t, __a;					\
    __t = __builtin_alpha_cmpbge (0, __xr);				\
    __a = __clz_tab[__t ^ 0xff] - 1;					\
    __t = __builtin_alpha_extbl (__xr, __a);				\
    (COUNT) = 64 - (__clz_tab[__t] + __a*8);				\
  } while (0)
/* Same idea from the least significant end.  `~__t & -~__t' and
   `__t &= -__t' each isolate the lowest set bit of an 8-bit value; the
   0xCC/0xF0/0xAA masks then convert that one-hot value into its bit
   index.  The first index selects the byte (hence the `<<= 3'), the
   second the bit within that byte.  */
#define count_trailing_zeros(COUNT,X) \
  do {									\
    UDItype __xr = (X), __t, __a;					\
    __t = __builtin_alpha_cmpbge (0, __xr);				\
    __t = ~__t & -~__t;							\
    __a = ((__t & 0xCC) != 0) * 2;					\
    __a += ((__t & 0xF0) != 0) * 4;					\
    __a += ((__t & 0xAA) != 0);						\
    __t = __builtin_alpha_extbl (__xr, __a);				\
    __a <<= 3;								\
    __t &= -__t;							\
    __a += ((__t & 0xCC) != 0) * 2;					\
    __a += ((__t & 0xF0) != 0) * 4;					\
    __a += ((__t & 0xAA) != 0);						\
    (COUNT) = __a;							\
  } while (0)
#endif /* __alpha_cix__ */
#endif /* __alpha */
189 
190 #if defined (__arc__) && W_TYPE_SIZE == 32
/* Two-word addition.  The low words (operands %4, %5) are added into %1
   with the flag-setting add.f, then adc forms the high word %0 from
   operands %2 and %3.  SL is earlyclobbered because it is written before
   the high-word inputs are read.  The flag-setting instruction modifies
   the condition codes, so "cc" is declared as clobbered, as the norm.f
   asm in this section already does.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add.f	%1, %4, %5\n\tadc	%0, %2, %3"		\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "%r" ((USItype) (ah)),					\
	     "rIJ" ((USItype) (bh)),					\
	     "%r" ((USItype) (al)),					\
	     "rIJ" ((USItype) (bl))					\
	   : "cc")
/* Two-word subtraction, same operand layout as add_ssaaaa: sub.f forms
   the low word and sets the flags, sbc forms the high word.  The minuend
   operands are not marked commutative ("%"), unlike the addends above.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub.f	%1, %4, %5\n\tsbc	%0, %2, %3"		\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "r" ((USItype) (ah)),					\
	     "rIJ" ((USItype) (bh)),					\
	     "r" ((USItype) (al)),					\
	     "rIJ" ((USItype) (bl))					\
	   : "cc")
207 
/* Widening multiply in plain C: each argument is converted to USItype
   and the product is computed in UDItype.  Both arguments are
   parenthesized so that expression arguments such as `a + b' are
   converted and multiplied as a whole.  */
#define __umulsidi3(u,v) ((UDItype)(USItype) (u) * (USItype) (v))
#ifdef __ARC_NORM__
/* count_leading_zeros via the norm instruction, only when __ARC_NORM__
   is defined.  The asm leaves the result of norm.f in c_, except that
   the conditional mov.mi replaces it with -1; COUNT is then c_ + 1.
   The flags are written, hence the "cc" clobber.
   NOTE(review): presumably mov.mi fires when the msb of X is set, so
   such inputs yield a count of 0 -- confirm against the ARC ISA.  */
#define count_leading_zeros(count, x) \
  do									\
    {									\
      SItype c_;							\
									\
      __asm__ ("norm.f\t%0,%1\n\tmov.mi\t%0,-1" : "=r" (c_) : "r" (x) : "cc");\
      (count) = c_ + 1;							\
    }									\
  while (0)
#define COUNT_LEADING_ZEROS_0 32
#endif
221 #endif
222 
/* 32-bit ARM, excluding Thumb-1 (either Thumb-2 or not Thumb at all).  */
#if defined (__arm__) && (defined (__thumb2__) || !defined (__thumb__)) \
 && W_TYPE_SIZE == 32
/* Two-word addition: `adds' forms the low word %1 from operands %4 and
   %5, `adc' then forms the high word %0 from operands %2 and %3.  SL is
   earlyclobbered because it is written before the high inputs are read.
   The condition codes are clobbered (through __CLOBBER_CC).  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("adds	%1, %4, %5\n\tadc	%0, %2, %3"		\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "%r" ((USItype) (ah)),					\
	     "rI" ((USItype) (bh)),					\
	     "%r" ((USItype) (al)),					\
	     "rI" ((USItype) (bl)) __CLOBBER_CC)
/* Two-word subtraction with the same operand layout: `subs' for the low
   word, `sbc' for the high word.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subs	%1, %4, %5\n\tsbc	%0, %2, %3"		\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "r" ((USItype) (ah)),					\
	     "rI" ((USItype) (bh)),					\
	     "r" ((USItype) (al)),					\
	     "rI" ((USItype) (bl)) __CLOBBER_CC)
# if defined(__ARM_ARCH_2__) || defined(__ARM_ARCH_2A__) \
     || defined(__ARM_ARCH_3__)
/* For ARM architectures 2, 2A and 3 the 32x32->64 product is assembled
   by hand: both operands are split into 16-bit halves (mov ... lsr #16
   and bic), the four partial products are formed with `mul', and the two
   cross products are folded into the high and low result words with
   carry propagation.  __t0..__t2 are scratch registers.  */
#  define umul_ppmm(xh, xl, a, b)					\
  do {									\
    register USItype __t0, __t1, __t2;					\
    __asm__ ("%@ Inlined umul_ppmm\n"					\
	   "	mov	%2, %5, lsr #16\n"				\
	   "	mov	%0, %6, lsr #16\n"				\
	   "	bic	%3, %5, %2, lsl #16\n"				\
	   "	bic	%4, %6, %0, lsl #16\n"				\
	   "	mul	%1, %3, %4\n"					\
	   "	mul	%4, %2, %4\n"					\
	   "	mul	%3, %0, %3\n"					\
	   "	mul	%0, %2, %0\n"					\
	   "	adds	%3, %4, %3\n"					\
	   "	addcs	%0, %0, #65536\n"				\
	   "	adds	%1, %1, %3, lsl #16\n"				\
	   "	adc	%0, %0, %3, lsr #16"				\
	   : "=&r" ((USItype) (xh)),					\
	     "=r" ((USItype) (xl)),					\
	     "=&r" (__t0), "=&r" (__t1), "=r" (__t2)			\
	   : "r" ((USItype) (a)),					\
	     "r" ((USItype) (b)) __CLOBBER_CC );			\
  } while (0)
#  define UMUL_TIME 20
# else
/* Everywhere else the product is written as a C widening multiply and
   split into its two halves; instruction selection is left to the
   compiler.  */
#  define umul_ppmm(xh, xl, a, b)					\
  do {									\
    /* Generate umull, under compiler control.  */			\
    register UDItype __t0 = (UDItype)(USItype)(a) * (USItype)(b);	\
    (xl) = (USItype)__t0;						\
    (xh) = (USItype)(__t0 >> 32);					\
  } while (0)
#  define UMUL_TIME 3
# endif
# define UDIV_TIME 100
#endif /* __arm__ */
278 
#ifdef __arm__
/* On every ARM configuration (this test does not look at Thumb mode or
   W_TYPE_SIZE) bit counting is handed to the compiler builtins, letting
   gcc decide how best to implement it.  */
# define count_leading_zeros(COUNT,X)	((COUNT) = __builtin_clz (X))
# define count_trailing_zeros(COUNT,X)	((COUNT) = __builtin_ctz (X))
# define COUNT_LEADING_ZEROS_0		32
#endif /* __arm__ */
285 
#ifdef __AVR__

/* AVR: bit counting goes through the compiler builtins.  The builtin
   flavour follows the configured word size: the int variants for 16
   bits, the long variants for 32 bits and the long long variants for
   64 bits.  COUNT_LEADING_ZEROS_0 equals the word size in each case.  */
# if W_TYPE_SIZE == 16
#  define count_leading_zeros(COUNT,X)	((COUNT) = __builtin_clz (X))
#  define count_trailing_zeros(COUNT,X)	((COUNT) = __builtin_ctz (X))
#  define COUNT_LEADING_ZEROS_0		16
# elif W_TYPE_SIZE == 32
#  define count_leading_zeros(COUNT,X)	((COUNT) = __builtin_clzl (X))
#  define count_trailing_zeros(COUNT,X)	((COUNT) = __builtin_ctzl (X))
#  define COUNT_LEADING_ZEROS_0		32
# elif W_TYPE_SIZE == 64
#  define count_leading_zeros(COUNT,X)	((COUNT) = __builtin_clzll (X))
#  define count_trailing_zeros(COUNT,X)	((COUNT) = __builtin_ctzll (X))
#  define COUNT_LEADING_ZEROS_0		64
# endif /* W_TYPE_SIZE */

#endif /* defined (__AVR__) */
307 
#ifdef __CRIS__

/* Bit counting is taken from the compiler builtins, gated on
   __CRIS_arch_version: leading zeros from version 3 on, trailing zeros
   from version 8 on.  */
# if __CRIS_arch_version >= 3
#  define count_leading_zeros(COUNT, X)	((COUNT) = __builtin_clz (X))
#  define COUNT_LEADING_ZEROS_0		32
# endif /* __CRIS_arch_version >= 3 */

# if __CRIS_arch_version >= 8
#  define count_trailing_zeros(COUNT, X)	((COUNT) = __builtin_ctz (X))
# endif /* __CRIS_arch_version >= 8 */

/* Widening 32x32->64 multiply.  Below version 10 it is an out-of-line
   function (the self-referential #define merely records that the name
   exists); from version 10 on it is an inline C multiplication.  */
# if __CRIS_arch_version < 10
#  define __umulsidi3 __umulsidi3
extern UDItype __umulsidi3 (USItype, USItype);
# else
#  define __umulsidi3(u,v) ((UDItype)(USItype) (u) * (UDItype)(USItype) (v))
# endif /* __CRIS_arch_version < 10 */

/* Two-word product: W1 receives the high 32 bits and W0 the low 32 bits
   of __umulsidi3 (U, V).  */
# define umul_ppmm(w1, w0, u, v)			\
  do {							\
    UDItype __prod = __umulsidi3 (u, v);		\
    (w0) = (USItype) (__prod);				\
    (w1) = (USItype) (__prod >> 32);			\
  } while (0)

/* FIXME: defining add_ssaaaa and sub_ddmmss should be advantageous for
   DFmode ("double" intrinsics, avoiding two of the three insns handling
   carry), but defining them as open-code C composing and doing the
   operation in DImode (UDImode) shows that the DImode needs work:
   register pressure from requiring neighboring registers and the
   traffic to and from them come to dominate, in the 4.7 series.  */

#endif /* defined (__CRIS__) */
341 
/* HP PA-RISC with 32-bit words.  Note the operand order in the asm
   templates: the destination is written last.  */
#if defined (__hppa) && W_TYPE_SIZE == 32
/* Two-word addition: `add' forms the low word %1 from operands %4 and
   %5, `addc' forms the high word %0 from operands %2 and %3.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add %4,%5,%1\n\taddc %2,%3,%0"				\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "%rM" ((USItype) (ah)),					\
	     "rM" ((USItype) (bh)),					\
	     "%rM" ((USItype) (al)),					\
	     "rM" ((USItype) (bl)))
/* Two-word subtraction with the same operand layout: `sub' for the low
   word, `subb' for the high word.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub %4,%5,%1\n\tsubb %2,%3,%0"				\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "rM" ((USItype) (ah)),					\
	     "rM" ((USItype) (bh)),					\
	     "rM" ((USItype) (al)),					\
	     "rM" ((USItype) (bl)))
#if defined (_PA_RISC1_1)
/* With _PA_RISC1_1 defined, xmpyu produces the full product as one
   UDItype in an "x"-constrained register; the union then splits it, with
   __w1 (the high word) as the first struct member.  Without
   _PA_RISC1_1 no umul_ppmm is defined here and only UMUL_TIME is set.  */
#define umul_ppmm(w1, w0, u, v) \
  do {									\
    union								\
      {									\
	UDItype __f;							\
	struct {USItype __w1, __w0;} __w1w0;				\
      } __t;								\
    __asm__ ("xmpyu %1,%2,%0"						\
	     : "=x" (__t.__f)						\
	     : "x" ((USItype) (u)),					\
	       "x" ((USItype) (v)));					\
    (w1) = __t.__w1w0.__w1;						\
    (w0) = __t.__w1w0.__w0;						\
     } while (0)
#define UMUL_TIME 8
#else
#define UMUL_TIME 30
#endif
#define UDIV_TIME 40
/* Leading-zero count by binary search.  The counter %0 starts at 1; each
   step tests the upper part of what is left of X (16, 8, 4, then 2 bits)
   and either shifts X down or adds the width of the tested part to the
   counter.  The final extracted bit is subtracted.  __tmp is a scratch
   copy of X (tied to operand 1), so X itself is not modified.  */
#define count_leading_zeros(count, x) \
  do {									\
    USItype __tmp;							\
    __asm__ (								\
       "ldi		1,%0\n"						\
"	extru,=		%1,15,16,%%r0		; Bits 31..16 zero?\n"	\
"	extru,tr	%1,15,16,%1		; No.  Shift down, skip add.\n"\
"	ldo		16(%0),%0		; Yes.  Perform add.\n"	\
"	extru,=		%1,23,8,%%r0		; Bits 15..8 zero?\n"	\
"	extru,tr	%1,23,8,%1		; No.  Shift down, skip add.\n"\
"	ldo		8(%0),%0		; Yes.  Perform add.\n"	\
"	extru,=		%1,27,4,%%r0		; Bits 7..4 zero?\n"	\
"	extru,tr	%1,27,4,%1		; No.  Shift down, skip add.\n"\
"	ldo		4(%0),%0		; Yes.  Perform add.\n"	\
"	extru,=		%1,29,2,%%r0		; Bits 3..2 zero?\n"	\
"	extru,tr	%1,29,2,%1		; No.  Shift down, skip add.\n"\
"	ldo		2(%0),%0		; Yes.  Perform add.\n"	\
"	extru		%1,30,1,%1		; Extract bit 1.\n"	\
"	sub		%0,%1,%0		; Subtract it.\n"	\
	: "=r" (count), "=r" (__tmp) : "1" (x));			\
  } while (0)
#endif
401 
/* IBM 370 / S/390 with 32-bit words.  Only the signed primitives
   smul_ppmm and sdiv_qrnnd are defined in this section.  */
#if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32
#if !defined (__zarch__)
/* Signed 32x32->64 multiply.  The DItype member of the union is the asm
   operand: M0 is loaded into its %N0 half with `lr', `mr' multiplies by
   M1, and the result is read back as __h (high word) and __l (low
   word).  */
#define smul_ppmm(xh, xl, m0, m1) \
  do {									\
    union {DItype __ll;							\
	   struct {USItype __h, __l;} __i;				\
	  } __x;							\
    __asm__ ("lr %N0,%1\n\tmr %0,%2"					\
	     : "=&r" (__x.__ll)						\
	     : "r" (m0), "r" (m1));					\
    (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
  } while (0)
/* Signed 64/32 division.  The dividend N1:N0 is assembled in the union
   and passed as a single DItype operand; after `dr' the quotient is read
   from the __l member and the remainder from the __h member.  */
#define sdiv_qrnnd(q, r, n1, n0, d) \
  do {									\
    union {DItype __ll;							\
	   struct {USItype __h, __l;} __i;				\
	  } __x;							\
    __x.__i.__h = n1; __x.__i.__l = n0;					\
    __asm__ ("dr %0,%2"							\
	     : "=r" (__x.__ll)						\
	     : "0" (__x.__ll), "r" (d));				\
    (q) = __x.__i.__l; (r) = __x.__i.__h;				\
  } while (0)
#else
/* With __zarch__ defined the operands are pinned to registers 0 and 1
   through explicit register variables instead of going through a DItype
   union.  NOTE(review): presumably because a DItype value does not
   occupy a register pair in that mode -- confirm.  The high word of the
   product ends up in register 0 and the low word in register 1.  */
#define smul_ppmm(xh, xl, m0, m1) \
  do {                                                                  \
    register SItype __r0 __asm__ ("0");					\
    register SItype __r1 __asm__ ("1") = (m0);				\
									\
    __asm__ ("mr\t%%r0,%3"                                              \
	     : "=r" (__r0), "=r" (__r1)					\
	     : "r"  (__r1),  "r" (m1));					\
    (xh) = __r0; (xl) = __r1;						\
  } while (0)

/* Signed division with the dividend in registers 0 (N1) and 1 (N0);
   after `dr' the quotient is taken from register 1 and the remainder
   from register 0.  */
#define sdiv_qrnnd(q, r, n1, n0, d) \
  do {									\
    register SItype __r0 __asm__ ("0") = (n1);				\
    register SItype __r1 __asm__ ("1") = (n0);				\
									\
    __asm__ ("dr\t%%r0,%4"                                              \
	     : "=r" (__r0), "=r" (__r1)					\
	     : "r" (__r0), "r" (__r1), "r" (d));			\
    (q) = __r1; (r) = __r0;						\
  } while (0)
#endif /* __zarch__ */
#endif
449 
/* 32-bit x86.  The asm templates use the {att|intel} alternative syntax,
   giving the operand order for both assembler dialects.  */
#if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
/* Two-word addition.  AH and AL are tied to the output registers
   (constraints "%0" and "%1"); `add' adds BL into the low word and `adc'
   adds BH into the high word.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add{l} {%5,%1|%1,%5}\n\tadc{l} {%3,%0|%0,%3}"		\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "%0" ((USItype) (ah)),					\
	     "g" ((USItype) (bh)),					\
	     "%1" ((USItype) (al)),					\
	     "g" ((USItype) (bl)))
/* Two-word subtraction with the same layout: `sub' for the low word,
   `sbb' for the high word.  The tied operands are not commutative.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub{l} {%5,%1|%1,%5}\n\tsbb{l} {%3,%0|%0,%3}"		\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "0" ((USItype) (ah)),					\
	     "g" ((USItype) (bh)),					\
	     "1" ((USItype) (al)),					\
	     "g" ((USItype) (bl)))
/* 32x32->64 multiply with `mul'.  U is tied to the "a" register, which
   also receives the low word W0; the high word W1 comes back in the "d"
   register.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mul{l} %3"							\
	   : "=a" ((USItype) (w0)),					\
	     "=d" ((USItype) (w1))					\
	   : "%0" ((USItype) (u)),					\
	     "rm" ((USItype) (v)))
/* 64/32 division with `div'.  The dividend is passed as N1 in the "d"
   register and N0 in the "a" register; the quotient comes back in "a"
   and the remainder in "d".  See the contract at the top of the file:
   N1 must be less than DV.  */
#define udiv_qrnnd(q, r, n1, n0, dv) \
  __asm__ ("div{l} %4"							\
	   : "=a" ((USItype) (q)),					\
	     "=d" ((USItype) (r))					\
	   : "0" ((USItype) (n0)),					\
	     "1" ((USItype) (n1)),					\
	     "rm" ((USItype) (dv)))
/* Bit counting goes through the compiler builtins.
   COUNT_LEADING_ZEROS_0 is not defined in this section.  */
#define count_leading_zeros(count, x)	((count) = __builtin_clz (x))
#define count_trailing_zeros(count, x)	((count) = __builtin_ctz (x))
#define UMUL_TIME 40
#define UDIV_TIME 40
#endif /* 80x86 */
485 
/* x86-64 with 64-bit words.  Same structure as the 32-bit x86 section,
   using the {q} instruction suffix and UDItype operands.  */
#if defined (__x86_64__) && W_TYPE_SIZE == 64
/* Two-word addition.  AH and AL are tied to the output registers; `add'
   adds BL into the low word and `adc' adds BH into the high word.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add{q} {%5,%1|%1,%5}\n\tadc{q} {%3,%0|%0,%3}"		\
	   : "=r" ((UDItype) (sh)),					\
	     "=&r" ((UDItype) (sl))					\
	   : "%0" ((UDItype) (ah)),					\
	     "rme" ((UDItype) (bh)),					\
	     "%1" ((UDItype) (al)),					\
	     "rme" ((UDItype) (bl)))
/* Two-word subtraction with the same layout: `sub' for the low word,
   `sbb' for the high word.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub{q} {%5,%1|%1,%5}\n\tsbb{q} {%3,%0|%0,%3}"		\
	   : "=r" ((UDItype) (sh)),					\
	     "=&r" ((UDItype) (sl))					\
	   : "0" ((UDItype) (ah)),					\
	     "rme" ((UDItype) (bh)),					\
	     "1" ((UDItype) (al)),					\
	     "rme" ((UDItype) (bl)))
/* 64x64->128 multiply with `mul'.  U is tied to the "a" register, which
   also receives the low word W0; the high word W1 comes back in the "d"
   register.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mul{q} %3"							\
	   : "=a" ((UDItype) (w0)),					\
	     "=d" ((UDItype) (w1))					\
	   : "%0" ((UDItype) (u)),					\
	     "rm" ((UDItype) (v)))
/* 128/64 division with `div'.  The dividend is passed as N1 in the "d"
   register and N0 in the "a" register; the quotient comes back in "a"
   and the remainder in "d".  See the contract at the top of the file:
   N1 must be less than DV.  */
#define udiv_qrnnd(q, r, n1, n0, dv) \
  __asm__ ("div{q} %4"							\
	   : "=a" ((UDItype) (q)),					\
	     "=d" ((UDItype) (r))					\
	   : "0" ((UDItype) (n0)),					\
	     "1" ((UDItype) (n1)),					\
	     "rm" ((UDItype) (dv)))
/* Bit counting goes through the long long compiler builtins.
   COUNT_LEADING_ZEROS_0 is not defined in this section.  */
#define count_leading_zeros(count, x)	((count) = __builtin_clzll (x))
#define count_trailing_zeros(count, x)	((count) = __builtin_ctzll (x))
#define UMUL_TIME 40
#define UDIV_TIME 40
#endif /* x86_64 */
521 
/* Intel i960 with 32-bit words.  Both macros are GNU statement
   expressions built around the `emul' instruction.  */
#if defined (__i960__) && W_TYPE_SIZE == 32
/* 32x32->64 multiply.  `emul' writes the whole product into the UDItype
   member of the union; the halves are then read back, with __l (the low
   word) as the first struct member.  */
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll;						\
	   struct {USItype __l, __h;} __i;				\
	  } __xx;							\
  __asm__ ("emul	%2,%1,%0"					\
	   : "=d" (__xx.__ll)						\
	   : "%dI" ((USItype) (u)),					\
	     "dI" ((USItype) (v)));					\
  (w1) = __xx.__i.__h; (w0) = __xx.__i.__l;})
/* Same multiply, but the product is the value of the statement
   expression, as one UDItype.  */
#define __umulsidi3(u, v) \
  ({UDItype __w;							\
    __asm__ ("emul	%2,%1,%0"					\
	     : "=d" (__w)						\
	     : "%dI" ((USItype) (u)),					\
	       "dI" ((USItype) (v)));					\
    __w; })
#endif /* __i960__ */
540 
#if defined (__ia64) && W_TYPE_SIZE == 64
/* This form encourages gcc (pre-release 3.4 at least) to emit predicated
   "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency.  The generic
   code using "al<bl" arithmetically comes out making an actual 0 or 1 in a
   register, which takes an extra cycle.

   The low difference is kept in the temporary __x and only stored into
   SL at the end, after AL and BL have been compared.  Note that AL and
   BL are each evaluated twice.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl)				\
  do {									\
    UWtype __x;								\
    __x = (al) - (bl);							\
    if ((al) < (bl))							\
      (sh) = (ah) - (bh) - 1;						\
    else								\
      (sh) = (ah) - (bh);						\
    (sl) = __x;								\
  } while (0)

/* Do both product parts in assembly, since that gives better code with
   all gcc versions.  Some callers will just use the upper part, and in
   that situation we waste an instruction, but not any cycles.

   All four operands use the "f" constraint; xma.hu produces the high
   word PH and xma.l the low word PL.  */
#define umul_ppmm(ph, pl, m0, m1)					\
  __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0"		\
	   : "=&f" (ph), "=f" (pl)					\
	   : "f" (m0), "f" (m1))
/* Leading-zero count in two stages.  The asm part (mux1 @rev followed
   by czx1.l on `-_y | _y') yields _a, from which the bit offset _c of
   the byte to inspect is derived; X is shifted down by _c and a short
   binary search (4, 2, 1 bits) finishes within that byte.  The result
   is W_TYPE_SIZE - 1 minus the index of the most significant set bit.
   NOTE(review): the exact semantics of mux1 @rev and czx1.l are taken
   on trust here -- confirm against the Itanium ISA manual.  */
#define count_leading_zeros(count, x)					\
  do {									\
    UWtype _x = (x), _y, _a, _c;					\
    __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x));		\
    __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y));		\
    _c = (_a - 1) << 3;							\
    _x >>= _c;								\
    if (_x >= 1 << 4)							\
      _x >>= 4, _c += 4;						\
    if (_x >= 1 << 2)							\
      _x >>= 2, _c += 2;						\
    _c += _x >> 1;							\
    (count) =  W_TYPE_SIZE - 1 - _c;					\
  } while (0)
/* similar to what gcc does for __builtin_ffs, but 0 based rather than 1
   based, and we don't need a special case for x==0 here.
   `(x - 1) & ~x' has exactly the bits below the lowest set bit of X
   set, so its population count is the number of trailing zeros.  */
#define count_trailing_zeros(count, x)					\
  do {									\
    UWtype __ctz_x = (x);						\
    __asm__ ("popcnt %0 = %1"						\
	     : "=r" (count)						\
	     : "r" ((__ctz_x-1) & ~__ctz_x));				\
  } while (0)
#define UMUL_TIME 14
#endif
589 
/* Renesas M32R with 32-bit words.  Both macros tie AH and AL to the
   output registers, clear the condition bit with a self-compare, and
   then apply the extended add/subtract to the low word followed by the
   high word.  The condition bit is declared clobbered ("cbit").  */
#if defined (__M32R__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  /* The cmp clears the condition bit.  */ \
  __asm__ ("cmp %0,%0\n\taddx %1,%5\n\taddx %0,%3"			\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "0" ((USItype) (ah)),					\
	     "r" ((USItype) (bh)),					\
	     "1" ((USItype) (al)),					\
	     "r" ((USItype) (bl))					\
	   : "cbit")
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  /* The cmp clears the condition bit.  */ \
  __asm__ ("cmp %0,%0\n\tsubx %1,%5\n\tsubx %0,%3"			\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "0" ((USItype) (ah)),					\
	     "r" ((USItype) (bh)),					\
	     "1" ((USItype) (al)),					\
	     "r" ((USItype) (bl))					\
	   : "cbit")
#endif /* __M32R__ */
612 
/* Motorola 68000 family (including ColdFire) with 32-bit words.  */
#if defined (__mc68000__) && W_TYPE_SIZE == 32
/* Two-word addition.  AH and AL are tied to the output data registers;
   `add' adds BL into the low word and `addx' adds BH into the high
   word.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0"				\
	   : "=d" ((USItype) (sh)),					\
	     "=&d" ((USItype) (sl))					\
	   : "%0" ((USItype) (ah)),					\
	     "d" ((USItype) (bh)),					\
	     "%1" ((USItype) (al)),					\
	     "g" ((USItype) (bl)))
/* Two-word subtraction with the same layout: `sub' for the low word,
   `subx' for the high word.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0"				\
	   : "=d" ((USItype) (sh)),					\
	     "=&d" ((USItype) (sl))					\
	   : "0" ((USItype) (ah)),					\
	     "d" ((USItype) (bh)),					\
	     "1" ((USItype) (al)),					\
	     "g" ((USItype) (bl)))

/* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r.
   The condition below explicitly excludes the '060 from this branch.  */
#if (defined (__mc68020__) && !defined (__mc68060__))
/* 32x32->64 multiply: U is tied to the register that receives the low
   word W0; the high word W1 is the other half of the %1:%0 pair.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mulu%.l %3,%1:%0"						\
	   : "=d" ((USItype) (w0)),					\
	     "=d" ((USItype) (w1))					\
	   : "%0" ((USItype) (u)),					\
	     "dmi" ((USItype) (v)))
#define UMUL_TIME 45
/* 64/32 unsigned division: N1:N0 is passed in the %1:%0 pair, which
   comes back holding remainder:quotient.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("divu%.l %4,%1:%0"						\
	   : "=d" ((USItype) (q)),					\
	     "=d" ((USItype) (r))					\
	   : "0" ((USItype) (n0)),					\
	     "1" ((USItype) (n1)),					\
	     "dmi" ((USItype) (d)))
#define UDIV_TIME 90
/* Signed variant of the above, using `divs'.  */
#define sdiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("divs%.l %4,%1:%0"						\
	   : "=d" ((USItype) (q)),					\
	     "=d" ((USItype) (r))					\
	   : "0" ((USItype) (n0)),					\
	     "1" ((USItype) (n1)),					\
	     "dmi" ((USItype) (d)))

#elif defined (__mcoldfire__) /* not mc68020 */

/* ColdFire: the 32x32->64 product is built from four 16x16 `mulu'
   partial products, working in the fixed scratch registers d0-d4 (all
   declared clobbered).  This sequence differs from the plain-68000 one
   below only in how the low half of d0 is cleared (`clr' instead of
   `eor').  %/ inserts REGISTER_PREFIX, %# inserts IMMEDIATE_PREFIX.  */
#define umul_ppmm(xh, xl, a, b) \
  __asm__ ("| Inlined umul_ppmm\n"					\
	   "	move%.l	%2,%/d0\n"					\
	   "	move%.l	%3,%/d1\n"					\
	   "	move%.l	%/d0,%/d2\n"					\
	   "	swap	%/d0\n"						\
	   "	move%.l	%/d1,%/d3\n"					\
	   "	swap	%/d1\n"						\
	   "	move%.w	%/d2,%/d4\n"					\
	   "	mulu	%/d3,%/d4\n"					\
	   "	mulu	%/d1,%/d2\n"					\
	   "	mulu	%/d0,%/d3\n"					\
	   "	mulu	%/d0,%/d1\n"					\
	   "	move%.l	%/d4,%/d0\n"					\
	   "	clr%.w	%/d0\n"						\
	   "	swap	%/d0\n"						\
	   "	add%.l	%/d0,%/d2\n"					\
	   "	add%.l	%/d3,%/d2\n"					\
	   "	jcc	1f\n"						\
	   "	add%.l	%#65536,%/d1\n"					\
	   "1:	swap	%/d2\n"						\
	   "	moveq	%#0,%/d0\n"					\
	   "	move%.w	%/d2,%/d0\n"					\
	   "	move%.w	%/d4,%/d2\n"					\
	   "	move%.l	%/d2,%1\n"					\
	   "	add%.l	%/d1,%/d0\n"					\
	   "	move%.l	%/d0,%0"					\
	   : "=g" ((USItype) (xh)),					\
	     "=g" ((USItype) (xl))					\
	   : "g" ((USItype) (a)),					\
	     "g" ((USItype) (b))					\
	   : "d0", "d1", "d2", "d3", "d4")
#define UMUL_TIME 100
#define UDIV_TIME 400
#else /* not ColdFire */
/* %/ inserts REGISTER_PREFIX, %# inserts IMMEDIATE_PREFIX.  */
/* Remaining 68000 variants: the same four-partial-product scheme in the
   fixed scratch registers d0-d4 (all declared clobbered).  */
#define umul_ppmm(xh, xl, a, b) \
  __asm__ ("| Inlined umul_ppmm\n"					\
	   "	move%.l	%2,%/d0\n"					\
	   "	move%.l	%3,%/d1\n"					\
	   "	move%.l	%/d0,%/d2\n"					\
	   "	swap	%/d0\n"						\
	   "	move%.l	%/d1,%/d3\n"					\
	   "	swap	%/d1\n"						\
	   "	move%.w	%/d2,%/d4\n"					\
	   "	mulu	%/d3,%/d4\n"					\
	   "	mulu	%/d1,%/d2\n"					\
	   "	mulu	%/d0,%/d3\n"					\
	   "	mulu	%/d0,%/d1\n"					\
	   "	move%.l	%/d4,%/d0\n"					\
	   "	eor%.w	%/d0,%/d0\n"					\
	   "	swap	%/d0\n"						\
	   "	add%.l	%/d0,%/d2\n"					\
	   "	add%.l	%/d3,%/d2\n"					\
	   "	jcc	1f\n"						\
	   "	add%.l	%#65536,%/d1\n"					\
	   "1:	swap	%/d2\n"						\
	   "	moveq	%#0,%/d0\n"					\
	   "	move%.w	%/d2,%/d0\n"					\
	   "	move%.w	%/d4,%/d2\n"					\
	   "	move%.l	%/d2,%1\n"					\
	   "	add%.l	%/d1,%/d0\n"					\
	   "	move%.l	%/d0,%0"					\
	   : "=g" ((USItype) (xh)),					\
	     "=g" ((USItype) (xl))					\
	   : "g" ((USItype) (a)),					\
	     "g" ((USItype) (b))					\
	   : "d0", "d1", "d2", "d3", "d4")
#define UMUL_TIME 100
#define UDIV_TIME 400

#endif /* not mc68020 */

/* The '020, '030, '040 and '060 have bitfield insns.
   cpu32 disguises as a 68020, but lacks them.  */
#if defined (__mc68020__) && !defined (__mcpu32__)
/* Leading-zero count with the bitfield instruction bfffo; the constant
   operand 0 is used for both fields of the {offset:width} specifier.
   COUNT_LEADING_ZEROS_0 is not defined for this variant.  */
#define count_leading_zeros(count, x) \
  __asm__ ("bfffo %1{%b2:%b2},%0"					\
	   : "=d" ((USItype) (count))					\
	   : "od" ((USItype) (x)), "n" (0))
/* Some ColdFire architectures have a ff1 instruction supported via
   __builtin_clz. */
#elif defined (__mcfisaaplus__) || defined (__mcfisac__)
#define count_leading_zeros(count,x) ((count) = __builtin_clz (x))
#define COUNT_LEADING_ZEROS_0 32
#endif
#endif /* mc68000 */
745 
/* Motorola 88000 with 32-bit words.  */
#if defined (__m88000__) && W_TYPE_SIZE == 32
/* Two-word addition: addu.co forms the low word %1 from operands %4 and
   %5, addu.ci forms the high word %0 from operands %2 and %3.
   NOTE(review): .co/.ci presumably mean carry-out/carry-in -- confirm
   against the 88000 ISA.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3"			\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "%rJ" ((USItype) (ah)),					\
	     "rJ" ((USItype) (bh)),					\
	     "%rJ" ((USItype) (al)),					\
	     "rJ" ((USItype) (bl)))
/* Two-word subtraction with the same operand layout, using subu.co and
   subu.ci.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3"			\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "rJ" ((USItype) (ah)),					\
	     "rJ" ((USItype) (bh)),					\
	     "rJ" ((USItype) (al)),					\
	     "rJ" ((USItype) (bl)))
/* Leading-zero count: the result of `ff1' is XORed with 31 to obtain
   the count.  The value 63 below for a zero input is deliberate (note
   the "sic").  NOTE(review): presumably ff1 returns 32 for a zero
   input, and 32 ^ 31 == 63 -- confirm.  */
#define count_leading_zeros(count, x) \
  do {									\
    USItype __cbtmp;							\
    __asm__ ("ff1 %0,%1"						\
	     : "=r" (__cbtmp)						\
	     : "r" ((USItype) (x)));					\
    (count) = __cbtmp ^ 31;						\
  } while (0)
#define COUNT_LEADING_ZEROS_0 63 /* sic */
#if defined (__mc88110__)
/* 88110 only: mulu.d writes the full product into the UDItype member of
   the union; the halves are read back with __h (the high word) as the
   first struct member.  */
#define umul_ppmm(wh, wl, u, v) \
  do {									\
    union {UDItype __ll;						\
	   struct {USItype __h, __l;} __i;				\
	  } __xx;							\
    __asm__ ("mulu.d	%0,%1,%2"					\
	     : "=r" (__xx.__ll)						\
	     : "r" ((USItype) (u)),					\
	       "r" ((USItype) (v)));					\
    (wh) = __xx.__i.__h;						\
    (wl) = __xx.__i.__l;						\
  } while (0)
/* 88110 only: divu.d produces just the quotient; the remainder is then
   computed in C as N0 - quotient * D.  Note that N0 and D are each
   evaluated twice.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  ({union {UDItype __ll;						\
	   struct {USItype __h, __l;} __i;				\
	  } __xx;							\
  USItype __q;								\
  __xx.__i.__h = (n1); __xx.__i.__l = (n0);				\
  __asm__ ("divu.d %0,%1,%2"						\
	   : "=r" (__q)							\
	   : "r" (__xx.__ll),						\
	     "r" ((USItype) (d)));					\
  (r) = (n0) - __q * (d); (q) = __q; })
#define UMUL_TIME 5
#define UDIV_TIME 25
#else
#define UMUL_TIME 17
#define UDIV_TIME 150
#endif /* __mc88110__ */
#endif /* __m88000__ */
803 
804 #if defined (__mn10300__)
805 # if defined (__AM33__)
806 #  define count_leading_zeros(COUNT,X)	((COUNT) = __builtin_clz (X))
807 #  define umul_ppmm(w1, w0, u, v)		\
808     asm("mulu %3,%2,%1,%0" : "=r"(w0), "=r"(w1) : "r"(u), "r"(v))
809 #  define smul_ppmm(w1, w0, u, v)		\
810     asm("mul %3,%2,%1,%0" : "=r"(w0), "=r"(w1) : "r"(u), "r"(v))
811 # else
812 #  define umul_ppmm(w1, w0, u, v)		\
813     asm("nop; nop; mulu %3,%0" : "=d"(w0), "=z"(w1) : "%0"(u), "d"(v))
814 #  define smul_ppmm(w1, w0, u, v)		\
815     asm("nop; nop; mul %3,%0" : "=d"(w0), "=z"(w1) : "%0"(u), "d"(v))
816 # endif
/* add_ssaaaa (sh, sl, ah, al, bh, bl): assemble the two double-word
   operands in DWunion temporaries, add their `ll' members in C, and split
   the sum back into SH (high) and SL (low).  */
# define add_ssaaaa(sh, sl, ah, al, bh, bl)	\
  do {						\
    DWunion __s, __a, __b;			\
    __a.s.low = (al); __a.s.high = (ah);	\
    __b.s.low = (bl); __b.s.high = (bh);	\
    __s.ll = __a.ll + __b.ll;			\
    (sl) = __s.s.low; (sh) = __s.s.high;	\
  } while (0)
/* sub_ddmmss (sh, sl, ah, al, bh, bl): same scheme as the addition above
   it, but computing (ah,al) - (bh,bl) through the `ll' members of
   DWunion temporaries.  */
# define sub_ddmmss(sh, sl, ah, al, bh, bl)	\
  do {						\
    DWunion __s, __a, __b;			\
    __a.s.low = (al); __a.s.high = (ah);	\
    __b.s.low = (bl); __b.s.high = (bh);	\
    __s.ll = __a.ll - __b.ll;			\
    (sl) = __s.s.low; (sh) = __s.s.high;	\
  } while (0)
/* udiv_qrnnd / sdiv_qrnnd (q, r, nh, nl, d): "divu"/"div" by D.  NL is
   tied to the Q output register and NH to the R output register (the
   "z" constraint), so the instruction consumes the dividend in place.
   Spelled __asm__ rather than asm so the header also compiles under
   -ansi / -std=c99, matching the rest of this file.  */
# define udiv_qrnnd(q, r, nh, nl, d)		\
  __asm__ ("divu %2,%0" : "=D"(q), "=z"(r) : "D"(d), "0"(nl), "1"(nh))
# define sdiv_qrnnd(q, r, nh, nl, d)		\
  __asm__ ("div %2,%0" : "=D"(q), "=z"(r) : "D"(d), "0"(nl), "1"(nh))
# define UMUL_TIME 3
# define UDIV_TIME 38
839 #endif
840 
841 #if defined (__mips__) && W_TYPE_SIZE == 32
/* umul_ppmm (w1, w0, u, v): plain C multiply.  U and V are each narrowed
   to USItype, the product is formed in a UDItype, and its upper and
   lower 32 bits are stored to W1 and W0 respectively.  */
#define umul_ppmm(w1, w0, u, v)						\
  do {									\
    UDItype __x = (USItype) (u);	/* widen the first factor */	\
    __x *= (USItype) (v);		/* 64-bit product */		\
    (w1) = (USItype) (__x >> 32);					\
    (w0) = (USItype) (__x);						\
  } while (0)
#define UMUL_TIME 10
#define UDIV_TIME 100
850 
/* For __mips values 32 and 64, outside mips16 mode, count_leading_zeros is
   the compiler builtin and a zero operand is declared to yield 32.
   NOTE(review): __builtin_clz is documented as undefined for 0; the value
   32 presumably reflects the hardware instruction -- confirm.  */
#if (__mips == 32 || __mips == 64) && ! defined (__mips16)
#define count_leading_zeros(COUNT,X)	((COUNT) = __builtin_clz (X))
#define COUNT_LEADING_ZEROS_0 32
#endif
855 #endif /* __mips__ */
856 
857 #if defined (__ns32000__) && W_TYPE_SIZE == 32
/* umul_ppmm (w1, w0, u, v): "meid" with U tied to the UDItype output
   operand (commutative with V).  The union, laid out low word first,
   yields W1 from __h and W0 from __l.  */
#define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll;						\
	   struct {USItype __l, __h;} __i;				\
	  } __xx;							\
  __asm__ ("meid %2,%0"							\
	   : "=g" (__xx.__ll)						\
	   : "%0" ((USItype) (u)),					\
	     "g" ((USItype) (v)));					\
  (w1) = __xx.__i.__h; (w0) = __xx.__i.__l;})
/* __umulsidi3 (u, v): the same "meid" sequence, but the whole UDItype
   result is the value of the statement expression.  */
#define __umulsidi3(u, v) \
  ({UDItype __w;							\
    __asm__ ("meid %2,%0"						\
	     : "=g" (__w)						\
	     : "%0" ((USItype) (u)),					\
	       "g" ((USItype) (v)));					\
    __w; })
/* udiv_qrnnd (q, r, n1, n0, d): N1:N0 is packed into a UDItype that is
   both input and output of "deid"; afterwards the __l half is stored to R
   and the __h half to Q.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  ({union {UDItype __ll;						\
	   struct {USItype __l, __h;} __i;				\
	  } __xx;							\
  __xx.__i.__h = (n1); __xx.__i.__l = (n0);				\
  __asm__ ("deid %2,%0"							\
	   : "=g" (__xx.__ll)						\
	   : "0" (__xx.__ll),						\
	     "g" ((USItype) (d)));					\
  (r) = __xx.__i.__l; (q) = __xx.__i.__h; })
/* count_trailing_zeros (count, x): "ffsd" on X; the output register is
   preloaded with 0 through the matching "0" input operand.  */
#define count_trailing_zeros(count,x) \
  do {									\
    __asm__ ("ffsd     %2,%0"						\
	    : "=r" ((USItype) (count))					\
	    : "0" ((USItype) 0),					\
	      "r" ((USItype) (x)));					\
  } while (0)
891 #endif /* __ns32000__ */
892 
893 /* FIXME: We should test _IBMR2 here when we add assembly support for the
894    system vendor compilers.
895    FIXME: What's needed for gcc PowerPC VxWorks?  __vxworks__ is not good
896    enough, since that hits ARM and m68k too.  */
897 #if (defined (_ARCH_PPC)	/* AIX */				\
898      || defined (__powerpc__)	/* gcc */				\
899      || defined (__POWERPC__)	/* BEOS */				\
900      || defined (__ppc__)	/* Darwin */				\
901      || (defined (PPC) && ! defined (CPU_FAMILY)) /* gcc 2.7.x GNU&SysV */    \
902      || (defined (PPC) && defined (CPU_FAMILY)    /* VxWorks */               \
903 	 && CPU_FAMILY == PPC)                                                \
904      ) && W_TYPE_SIZE == 32
/* add_ssaaaa (sh, sl, ah, al, bh, bl): a carrying add of the low words
   followed by a carry-consuming add into the high word.  When BH is a
   compile-time constant 0 or all-ones, the high word is formed from AH
   alone with addze / addme, so BH needs no register; otherwise adde adds
   AH and BH.  BL may be a register or an "I" immediate, and SL is
   earlyclobber because it is written before AH/BH are read.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do {									\
    if (__builtin_constant_p (bh) && (bh) == 0)				\
      __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2"		\
	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)		\
      __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2"		\
	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    else								\
      __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3"		\
	     : "=r" (sh), "=&r" (sl)					\
	     : "%r" (ah), "r" (bh), "%r" (al), "rI" (bl));		\
  } while (0)
/* sub_ddmmss (sh, sl, ah, al, bh, bl): a carrying subtract-from of the
   low words (AL may be a register or an "I" immediate), then a
   carry-consuming instruction for the high word.  Four special cases
   avoid a register for a compile-time-constant high operand: AH == 0
   (subfze on BH), AH == all-ones (subfme on BH), BH == 0 (addme on AH)
   and BH == all-ones (addze on AH).  The general case uses subfe.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do {									\
    if (__builtin_constant_p (ah) && (ah) == 0)				\
      __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2"	\
	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0)		\
      __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2"	\
	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == 0)			\
      __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2"		\
	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)		\
      __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2"		\
	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
    else								\
      __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2"	\
	       : "=r" (sh), "=&r" (sl)					\
	       : "r" (ah), "r" (bh), "rI" (al), "r" (bl));		\
  } while (0)
/* count_leading_zeros (count, x): a single "cntlzw"; a zero operand is
   declared to give 32.  */
#define count_leading_zeros(count, x) \
  __asm__ ("cntlzw %0,%1" : "=r" (count) : "r" (x))
#define COUNT_LEADING_ZEROS_0 32
940 #if defined (_ARCH_PPC) || defined (__powerpc__) || defined (__POWERPC__) \
941   || defined (__ppc__)                                                    \
942   || (defined (PPC) && ! defined (CPU_FAMILY)) /* gcc 2.7.x GNU&SysV */       \
943   || (defined (PPC) && defined (CPU_FAMILY)    /* VxWorks */                  \
944 	 && CPU_FAMILY == PPC)
/* umul_ppmm (ph, pl, m0, m1): "mulhwu" supplies the high word PH; the low
   word PL is the ordinary (wrapping) USItype product.  Both factors are
   first copied into USItype locals, and those copies -- not the raw macro
   arguments -- feed the asm.  Each argument is therefore evaluated exactly
   once and always enters the instruction as a 32-bit word.  */
#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    USItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
    (pl) = __m0 * __m1;							\
  } while (0)
#define UMUL_TIME 15
/* smul_ppmm (ph, pl, m0, m1): signed counterpart of umul_ppmm.  "mulhw"
   supplies the high word PH and the low word PL is the SItype product of
   the local copies.  The asm reads the SItype copies rather than the raw
   macro arguments, so each argument is evaluated exactly once.  */
#define smul_ppmm(ph, pl, m0, m1) \
  do {									\
    SItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
    (pl) = __m0 * __m1;							\
  } while (0)
#define SMUL_TIME 14
#define UDIV_TIME 120
960 #endif
961 #endif /* 32-bit POWER architecture variants.  */
962 
963 /* We should test _IBMR2 here when we add assembly support for the system
964    vendor compilers.  */
965 #if (defined (_ARCH_PPC64) || defined (__powerpc64__)) && W_TYPE_SIZE == 64
/* add_ssaaaa (sh, sl, ah, al, bh, bl), 64-bit words: a carrying add of
   the low words, then addze / addme on AH alone when BH is a
   compile-time constant 0 / all-ones, or adde of AH and BH otherwise.
   BL may be a register or an "I" immediate; SL is earlyclobber because
   it is written before the high-word inputs are read.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do {									\
    if (__builtin_constant_p (bh) && (bh) == 0)				\
      __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2"		\
	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)		\
      __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2"		\
	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    else								\
      __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3"		\
	     : "=r" (sh), "=&r" (sl)					\
	     : "%r" (ah), "r" (bh), "%r" (al), "rI" (bl));		\
  } while (0)
/* sub_ddmmss (sh, sl, ah, al, bh, bl), 64-bit words: a carrying
   subtract-from of the low words, then one carry-consuming instruction
   for the high word.  Compile-time-constant high operands pick a
   one-input form: AH == 0 -> subfze on BH, AH == all-ones -> subfme on
   BH, BH == 0 -> addme on AH, BH == all-ones -> addze on AH.  The
   general case uses subfe.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do {									\
    if (__builtin_constant_p (ah) && (ah) == 0)				\
      __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2"	\
	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)		\
      __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2"	\
	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == 0)			\
      __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2"		\
	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
    else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)		\
      __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2"		\
	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
    else								\
      __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2"	\
	       : "=r" (sh), "=&r" (sl)					\
	       : "r" (ah), "r" (bh), "rI" (al), "r" (bl));		\
  } while (0)
/* count_leading_zeros (count, x): a single "cntlzd"; a zero operand is
   declared to give 64.  */
#define count_leading_zeros(count, x) \
  __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
#define COUNT_LEADING_ZEROS_0 64
/* umul_ppmm (ph, pl, m0, m1), 64-bit words: "mulhdu" supplies the high
   word PH; the low word PL is the wrapping UDItype product.  The asm
   reads the UDItype local copies instead of the raw macro arguments, so
   each argument is evaluated exactly once and always has word type.  */
#define umul_ppmm(ph, pl, m0, m1) \
  do {									\
    UDItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
    (pl) = __m0 * __m1;							\
  } while (0)
#define UMUL_TIME 15
/* smul_ppmm (ph, pl, m0, m1), 64-bit words: "mulhd" supplies the high
   word PH; the low word PL is the DItype product of the local copies.
   The asm reads those copies rather than the raw macro arguments, so
   each argument is evaluated exactly once.  */
#define smul_ppmm(ph, pl, m0, m1) \
  do {									\
    DItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1));	\
    (pl) = __m0 * __m1;							\
  } while (0)
#define SMUL_TIME 14  /* ??? */
#define UDIV_TIME 120 /* ??? */
1016 #endif /* 64-bit PowerPC.  */
1017 
1018 #if defined (__ibm032__) /* RT/ROMP */ && W_TYPE_SIZE == 32
1019 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1020   __asm__ ("a %1,%5\n\tae %0,%3"					\
1021 	   : "=r" ((USItype) (sh)),					\
1022 	     "=&r" ((USItype) (sl))					\
1023 	   : "%0" ((USItype) (ah)),					\
1024 	     "r" ((USItype) (bh)),					\
1025 	     "%1" ((USItype) (al)),					\
1026 	     "r" ((USItype) (bl)))
1027 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1028   __asm__ ("s %1,%5\n\tse %0,%3"					\
1029 	   : "=r" ((USItype) (sh)),					\
1030 	     "=&r" ((USItype) (sl))					\
1031 	   : "0" ((USItype) (ah)),					\
1032 	     "r" ((USItype) (bh)),					\
1033 	     "1" ((USItype) (al)),					\
1034 	     "r" ((USItype) (bl)))
1035 #define umul_ppmm(ph, pl, m0, m1) \
1036   do {									\
1037     USItype __m0 = (m0), __m1 = (m1);					\
1038     __asm__ (								\
1039        "s	r2,r2\n"						\
1040 "	mts	r10,%2\n"						\
1041 "	m	r2,%3\n"						\
1042 "	m	r2,%3\n"						\
1043 "	m	r2,%3\n"						\
1044 "	m	r2,%3\n"						\
1045 "	m	r2,%3\n"						\
1046 "	m	r2,%3\n"						\
1047 "	m	r2,%3\n"						\
1048 "	m	r2,%3\n"						\
1049 "	m	r2,%3\n"						\
1050 "	m	r2,%3\n"						\
1051 "	m	r2,%3\n"						\
1052 "	m	r2,%3\n"						\
1053 "	m	r2,%3\n"						\
1054 "	m	r2,%3\n"						\
1055 "	m	r2,%3\n"						\
1056 "	m	r2,%3\n"						\
1057 "	cas	%0,r2,r0\n"						\
1058 "	mfs	r10,%1"							\
1059 	     : "=r" ((USItype) (ph)),					\
1060 	       "=r" ((USItype) (pl))					\
1061 	     : "%r" (__m0),						\
1062 		"r" (__m1)						\
1063 	     : "r2");							\
1064     (ph) += ((((SItype) __m0 >> 31) & __m1)				\
1065 	     + (((SItype) __m1 >> 31) & __m0));				\
1066   } while (0)
1067 #define UMUL_TIME 20
1068 #define UDIV_TIME 200
1069 #define count_leading_zeros(count, x) \
1070   do {									\
1071     if ((x) >= 0x10000)							\
1072       __asm__ ("clz	%0,%1"						\
1073 	       : "=r" ((USItype) (count))				\
1074 	       : "r" ((USItype) (x) >> 16));				\
1075     else								\
1076       {									\
1077 	__asm__ ("clz	%0,%1"						\
1078 		 : "=r" ((USItype) (count))				\
1079 		 : "r" ((USItype) (x)));					\
1080 	(count) += 16;							\
1081       }									\
1082   } while (0)
1083 #endif
1084 
1085 #if defined(__sh__) && !__SHMEDIA__ && W_TYPE_SIZE == 32
#ifndef __sh1__
/* umul_ppmm (w1, w0, u, v): "dmulu.l" on U and V, then "sts" copies macl
   into W0 and mach into W1.  macl and mach are declared clobbered.  Not
   provided when __sh1__ is defined.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ (								\
       "dmulu.l	%2,%3\n\tsts%M1	macl,%1\n\tsts%M0	mach,%0"	\
	   : "=r<" ((USItype)(w1)),					\
	     "=r<" ((USItype)(w0))					\
	   : "r" ((USItype)(u)),					\
	     "r" ((USItype)(v))						\
	   : "macl", "mach")
#define UMUL_TIME 5
#endif

/* This is the same algorithm as __udiv_qrnnd_c.  */
#define UDIV_NEEDS_NORMALIZATION 1

/* udiv_qrnnd (q, r, n1, n0, d): calls the hidden-visibility helper
   __udiv_qrnnd_16 twice through "jsr @%5".  D is copied to r5, the
   word-swapped N0 to r4, and r6 is derived from r5 by swap.w / shll16;
   N1 enters (and R leaves) in the "z"-constrained register.  The two
   r1 results are combined into Q with swap.w and or.  r1, r2, r4, r5,
   r6, pr and t are declared clobbered.  The register-role notes inside
   the macro are the original author's.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do {									\
    extern UWtype __udiv_qrnnd_16 (UWtype, UWtype)			\
			__attribute__ ((visibility ("hidden")));	\
    /* r0: rn r1: qn */ /* r0: n1 r4: n0 r5: d r6: d1 */ /* r2: __m */	\
    __asm__ (								\
	"mov%M4 %4,r5\n"						\
"	swap.w %3,r4\n"							\
"	swap.w r5,r6\n"							\
"	jsr @%5\n"							\
"	shll16 r6\n"							\
"	swap.w r4,r4\n"							\
"	jsr @%5\n"							\
"	swap.w r1,%0\n"							\
"	or r1,%0"							\
	: "=r" (q), "=&z" (r)						\
	: "1" (n1), "r" (n0), "rm" (d), "r" (&__udiv_qrnnd_16)		\
	: "r1", "r2", "r4", "r5", "r6", "pr", "t");			\
  } while (0)

#define UDIV_TIME 80

/* sub_ddmmss (sh, sl, ah, al, bh, bl): "clrt" clears the T bit, then two
   "subc" instructions subtract BL from the SL register (preloaded with
   AL) and BH from the SH register (preloaded with AH).  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl)				\
  __asm__ ("clrt;subc %5,%1; subc %4,%0"				\
	   : "=r" (sh), "=r" (sl)					\
	   : "0" (ah), "1" (al), "r" (bh), "r" (bl) : "t")
1127 
1128 #endif /* __sh__ */
1129 
1130 #if defined (__SH5__) && __SHMEDIA__ && W_TYPE_SIZE == 32
/* __umulsidi3 (u, v): 32x32->64 unsigned multiply written in C.  U is
   converted to USItype and widened to UDItype, then multiplied by V
   converted to USItype.  Both arguments are parenthesized so that an
   expression argument such as `a + b' is converted and multiplied as a
   whole rather than having the casts bind to its first term only.  */
#define __umulsidi3(u,v) ((UDItype)(USItype)(u)*(USItype)(v))
/* count_leading_zeros (count, x): X is narrowed to USItype and held in a
   UDItype, "nsb" is applied to that 64-bit value, and 31 is subtracted
   from its result to give COUNT.  A zero operand is declared to yield
   32.  */
#define count_leading_zeros(count, x) \
  do									\
    {									\
      UDItype x_ = (USItype)(x);					\
      SItype c_;							\
									\
      __asm__ ("nsb %1, %0" : "=r" (c_) : "r" (x_));			\
      (count) = c_ - 31;						\
    }									\
  while (0)
#define COUNT_LEADING_ZEROS_0 32
1143 #endif
1144 
1145 #if defined (__sparc__) && !defined (__arch64__) && !defined (__sparcv9) \
1146     && W_TYPE_SIZE == 32
/* add_ssaaaa (sh, sl, ah, al, bh, bl): "addcc" on the low words sets the
   carry that "addx" folds into the high-word sum.  The first operand of
   each pair accepts a register or "J", the second a register or "I";
   the pairs are marked commutative.  SL is earlyclobber because it is
   written before AH / BH are read.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0"				\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "%rJ" ((USItype) (ah)),					\
	     "rI" ((USItype) (bh)),					\
	     "%rJ" ((USItype) (al)),					\
	     "rI" ((USItype) (bl))					\
	   __CLOBBER_CC)
/* sub_ddmmss (sh, sl, ah, al, bh, bl): "subcc" on the low words then
   "subx" on the high words.  The operands are deliberately not marked
   commutative.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0"				\
	   : "=r" ((USItype) (sh)),					\
	     "=&r" ((USItype) (sl))					\
	   : "rJ" ((USItype) (ah)),					\
	     "rI" ((USItype) (bh)),					\
	     "rJ" ((USItype) (al)),					\
	     "rI" ((USItype) (bl))					\
	   __CLOBBER_CC)
1165 #if defined (__sparc_v9__)
/* umul_ppmm (w1, w0, u, v) for 32-bit code on a v9 CPU: "umul" leaves its
   result in a local variable pinned to register g1, "srlx" by 32
   extracts W1 from it, and the local is then stored to W0.  The register
   binding is spelled __asm__, like the asm statement itself, so that the
   header also compiles under -ansi / -std=c99, where plain `asm' is not
   a keyword.  */
#define umul_ppmm(w1, w0, u, v) \
  do {									\
    register USItype __g1 __asm__ ("g1");				\
    __asm__ ("umul\t%2,%3,%1\n\t"					\
	     "srlx\t%1, 32, %0"						\
	     : "=r" ((USItype) (w1)),					\
	       "=r" (__g1)						\
	     : "r" ((USItype) (u)),					\
	       "r" ((USItype) (v)));					\
    (w0) = __g1;							\
  } while (0)
/* udiv_qrnnd (q, r, n1, n0, d): N1 is moved into %y, "udiv" of N0 by D
   gives the quotient, and the remainder is rebuilt as n0 - q * d with
   "umul" and "sub".  Both outputs are earlyclobber because inputs are
   still read after they are written.  */
#define udiv_qrnnd(__q, __r, __n1, __n0, __d) \
  __asm__ ("mov\t%2,%%y\n\t"						\
	   "udiv\t%3,%4,%0\n\t"						\
	   "umul\t%0,%4,%1\n\t"						\
	   "sub\t%3,%1,%1"						\
	   : "=&r" ((USItype) (__q)),					\
	     "=&r" ((USItype) (__r))					\
	   : "r" ((USItype) (__n1)),					\
	     "r" ((USItype) (__n0)),					\
	     "r" ((USItype) (__d)))
1187 #else
1188 #if defined (__sparc_v8__)
/* umul_ppmm (w1, w0, u, v) for v8: "umul" writes W0 and "rd %y" then
   fetches W1.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("umul %2,%3,%1;rd %%y,%0"					\
	   : "=r" ((USItype) (w1)),					\
	     "=r" ((USItype) (w0))					\
	   : "r" ((USItype) (u)),					\
	     "r" ((USItype) (v)))
/* udiv_qrnnd (q, r, n1, n0, d) for v8: N1 is moved into %y, followed by
   three nops before "udiv" of N0 by D; the remainder is rebuilt as
   n0 - q * d with "umul" and "sub".  */
#define udiv_qrnnd(__q, __r, __n1, __n0, __d) \
  __asm__ ("mov %2,%%y;nop;nop;nop;udiv %3,%4,%0;umul %0,%4,%1;sub %3,%1,%1"\
	   : "=&r" ((USItype) (__q)),					\
	     "=&r" ((USItype) (__r))					\
	   : "r" ((USItype) (__n1)),					\
	     "r" ((USItype) (__n0)),					\
	     "r" ((USItype) (__d)))
1202 #else
1203 #if defined (__sparclite__)
/* This has hardware multiply but not divide.  It also has two additional
   instructions scan (ffs from high bit) and divscc.  */
/* umul_ppmm (w1, w0, u, v): "umul" writes W0 and "rd %y" fetches W1.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("umul %2,%3,%1;rd %%y,%0"					\
	   : "=r" ((USItype) (w1)),					\
	     "=r" ((USItype) (w0))					\
	   : "r" ((USItype) (u)),					\
	     "r" ((USItype) (v)))
/* udiv_qrnnd (q, r, n1, n0, d): N1 is written to %y, then thirty-two
   "divscc" steps run through g1 (the first reads N0, the last writes Q).
   R is read back from %y and, on the "bl" condition, D is added to it in
   the annulled delay slot.  g1 and the condition codes are clobbered.  */
#define udiv_qrnnd(q, r, n1, n0, d) \
  __asm__ ("! Inlined udiv_qrnnd\n"					\
"	wr	%%g0,%2,%%y	! Not a delayed write for sparclite\n"	\
"	tst	%%g0\n"							\
"	divscc	%3,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%%g1\n"						\
"	divscc	%%g1,%4,%0\n"						\
"	rd	%%y,%1\n"						\
"	bl,a 1f\n"							\
"	add	%1,%4,%1\n"						\
"1:	! End of inline udiv_qrnnd"					\
	   : "=r" ((USItype) (q)),					\
	     "=r" ((USItype) (r))					\
	   : "r" ((USItype) (n1)),					\
	     "r" ((USItype) (n0)),					\
	     "rI" ((USItype) (d))					\
	   : "g1" __AND_CLOBBER_CC)
#define UDIV_TIME 37
/* count_leading_zeros (count, x): a single "scan" instruction.  */
#define count_leading_zeros(count, x) \
  do {                                                                  \
  __asm__ ("scan %1,1,%0"                                               \
	   : "=r" ((USItype) (count))                                   \
	   : "r" ((USItype) (x)));					\
  } while (0)
/* Early sparclites return 63 for an argument of 0, but they warn that future
   implementations might change this.  Therefore, leave COUNT_LEADING_ZEROS_0
   undefined.  */
1268 #else
/* SPARC without integer multiplication and divide instructions.
   (i.e. at least Sun4/20,40,60,65,75,110,260,280,330,360,380,470,490) */
/* umul_ppmm (w1, w0, u, v): U is written to %y, and o5 is set to U
   masked by the sign of V (sra 31 / and).  g1 is cleared, then
   thirty-two "mulscc" steps against V plus a final "mulscc" with 0
   accumulate through g1.  W1 is g1 + o5 and W0 is read back from %y.
   g1, o5 and the condition codes are clobbered.  The "Don't move this
   insn" remarks are the original author's; the instructions after the
   "wr" presumably fill its delay -- confirm.  */
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("! Inlined umul_ppmm\n"					\
"	wr	%%g0,%2,%%y	! SPARC has 0-3 delay insn after a wr\n"\
"	sra	%3,31,%%o5	! Don't move this insn\n"		\
"	and	%2,%%o5,%%o5	! Don't move this insn\n"		\
"	andcc	%%g0,0,%%g1	! Don't move this insn\n"		\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,%3,%%g1\n"						\
"	mulscc	%%g1,0,%%g1\n"						\
"	add	%%g1,%%o5,%0\n"						\
"	rd	%%y,%1"							\
	   : "=r" ((USItype) (w1)),					\
	     "=r" ((USItype) (w0))					\
	   : "%rI" ((USItype) (u)),					\
	     "r" ((USItype) (v))						\
	   : "g1", "o5" __AND_CLOBBER_CC)
#define UMUL_TIME 39		/* 39 instructions */
/* It's quite necessary to add this much assembler for the sparc.
   The default udiv_qrnnd (in C) is more than 10 times slower!  */
/* udiv_qrnnd (q, r, n1, n0, d): a 32-iteration loop counted down in g1.
   Q starts out holding N0 and R holding N1 (matching constraints "0" and
   "1").  Each pass shifts a bit from Q into R and a bit into the bottom
   of Q, subtracting D from R as it goes.  The final "xnor" with 0
   complements Q.  g1 and the condition codes are clobbered.  */
#define udiv_qrnnd(__q, __r, __n1, __n0, __d) \
  __asm__ ("! Inlined udiv_qrnnd\n"					\
"	mov	32,%%g1\n"						\
"	subcc	%1,%2,%%g0\n"						\
"1:	bcs	5f\n"							\
"	 addxcc %0,%0,%0	! shift n1n0 and a q-bit in lsb\n"	\
"	sub	%1,%2,%1	! this kills msb of n\n"		\
"	addx	%1,%1,%1	! so this can't give carry\n"		\
"	subcc	%%g1,1,%%g1\n"						\
"2:	bne	1b\n"							\
"	 subcc	%1,%2,%%g0\n"						\
"	bcs	3f\n"							\
"	 addxcc %0,%0,%0	! shift n1n0 and a q-bit in lsb\n"	\
"	b	3f\n"							\
"	 sub	%1,%2,%1	! this kills msb of n\n"		\
"4:	sub	%1,%2,%1\n"						\
"5:	addxcc	%1,%1,%1\n"						\
"	bcc	2b\n"							\
"	 subcc	%%g1,1,%%g1\n"						\
"! Got carry from n.  Subtract next step to cancel this carry.\n"	\
"	bne	4b\n"							\
"	 addcc	%0,%0,%0	! shift n1n0 and a 0-bit in lsb\n"	\
"	sub	%1,%2,%1\n"						\
"3:	xnor	%0,0,%0\n"						\
"	! End of inline udiv_qrnnd"					\
	   : "=&r" ((USItype) (__q)),					\
	     "=&r" ((USItype) (__r))					\
	   : "r" ((USItype) (__d)),					\
	     "1" ((USItype) (__n1)),					\
	     "0" ((USItype) (__n0)) : "g1" __AND_CLOBBER_CC)
#define UDIV_TIME (3+7*32)	/* 7 instructions/iteration. 32 iterations.  */
1351 #endif /* __sparclite__ */
1352 #endif /* __sparc_v8__ */
1353 #endif /* __sparc_v9__ */
1354 #endif /* sparc32 */
1355 
1356 #if ((defined (__sparc__) && defined (__arch64__)) || defined (__sparcv9)) \
1357     && W_TYPE_SIZE == 64
/* add_ssaaaa (sh, sl, ah, al, bh, bl), 64-bit words: "addcc" adds the low
   words and sets the condition codes, and a plain "add" sums the high
   words.  "movcs %xcc" then sets the zero-initialized __carry local to 1
   on carry, and that value is added into SH.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl)				\
  do {									\
    UDItype __carry = 0;						\
    __asm__ ("addcc\t%r5,%6,%1\n\t"					\
	     "add\t%r3,%4,%0\n\t"					\
	     "movcs\t%%xcc, 1, %2\n\t"					\
	     "add\t%0, %2, %0"						\
	     : "=r" ((UDItype)(sh)),				      	\
	       "=&r" ((UDItype)(sl)),				      	\
	       "+r" (__carry)				      		\
	     : "%rJ" ((UDItype)(ah)),				     	\
	       "rI" ((UDItype)(bh)),				      	\
	       "%rJ" ((UDItype)(al)),				     	\
	       "rI" ((UDItype)(bl))				       	\
	     __CLOBBER_CC);						\
  } while (0)
1374 
/* sub_ddmmss (sh, sl, ah, al, bh, bl), 64-bit words: "subcc" subtracts
   the low words and sets the condition codes, and a plain "sub"
   subtracts the high words.  "movcs %xcc" then sets the zero-initialized
   __carry local to 1 on borrow, and that value is subtracted from SH.

   The minuend operands (AH, AL) must NOT carry the commutative `%'
   constraint modifier: it would permit the compiler to exchange them
   with BH / BL, turning a - b into b - a.  This matches the 32-bit SPARC
   sub_ddmmss, which does not mark its operands commutative either.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl)				\
  do {									\
    UDItype __carry = 0;						\
    __asm__ ("subcc\t%r5,%6,%1\n\t"					\
	     "sub\t%r3,%4,%0\n\t"					\
	     "movcs\t%%xcc, 1, %2\n\t"					\
	     "sub\t%0, %2, %0"						\
	     : "=r" ((UDItype)(sh)),				      	\
	       "=&r" ((UDItype)(sl)),				      	\
	       "+r" (__carry)				      		\
	     : "rJ" ((UDItype)(ah)),				     	\
	       "rI" ((UDItype)(bh)),				      	\
	       "rJ" ((UDItype)(al)),				     	\
	       "rI" ((UDItype)(bl))				       	\
	     __CLOBBER_CC);						\
  } while (0)
1391 
/* umul_ppmm (wh, wl, u, v), 64-bit words: built from four "mulx"
   instructions on 32-bit halves of U and V (extracted with srl ...,0
   and srlx ...,32), combined with shifts and adds.  A conditional move
   on %xcc supplies the carry between partial sums.  Four scratch
   UDItype temporaries are earlyclobber outputs; the asm is volatile and
   clobbers the condition codes.  */
#define umul_ppmm(wh, wl, u, v)						\
  do {									\
	  UDItype tmp1, tmp2, tmp3, tmp4;				\
	  __asm__ __volatile__ (					\
		   "srl %7,0,%3\n\t"					\
		   "mulx %3,%6,%1\n\t"					\
		   "srlx %6,32,%2\n\t"					\
		   "mulx %2,%3,%4\n\t"					\
		   "sllx %4,32,%5\n\t"					\
		   "srl %6,0,%3\n\t"					\
		   "sub %1,%5,%5\n\t"					\
		   "srlx %5,32,%5\n\t"					\
		   "addcc %4,%5,%4\n\t"					\
		   "srlx %7,32,%5\n\t"					\
		   "mulx %3,%5,%3\n\t"					\
		   "mulx %2,%5,%5\n\t"					\
		   "sethi %%hi(0x80000000),%2\n\t"			\
		   "addcc %4,%3,%4\n\t"					\
		   "srlx %4,32,%4\n\t"					\
		   "add %2,%2,%2\n\t"					\
		   "movcc %%xcc,%%g0,%2\n\t"				\
		   "addcc %5,%4,%5\n\t"					\
		   "sllx %3,32,%3\n\t"					\
		   "add %1,%3,%1\n\t"					\
		   "add %5,%2,%0"					\
	   : "=r" ((UDItype)(wh)),					\
	     "=&r" ((UDItype)(wl)),					\
	     "=&r" (tmp1), "=&r" (tmp2), "=&r" (tmp3), "=&r" (tmp4)	\
	   : "r" ((UDItype)(u)),					\
	     "r" ((UDItype)(v))						\
	   __CLOBBER_CC);						\
  } while (0)
#define UMUL_TIME 96
#define UDIV_TIME 230
1426 #endif /* sparc64 */
1427 
1428 #if defined (__vax__) && W_TYPE_SIZE == 32
/* add_ssaaaa (sh, sl, ah, al, bh, bl): "addl2" on the low words then
   "adwc" on the high words, with AH / AL tied to the outputs
   (commutative with BH / BL).  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addl2 %5,%1\n\tadwc %3,%0"					\
	   : "=g" ((USItype) (sh)),					\
	     "=&g" ((USItype) (sl))					\
	   : "%0" ((USItype) (ah)),					\
	     "g" ((USItype) (bh)),					\
	     "%1" ((USItype) (al)),					\
	     "g" ((USItype) (bl)))
/* sub_ddmmss (sh, sl, ah, al, bh, bl): "subl2" then "sbwc", with AH / AL
   tied to the outputs (not commutative).  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subl2 %5,%1\n\tsbwc %3,%0"					\
	   : "=g" ((USItype) (sh)),					\
	     "=&g" ((USItype) (sl))					\
	   : "0" ((USItype) (ah)),					\
	     "g" ((USItype) (bh)),					\
	     "1" ((USItype) (al)),					\
	     "g" ((USItype) (bl)))
/* umul_ppmm (xh, xl, m0, m1): "emul" with addend $0 writes a UDItype
   that the union (low word first) splits into XH and XL.  XH is then
   increased by M1 when M0 has its top bit set and by M0 when M1 has its
   top bit set.  NOTE(review): presumably emul is a signed multiply and
   this is the signed-to-unsigned correction -- confirm.  */
#define umul_ppmm(xh, xl, m0, m1) \
  do {									\
    union {								\
	UDItype __ll;							\
	struct {USItype __l, __h;} __i;					\
      } __xx;								\
    USItype __m0 = (m0), __m1 = (m1);					\
    __asm__ ("emul %1,%2,$0,%0"						\
	     : "=r" (__xx.__ll)						\
	     : "g" (__m0),						\
	       "g" (__m1));						\
    (xh) = __xx.__i.__h;						\
    (xl) = __xx.__i.__l;						\
    (xh) += ((((SItype) __m0 >> 31) & __m1)				\
	     + (((SItype) __m1 >> 31) & __m0));				\
  } while (0)
/* sdiv_qrnnd (q, r, n1, n0, d): N1:N0 is packed into a signed DItype and
   divided by D with "ediv", which writes Q and R directly.  */
#define sdiv_qrnnd(q, r, n1, n0, d) \
  do {									\
    union {DItype __ll;							\
	   struct {SItype __l, __h;} __i;				\
	  } __xx;							\
    __xx.__i.__h = n1; __xx.__i.__l = n0;				\
    __asm__ ("ediv %3,%2,%0,%1"						\
	     : "=g" (q), "=g" (r)					\
	     : "g" (__xx.__ll), "g" (d));				\
  } while (0)
1471 #endif /* __vax__ */
1472 
1473 #ifdef _TMS320C6X
/* add_ssaaaa (sh, sl, ah, al, bh, bl): "addu .l1" adds AL and BL into a
   UDItype.  Its low 32 bits become SL, and its bits above 32 are added
   to AH + BH in C to give SH.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do									\
    {									\
      UDItype __ll;							\
      __asm__ ("addu .l1 %1, %2, %0"					\
	       : "=a" (__ll) : "a" (al), "a" (bl));			\
      (sl) = (USItype)__ll;						\
      (sh) = ((USItype)(__ll >> 32)) + (ah) + (bh);			\
    }									\
  while (0)
1484 
1485 #ifdef _TMS320C6400_PLUS
/* __umulsidi3 (u, v): 32x32->64 unsigned multiply written in C.  Both
   arguments are parenthesized so that an expression argument such as
   `a + b' is converted to USItype and multiplied as a whole, instead of
   the casts binding only to its first term.  */
#define __umulsidi3(u,v) ((UDItype)(USItype)(u)*(USItype)(v))
/* umul_ppmm (w1, w0, u, v): plain C multiply.  U and V are narrowed to
   USItype, multiplied in a UDItype, and the upper / lower 32 bits of the
   product are stored to W1 / W0.  */
#define umul_ppmm(w1, w0, u, v)						\
  do {									\
    UDItype __x = (USItype) (u);	/* widen the first factor */	\
    __x *= (USItype) (v);		/* 64-bit product */		\
    (w1) = (USItype) (__x >> 32);					\
    (w0) = (USItype) (__x);						\
  } while (0)
1493 #endif  /* _TMS320C6400_PLUS */
1494 
/* Bit counting goes through the compiler builtins; the trailing-zero
   variant is only provided when _TMS320C6400 is defined.  */
#define count_leading_zeros(count, x)	((count) = __builtin_clz (x))
#ifdef _TMS320C6400
#define count_trailing_zeros(count, x)	((count) = __builtin_ctz (x))
#endif
#define UMUL_TIME 4
#define UDIV_TIME 40
1501 #endif /* _TMS320C6X */
1502 
1503 #if defined (__xtensa__) && W_TYPE_SIZE == 32
/* This code is not Xtensa-configuration-specific, so rely on the compiler
   to expand builtin functions depending on what configuration features
   are available.  This avoids library calls when the operation can be
   performed in-line.  */
/* umul_ppmm (w1, w0, u, v): the double-word product comes from
   __builtin_umulsidi3 and is split through a DWunion into W1 (high) and
   W0 (low).  The output arguments are parenthesized like in every other
   macro of this file.  */
#define umul_ppmm(w1, w0, u, v)						\
  do {									\
    DWunion __w;							\
    __w.ll = __builtin_umulsidi3 (u, v);				\
    (w1) = __w.s.high;							\
    (w0) = __w.s.low;							\
  } while (0)
/* Direct mappings onto compiler builtins.  */
#define __umulsidi3(u, v)		__builtin_umulsidi3 (u, v)
#define count_leading_zeros(COUNT, X)	((COUNT) = __builtin_clz (X))
#define count_trailing_zeros(COUNT, X)	((COUNT) = __builtin_ctz (X))
1518 #endif /* __xtensa__ */
1519 
1520 #if defined xstormy16
/* Helper taking and returning a UHItype.  NOTE(review): the
   count_leading_zeros macro that follows calls __clzhi2, not this
   function -- confirm which symbol is intended and that __clzhi2 is
   declared before the macro is used.  */
extern UHItype __stormy16_count_leading_zeros (UHItype);
/* count_leading_zeros (count, x): walk X in 16-bit pieces from the most
   significant end, adding the __clzhi2 result of each piece to COUNT and
   stopping at the first piece that is not entirely zero (result != 16).
   A zero operand therefore yields W_TYPE_SIZE.

   The locals use reserved-namespace names (__size, __c) so that caller
   variables named `size' or `c' passed as COUNT or X are not silently
   shadowed inside the expansion.  X is expanded once per iteration and
   should be free of side effects.  */
#define count_leading_zeros(count, x)					\
  do									\
    {									\
      UHItype __size;							\
									\
      /* We assume that W_TYPE_SIZE is a multiple of 16...  */		\
      for ((count) = 0, __size = W_TYPE_SIZE; __size; __size -= 16)	\
	{								\
	  UHItype __c;							\
									\
	  __c = __clzhi2 ((x) >> (__size - 16));			\
	  (count) += __c;						\
	  if (__c != 16)						\
	    break;							\
	}								\
    }									\
  while (0)
#define COUNT_LEADING_ZEROS_0 W_TYPE_SIZE
1540 #endif
1541 
1542 #if defined (__z8000__) && W_TYPE_SIZE == 16
/* add_ssaaaa (sh, sl, ah, al, bh, bl) on 16-bit words: "add" on the low
   words then "adc" on the high words, with AH / AL tied to the outputs
   (commutative with BH / BL).  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add	%H1,%H5\n\tadc	%H0,%H3"				\
	   : "=r" ((unsigned int)(sh)),					\
	     "=&r" ((unsigned int)(sl))					\
	   : "%0" ((unsigned int)(ah)),					\
	     "r" ((unsigned int)(bh)),					\
	     "%1" ((unsigned int)(al)),					\
	     "rQR" ((unsigned int)(bl)))
/* sub_ddmmss (sh, sl, ah, al, bh, bl): "sub" then "sbc", with AH / AL
   tied to the outputs (not commutative).  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub	%H1,%H5\n\tsbc	%H0,%H3"				\
	   : "=r" ((unsigned int)(sh)),					\
	     "=&r" ((unsigned int)(sl))					\
	   : "0" ((unsigned int)(ah)),					\
	     "r" ((unsigned int)(bh)),					\
	     "1" ((unsigned int)(al)),					\
	     "rQR" ((unsigned int)(bl)))
/* umul_ppmm (xh, xl, m0, m1): "mult" with M0 tied to the __l output
   half.  The two halves are copied to XH / XL, after which XH is
   increased by M1 when M0 has bit 15 set and by M0 when M1 has bit 15
   set.  NOTE(review): presumably mult is signed and this is the
   signed-to-unsigned correction -- confirm.  */
#define umul_ppmm(xh, xl, m0, m1) \
  do {									\
    union {long int __ll;						\
	   struct {unsigned int __h, __l;} __i;				\
	  } __xx;							\
    unsigned int __m0 = (m0), __m1 = (m1);				\
    __asm__ ("mult	%S0,%H3"					\
	     : "=r" (__xx.__i.__h),					\
	       "=r" (__xx.__i.__l)					\
	     : "%1" (__m0),						\
	       "rQR" (__m1));						\
    (xh) = __xx.__i.__h; (xl) = __xx.__i.__l;				\
    (xh) += ((((signed int) __m0 >> 15) & __m1)				\
	     + (((signed int) __m1 >> 15) & __m0));			\
  } while (0)
1574 #endif /* __z8000__ */
1575 
1576 #endif /* __GNUC__ */
1577 
1578 /* If this machine has no inline assembler, use C macros.  */
1579 
1580 #if !defined (add_ssaaaa)
/* add_ssaaaa (sh, sl, ah, al, bh, bl): portable two-word addition.  The
   low words are added with wrap-around; the sum being smaller than AL
   signals a carry, which is added to AH + BH.  AL is expanded twice.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do {									\
    const UWtype __x = (al) + (bl);	/* low word, may wrap */	\
    (sh) = (ah) + (bh) + (__x < (al) ? 1 : 0);				\
    (sl) = __x;								\
  } while (0)
1588 #endif
1589 
1590 #if !defined (sub_ddmmss)
/* sub_ddmmss (sh, sl, ah, al, bh, bl): portable two-word subtraction.
   The low words are subtracted with wrap-around; the difference being
   larger than AL signals a borrow, which is taken from AH - BH.  AL is
   expanded twice.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do {									\
    const UWtype __x = (al) - (bl);	/* low word, may wrap */	\
    (sh) = (ah) - (bh) - (__x > (al) ? 1 : 0);				\
    (sl) = __x;								\
  } while (0)
1598 #endif
1599 
1600 /* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
1601    smul_ppmm.  */
1602 #if !defined (umul_ppmm) && defined (smul_ppmm)
/* umul_ppmm (w1, w0, u, v) on top of smul_ppmm: take the signed product,
   then add V to the high word when U has its top bit set and U when V
   has its top bit set.  The low word is passed straight through.  */
#define umul_ppmm(w1, w0, u, v)						\
  do {									\
    UWtype __xm0 = (u);							\
    UWtype __xm1 = (v);							\
    UWtype __w1;							\
    smul_ppmm (__w1, w0, __xm0, __xm1);					\
    (w1) = __w1								\
	   + ((__xm0 >> (W_TYPE_SIZE - 1)) ? __xm1 : 0)			\
	   + ((__xm1 >> (W_TYPE_SIZE - 1)) ? __xm0 : 0);		\
  } while (0)
1611 #endif
1612 
1613 /* If we still don't have umul_ppmm, define it using plain C.  */
1614 #if !defined (umul_ppmm)
/* umul_ppmm (w1, w0, u, v) in plain C: schoolbook multiplication on
   half-words.  U and V are each split with __ll_lowpart / __ll_highpart
   (so each is expanded twice), the four partial products are formed in
   UWtype, and the middle terms are folded together with an explicit
   carry into the top product.  */
#define umul_ppmm(w1, w0, u, v)						\
  do {									\
    /* Half-words of both operands.  */					\
    UHWtype __ul = __ll_lowpart (u);					\
    UHWtype __uh = __ll_highpart (u);					\
    UHWtype __vl = __ll_lowpart (v);					\
    UHWtype __vh = __ll_highpart (v);					\
									\
    /* The four partial products.  */					\
    UWtype __x0 = (UWtype) __ul * __vl;					\
    UWtype __x1 = (UWtype) __ul * __vh;					\
    UWtype __x2 = (UWtype) __uh * __vl;					\
    UWtype __x3 = (UWtype) __uh * __vh;					\
									\
    __x1 += __ll_highpart (__x0);	/* cannot carry */		\
    __x1 += __x2;			/* this one can */		\
    /* A wrapped middle sum is worth one unit of the upper half.  */	\
    __x3 += (__x1 < __x2) ? __ll_B : (UWtype) 0;			\
									\
    (w1) = __x3 + __ll_highpart (__x1);					\
    (w0) = __ll_lowpart (__x1) * __ll_B + __ll_lowpart (__x0);		\
  } while (0)
1638 #endif
1639 
/* Default __umulsidi3: multiply U by V with umul_ppmm, storing the two
   result words in the high and low members of a DWunion, and yield the
   union's double-word member.  Uses a GNU statement expression.  */
#ifndef __umulsidi3
#define __umulsidi3(u, v) \
  ({ DWunion __umulsidi3_w;                                             \
     umul_ppmm (__umulsidi3_w.s.high, __umulsidi3_w.s.low, u, v);       \
     __umulsidi3_w.ll; })
#endif
1646 
/* Plain C division of the two-word value (N1,N0) by D, storing the
   quotient in Q and the remainder in R.  The quotient is produced as two
   half-word digits; each digit is first estimated by dividing by the
   upper half of D and then corrected downwards at most twice.
   This macro is defined unconditionally so that it is always available
   for debugging.  D, N1 and N0 are each expanded more than once.
   NOTE(review): presumably requires D to have its top bit set and
   N1 < D (cf. UDIV_NEEDS_NORMALIZATION) -- confirm against the
   documentation at the head of this file.  */
#define __udiv_qrnnd_c(q, r, n1, n0, d) \
  do {                                                                  \
    UWtype __udc_dh = __ll_highpart (d);                                \
    UWtype __udc_dl = __ll_lowpart (d);                                 \
    UWtype __udc_qh, __udc_ql;                                          \
    UWtype __udc_rh, __udc_rl, __udc_prod;                              \
                                                                        \
    /* Upper quotient digit.  */                                        \
    __udc_rh = (n1) % __udc_dh;                                         \
    __udc_qh = (n1) / __udc_dh;                                         \
    __udc_prod = (UWtype) __udc_qh * __udc_dl;                          \
    __udc_rh = __udc_rh * __ll_B | __ll_highpart (n0);                  \
    if (__udc_rh < __udc_prod)                                          \
      {                                                                 \
        __udc_qh--;                                                     \
        __udc_rh += (d);                                                \
        /* Correct a second time only when the addition above did      \
           not carry out of the word.  */                               \
        if (__udc_rh >= (d) && __udc_rh < __udc_prod)                   \
          {                                                             \
            __udc_qh--;                                                 \
            __udc_rh += (d);                                            \
          }                                                             \
      }                                                                 \
    __udc_rh -= __udc_prod;                                             \
                                                                        \
    /* Lower quotient digit, same scheme on the running remainder.  */  \
    __udc_rl = __udc_rh % __udc_dh;                                     \
    __udc_ql = __udc_rh / __udc_dh;                                     \
    __udc_prod = (UWtype) __udc_ql * __udc_dl;                          \
    __udc_rl = __udc_rl * __ll_B | __ll_lowpart (n0);                   \
    if (__udc_rl < __udc_prod)                                          \
      {                                                                 \
        __udc_ql--;                                                     \
        __udc_rl += (d);                                                \
        if (__udc_rl >= (d) && __udc_rl < __udc_prod)                   \
          {                                                             \
            __udc_ql--;                                                 \
            __udc_rl += (d);                                            \
          }                                                             \
      }                                                                 \
    __udc_rl -= __udc_prod;                                             \
                                                                        \
    (q) = (UWtype) __udc_qh * __ll_B | __udc_ql;                        \
    (r) = __udc_rl;                                                     \
  } while (0)
1684 
/* The processor provides sdiv_qrnnd but no udiv_qrnnd: route unsigned
   division through the out-of-line helper __udiv_w_sdiv (defined in
   libgcc or elsewhere).  Its return value is stored in Q, and the word it
   writes through its first argument is stored in R.  */
#if defined (sdiv_qrnnd) && !defined (udiv_qrnnd)
#define udiv_qrnnd(q, r, nh, nl, d) \
  do {                                                                  \
    extern UWtype __udiv_w_sdiv (UWtype *, UWtype, UWtype, UWtype);     \
    UWtype __udiv_rem;                                                  \
    (q) = __udiv_w_sdiv (&__udiv_rem, nh, nl, d);                       \
    (r) = __udiv_rem;                                                   \
  } while (0)
#endif
1696 
/* Still no udiv_qrnnd for this processor: alias it to the plain C
   implementation __udiv_qrnnd_c and set UDIV_NEEDS_NORMALIZATION to 1.  */
#ifndef udiv_qrnnd
#define udiv_qrnnd __udiv_qrnnd_c
#define UDIV_NEEDS_NORMALIZATION 1
#endif
1702 
/* Portable count_leading_zeros.  A shift amount is chosen so that the
   most significant nonzero part of X lands in the low 8 bits, and the
   byte-indexed table __clz_tab finishes the job.  For words of at most
   32 bits the shift is one of 0, 1, 2 or 3 times __BITS4, picked by
   comparisons; for wider words the bytes are scanned from the top down.
   COUNT_LEADING_ZEROS_0 is defined as W_TYPE_SIZE alongside.  */
#ifndef count_leading_zeros
#define count_leading_zeros(count, x) \
  do {                                                                  \
    UWtype __clz_val = (x);                                             \
    UWtype __clz_shift;                                                 \
                                                                        \
    if (W_TYPE_SIZE > 32)                                               \
      {                                                                 \
        /* Step down a byte at a time until a nonzero byte is found    \
           or the lowest byte is reached.  */                           \
        __clz_shift = W_TYPE_SIZE - 8;                                  \
        while (__clz_shift > 0                                          \
               && ((__clz_val >> __clz_shift) & 0xff) == 0)             \
          __clz_shift -= 8;                                             \
      }                                                                 \
    else if (__clz_val < ((UWtype) 1 << 2 * __BITS4))                   \
      {                                                                 \
        __clz_shift = __clz_val < ((UWtype) 1 << __BITS4)               \
                      ? 0 : __BITS4;                                    \
      }                                                                 \
    else                                                                \
      {                                                                 \
        __clz_shift = __clz_val < ((UWtype) 1 << 3 * __BITS4)           \
                      ? 2 * __BITS4 : 3 * __BITS4;                      \
      }                                                                 \
                                                                        \
    (count) = W_TYPE_SIZE                                               \
              - (__clz_tab[__clz_val >> __clz_shift] + __clz_shift);    \
  } while (0)
#define COUNT_LEADING_ZEROS_0 W_TYPE_SIZE
#endif
1726 
#ifndef count_trailing_zeros
/* Build count_trailing_zeros on top of count_leading_zeros (which may be
   an asm version, or else the C one).  X & -X keeps only the lowest set
   bit of X; the leading-zero count of that value is then subtracted from
   W_TYPE_SIZE - 1.  */
#define count_trailing_zeros(count, x) \
  do {                                                                  \
    UWtype __ctz_in = (x);                                              \
    UWtype __ctz_lowbit = __ctz_in & -__ctz_in;                         \
    UWtype __ctz_lz;                                                    \
    count_leading_zeros (__ctz_lz, __ctz_lowbit);                       \
    (count) = W_TYPE_SIZE - 1 - __ctz_lz;                               \
  } while (0)
#endif
1738 
/* Give UDIV_NEEDS_NORMALIZATION a default of 0 when no earlier
   definition exists.  */
#if !defined (UDIV_NEEDS_NORMALIZATION)
#define UDIV_NEEDS_NORMALIZATION 0
#endif
1742