// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

$assert ELEMENTS_TILE >= 1
$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
#include <assert.h>

#include <xnnpack/common.h>
#include <xnnpack/raddstoreexpminusmax.h>

#include <fp16/bitcasts.h>


// Note: redeclared as uint32[] to avoid redundant bitcasts.
extern XNN_INTERNAL const uint32_t xnn_table_exp2_k_over_64[64];

void xnn_f32_raddstoreexpminusmax_ukernel__scalar_lut64_p2_x${ELEMENTS_TILE}${"" if ACCUMULATORS == 1 else "_acc%d" % ACCUMULATORS}(
    size_t elements,
    const float* input,
    float* output,
    float* sum,
    float vi_max)
{
  assert(elements % sizeof(float) == 0);

  const float vmagic_bias = 0x1.800000p23f;
  // The smallest x for which expf(x) is normalized.
  const float vdenorm_cutoff = -0x1.5D589Ep6f;
  const float vlog2e_x64 = 0x1.715476p6f;
  // The last 13 bits are zeroes.
  const float vminus_ln2_o64_hi = -0x1.630000p-7f;
  const float vminus_ln2_o64_lo = 0x1.BD0106p-19f;

  const float vc2 = 0x1.FFFF0Ap-2f;

  const uint32_t vindex_mask = UINT32_C(0x3F);

  $if ELEMENTS_TILE > 1:
    $for K in range(ACCUMULATORS):
      float vacc${K} = 0.0f;
    for (; elements >= ${ELEMENTS_TILE} * sizeof(float); elements -= ${ELEMENTS_TILE} * sizeof(float)) {
      // Load ${ELEMENTS_TILE} inputs at a time.
      $for N in range(ELEMENTS_TILE):
        const float vi${N} = input[${N}];
      input += ${ELEMENTS_TILE};

      // Subtract maximum input x := i - i_max. This implies x <= 0.
      $for N in range(ELEMENTS_TILE):
        const float vx${N} = vi${N} - vi_max;

      // Compute reduced argument n := round(x * 64 / log(2)).
      // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
      // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
      // The trick with adding a large number is valid only within certain bounds (|x * 64 / log(2)| <= 2**22, i.e.
      // |x| <= 0x1.62E43p+15 = 45426.09375), but that is acceptable, because inputs outside of [-87.336540, 0.0]
      // result in a denormalized or underflowed expf(x). We fix up the result for such inputs at the very end of the
      // algorithm.
      $for N in range(ELEMENTS_TILE):
        float vn${N} = vx${N} * vlog2e_x64 + vmagic_bias;

      // Create a floating-point number s (scale) such that s := 2**(n / 64) for inputs for which expf(x) is normalized,
      // i.e. -87.33642 <= x <= 0.0. As n has 6 fractional bits, we split s == 2**(n / 64) = 2**e * 2**(n / 64 - e), where
      // e := int(n / 64). We create s in two steps:
      // 1. Fetch 2**(n / 64 - e) = 2**(n % 64) from the table using the 6 low bits of n, as integer. Note that the
      //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
      // 2. Adjust the fetched value by adding e to its floating-point exponent. The result is always a normalized
      //    number, because for -87.33642 <= x <= 0.0 (inputs for which expf(x) is normalized) we have -126 <= e <= 0,
      //    and thus the adjusted exponent is not lower than -126.
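      //
      // Worked example (assuming the default round-to-nearest-even mode): for x = -1.0f,
      // x * 64 / log(2) ~= -92.33, so adding the magic bias gives about 12582819.67, which rounds to the
      // float 12582820.0f, i.e. n = round(x * 64 / log(2)) = -92. The bit pattern of that float is
      // 0x4B3FFFA4: its low 6 bits give the table index 0x24 = 36 = (-92 mod 64), while
      // (0x4B3FFFA4 & 0xFFFFFFC0) << 17 = 0xFF000000 adds e = floor(-92 / 64) = -2 to the exponent field
      // of the fetched table entry, so s = 2**(36/64) * 2**(-2) = 2**(-92/64) ~= 0.369.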
      //
      // Extract e from bits 6:14 of n and shift it into bits 23:31 (position of floating-point exponent).
      $for N in range(ELEMENTS_TILE):
        const uint32_t ve${N} = (fp32_to_bits(vn${N}) & UINT32_C(0xFFFFFFC0)) << 17;

      // Use bits 0:6 of n, as integer, as an index for table lookup of l := 2**(n % 64).
      $for N in range(ELEMENTS_TILE):
        const uint32_t vidx${N} = fp32_to_bits(vn${N}) & vindex_mask;
      // Adjust the exponent of the value l fetched from the table to get the final s value.
      $for N in range(ELEMENTS_TILE):
        const float vs${N} = fp32_from_bits(xnn_table_exp2_k_over_64[vidx${N}] + ve${N});

      // Subtract the large number back to get the final n := round(x * 64 / log(2)) as a floating-point number.
      $for N in range(ELEMENTS_TILE):
        vn${N} -= vmagic_bias;

      // Compute reduced argument t := x - n * log(2) / 64.
      // Use the Cody-Waite range reduction method (note the two constants representing log(2) / 64) to improve accuracy.
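      // The hi/lo split appears to be chosen so that the last 13 mantissa bits of vminus_ln2_o64_hi are zero:
      // since |n| <= 126 * 64 = 8064 < 2**13 for in-range inputs, the product n * vminus_ln2_o64_hi is then
      // exact in binary32, and the low part only has to recover the discarded tail of log(2) / 64
      // (-0x1.630000p-7f + 0x1.BD0106p-19f ~= -0.0108304 ~= -log(2) / 64).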
      $for N in range(ELEMENTS_TILE):
        float vt${N} = vn${N} * vminus_ln2_o64_hi + vx${N};

      $for N in range(ELEMENTS_TILE):
        vt${N} = vn${N} * vminus_ln2_o64_lo + vt${N};

      // Compute degree-2 polynomial approximation for exp(t) on [-log(2)/128, log(2)/128].
      $for N in range(ELEMENTS_TILE):
        float vp${N} = vt${N} * vc2;

      $for N in range(ELEMENTS_TILE):
        vp${N} = vp${N} * vt${N} + vt${N};

      // Reconstruct the final f value:
      //   f = s * (1 + t * (1 + t * c2))
      //     = s * (1 + t + t * (t * c2))
      //     = s + s * (t + t * (t * c2))
      //     = s + s * p
      $for N in range(ELEMENTS_TILE):
        float vf${N} = vp${N} * vs${N} + vs${N};

      // For inputs below the denormal cutoff, replace the output with +0.0f.
      // Note that for NaN inputs the comparison result is false, and the outputs are left unchanged.
      $for N in range(ELEMENTS_TILE):
        if XNN_UNPREDICTABLE(vx${N} < vdenorm_cutoff) {
          vf${N} = 0.0f;
        }

      // Store ${ELEMENTS_TILE} outputs at a time.
      $for N in range(ELEMENTS_TILE):
        output[${N}] = vf${N};
      output += ${ELEMENTS_TILE};

      // Accumulate computed exponents.
      $for N in range(ELEMENTS_TILE):
        vacc${N % ACCUMULATORS} += vf${N};
    }
    $if ACCUMULATORS > 1:
      // Add up all accumulators to vacc0.
      $ACC_SLICE = 1
      $while ACC_SLICE < ACCUMULATORS:
        $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
          $if A + ACC_SLICE < ACCUMULATORS:
            vacc${A} += vacc${A + ACC_SLICE};
        $ACC_SLICE *= 2

    float vacc = vacc0;
  $else:
    float vacc = 0.0f;
  for (; elements >= sizeof(float); elements -= sizeof(float)) {
    // Load 1 input at a time.
    const float vi = *input++;

    // Subtract maximum input x := i - i_max. This implies x <= 0.
    const float vx = vi - vi_max;

    // Compute reduced argument n := round(x * 64 / log(2)).
    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
    // The trick with adding a large number is valid only within certain bounds (|x * 64 / log(2)| <= 2**22, i.e.
    // |x| <= 0x1.62E43p+15 = 45426.09375), but that is acceptable, because inputs outside of [-87.336540, 0.0]
    // result in a denormalized or underflowed expf(x). We fix up the result for such inputs at the very end of the
    // algorithm.
    float vn = vx * vlog2e_x64 + vmagic_bias;

    // Create a floating-point number s (scale) such that s := 2**(n / 64) for inputs for which expf(x) is normalized,
    // i.e. -87.33642 <= x <= 0.0. As n has 6 fractional bits, we split s == 2**(n / 64) = 2**e * 2**(n / 64 - e), where
    // e := int(n / 64). We create s in two steps:
    // 1. Fetch 2**(n / 64 - e) = 2**(n % 64) from the table using the 6 low bits of n, as integer. Note that the
    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
    // 2. Adjust the fetched value by adding e to its floating-point exponent. The result is always a normalized
    //    number, because for -87.33642 <= x <= 0.0 (inputs for which expf(x) is normalized) we have -126 <= e <= 0,
    //    and thus the adjusted exponent is not lower than -126.
    //
    // Extract e from bits 6:14 of n and shift it into bits 23:31 (position of floating-point exponent).
    const uint32_t ve = (fp32_to_bits(vn) & UINT32_C(0xFFFFFFC0)) << 17;

    // Use bits 0:6 of n, as integer, as an index for table lookup of l := 2**(n % 64).
    const uint32_t vidx = fp32_to_bits(vn) & vindex_mask;
    // Adjust the exponent of the value l fetched from the table to get the final s value.
    const float vs = fp32_from_bits(xnn_table_exp2_k_over_64[vidx] + ve);

    // Subtract the large number back to get the final n := round(x * 64 / log(2)) as a floating-point number.
    vn -= vmagic_bias;

    // Compute reduced argument t := x - n * log(2) / 64.
    // Use the Cody-Waite range reduction method (note the two constants representing log(2) / 64) to improve accuracy.
    float vt = vn * vminus_ln2_o64_hi + vx;
    vt = vn * vminus_ln2_o64_lo + vt;

    // Compute degree-2 polynomial approximation for exp(t) on [-log(2)/128, log(2)/128].
    float vp = vt * vc2;
    vp = vp * vt + vt;

    // Reconstruct the final f value:
    //   f = s * (1 + t * (1 + t * c2))
    //     = s * (1 + t + t * (t * c2))
    //     = s + s * (t + t * (t * c2))
    //     = s + s * p
    float vf = vp * vs + vs;

    // For inputs below the denormal cutoff, replace the output with +0.0f.
    // Note that for NaN inputs the comparison result is false, and the output is left unchanged.
    if XNN_UNPREDICTABLE(vx < vdenorm_cutoff) {
      vf = 0.0f;
    }

    // Store 1 output at a time.
    *output++ = vf;

    // Accumulate computed exponents.
    vacc += vf;
  }
  *sum = vacc;
}
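
// Usage sketch (illustrative): a caller is expected to pass the maximum of the input batch as vi_max
// (e.g. computed by an f32 rmax ukernel). This ukernel then writes exp(input[i] - vi_max) to output[]
// and the sum of those exponentials to *sum; scaling output[] by 1.0f / *sum afterwards completes a
// numerically stable softmax.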