1// Copyright 2019 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6$assert BATCH_TILE >= 1
7$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
8#include <assert.h>
9#include <math.h>
10
11#include <xnnpack/common.h>
12#include <xnnpack/vunary.h>
13
14#include <fp16/bitcasts.h>
15
16
17// Note redefine as uint32[] to avoid redundant bitcasts.
18extern XNN_INTERNAL const uint32_t xnn_table_exp2minus_k_over_64[64];
19
20void xnn_f32_sigmoid_ukernel__scalar_lut64_p2_div_x${BATCH_TILE}(
21    size_t n,
22    const float* x,
23    float* y,
24    const void* params)
25{
26  assert(n % sizeof(float) == 0);
27
28  const float vmagic_bias = 0x1.800000p17f;
29  const float vminus_log2e = -0x1.715476p0f;
30  const uint32_t vindex_mask = UINT32_C(0x3F);
31  const float vln2_hi =  0x1.630000p-1f;
32  const float vln2_lo = -0x1.BD0106p-13f;
33  const float vc2 = 0x1.FFFF0Ap-2f;
34  const float vone = 1.0f;
35  const float vdenorm_cutoff = 0x1.5D589Ep+6f;
36
37  $if BATCH_TILE > 1:
38    for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {
39      $for N in range(BATCH_TILE):
40        const float vx${N} = x[${N}];
41      x += ${BATCH_TILE};
42
43      $for N in range(BATCH_TILE):
44        const float vz${N} = fabsf(vx${N});
45
46      $for N in range(BATCH_TILE):
47        float vn${N} = vz${N} * vminus_log2e + vmagic_bias;
48
49      $for N in range(BATCH_TILE):
50        const uint32_t ve${N} = fp32_to_bits(vn${N}) << 17;
51
52      $for N in range(BATCH_TILE):
53        const uint32_t vidx${N} = fp32_to_bits(vn${N}) & vindex_mask;
54        const float vs${N} = fp32_from_bits(xnn_table_exp2minus_k_over_64[vidx${N}] + ve${N});
55
56      $for N in range(BATCH_TILE):
57        vn${N} -= vmagic_bias;
58
59      $for N in range(BATCH_TILE):
60        float vt${N} = vn${N} * vln2_hi + vz${N};
61
62      $for N in range(BATCH_TILE):
63        vt${N} = vn${N} * vln2_lo + vt${N};
64
65      $for N in range(BATCH_TILE):
66        float vp${N} = vt${N} * vc2;
67
68      $for N in range(BATCH_TILE):
69        vp${N} = vt${N} - vp${N} * vt${N};
70
71      $for N in range(BATCH_TILE):
72        const float vy${N} = vs${N} - vs${N} * vp${N};
73
74      $for N in range(BATCH_TILE):
75        const float vd${N} = vy${N} + vone;
76
77      $for N in range(BATCH_TILE):
78        float vf${N} = vy${N} / vd${N};
79
80      $for N in range(BATCH_TILE):
81        if XNN_UNPREDICTABLE(vz${N} > vdenorm_cutoff) {
82          vf${N} = 0.0f;
83        }
84
85      $for N in range(BATCH_TILE):
86        if XNN_UNPREDICTABLE(vx${N} > 0.0f) {
87          vf${N} = vone - vf${N};
88        }
89
90      $for N in range(BATCH_TILE):
91        y[${N}] = vf${N};
92      y += ${BATCH_TILE};
93    }
94  $if BATCH_TILE == 1:
95    do {
96      const float vx = *x++;
97
98      const float vz = fabsf(vx);
99
100      float vn = vz * vminus_log2e + vmagic_bias;
101      const uint32_t ve = fp32_to_bits(vn) << 17;
102      const uint32_t vidx = fp32_to_bits(vn) & vindex_mask;
103      const float vs = fp32_from_bits(xnn_table_exp2minus_k_over_64[vidx] + ve);
104      vn -= vmagic_bias;
105
106      float vt = vn * vln2_hi + vz;
107      vt = vn * vln2_lo + vt;
108
109      float vp = vt * vc2;
110      vp = vt - vp * vt;
111
112      const float vy = vs - vs * vp;
113      const float vd = vy + vone;
114
115      float vf = vy / vd;
116      if XNN_UNPREDICTABLE(vz > vdenorm_cutoff) {
117        vf = 0.0f;
118      }
119      if XNN_UNPREDICTABLE(vx > 0.0f) {
120        vf = vone - vf;
121      }
122
123      *y++ = vf;
124
125      n -= sizeof(float);
126    } while (n != 0);
127  $elif BATCH_TILE == 2:
128    if XNN_UNLIKELY(n != 0) {
129      const float vx = *x;
130
131      const float vz = fabsf(vx);
132
133      float vn = vz * vminus_log2e + vmagic_bias;
134      const uint32_t ve = fp32_to_bits(vn) << 17;
135      const uint32_t vidx = fp32_to_bits(vn) & vindex_mask;
136      const float vs = fp32_from_bits(xnn_table_exp2minus_k_over_64[vidx] + ve);
137      vn -= vmagic_bias;
138
139      float vt = vn * vln2_hi + vz;
140      vt = vn * vln2_lo + vt;
141
142      float vp = vt * vc2;
143      vp = vt - vp * vt;
144
145      const float vy = vs - vs * vp;
146      const float vd = vy + vone;
147
148      float vf = vy / vd;
149      if XNN_UNPREDICTABLE(vz > vdenorm_cutoff) {
150        vf = 0.0f;
151      }
152      if XNN_UNPREDICTABLE(vx > 0.0f) {
153        vf = vone - vf;
154      }
155
156      *y = vf;
157    }
158  $else:
159    if XNN_UNLIKELY(n != 0) {
160      do {
161        const float vx = *x++;
162
163        const float vz = fabsf(vx);
164
165        float vn = vz * vminus_log2e + vmagic_bias;
166        const uint32_t ve = fp32_to_bits(vn) << 17;
167        const uint32_t vidx = fp32_to_bits(vn) & vindex_mask;
168        const float vs = fp32_from_bits(xnn_table_exp2minus_k_over_64[vidx] + ve);
169        vn -= vmagic_bias;
170
171        float vt = vn * vln2_hi + vz;
172        vt = vn * vln2_lo + vt;
173
174        float vp = vt * vc2;
175        vp = vt - vp * vt;
176
177        const float vy = vs - vs * vp;
178        const float vd = vy + vone;
179
180        float vf = vy / vd;
181        if XNN_UNPREDICTABLE(vz > vdenorm_cutoff) {
182          vf = 0.0f;
183        }
184        if XNN_UNPREDICTABLE(vx > 0.0f) {
185          vf = vone - vf;
186        }
187
188        *y++ = vf;
189
190        n -= sizeof(float);
191      } while (n != 0);
192    }
193}
194