1// Copyright 2020 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6$assert CHANNEL_TILE % 8 == 0
7$assert CHANNEL_TILE >= 8
8$assert ROW_TILE >= 2
9$assert ACCUMULATORS >= 1
10$assert ROW_TILE >= ACCUMULATORS * 2
11$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
12#include <assert.h>
13
14#include <wasm_simd128.h>
15
16#include <xnnpack/gavgpool.h>
17
18
// Template for a QS8 global-average-pooling (gavgpool) micro-kernel built on
// WebAssembly SIMD intrinsics. A single call reduces all rows.
//
// For every channel the generated kernel sums the int8 values of up to
// ROW_TILE rows, adds a bias, rescales the sum with a fixed-point
// multiply / add-rounding / right-shift, adds the output zero point, clamps
// the result to [output_min, output_max] and stores one int8 value.
//
// Template parameters (validated by the asserts at the top of the file):
//   ROW_TILE      - maximum number of rows handled by one call.
//   CHANNEL_TILE  - channels processed per main-loop iteration (multiple of 8).
//   ACCUMULATORS  - number of independent 16-bit partial sums kept per
//                   8-channel group.
//
// Arguments:
//   rows          - number of valid input rows; must be in [1, ROW_TILE].
//   channels      - number of channels to produce; must be non-zero.
//   input         - pointer to the first input row.
//   input_stride  - distance in bytes between consecutive rows.
//   zero          - pointer substituted for every row whose index is >= rows
//                   (presumably a zero-filled buffer - confirm with the caller).
//   output        - receives one int8 result per channel.
//   params        - supplies bias, multiplier, rounding, shift, output zero
//                   point and output min/max through its wasmsimd member.
//
// NOTE: the remainder path always loads 8 bytes from every row, even when fewer
// than 8 channels remain, so it can read past the last valid channel of a row.
19void xnn_qs8_gavgpool_minmax_ukernel_${ROW_TILE}x__wasmsimd_c${CHANNEL_TILE}${"" if ACCUMULATORS == 1 else "_acc%d" % ACCUMULATORS}(
20    size_t rows,
21    size_t channels,
22    const int8_t* input,
23    size_t input_stride,
24    const int8_t* zero,
25    int8_t* output,
26    const union xnn_qs8_avgpool_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
27{
28  assert(rows != 0);
29  assert(rows <= ${ROW_TILE});
30  assert(channels != 0);
31
  // One pointer per row. Row M starts input_stride bytes after row M-1; rows at
  // index >= rows are redirected to `zero`, so every row pointer can be loaded
  // from unconditionally. The odd/even branches below test the same condition
  // (rows < M+1 is rows <= M), only spelled differently.
32  const int8_t* i0 = input;
33  $for M in range(1, ROW_TILE):
34    const int8_t* i${M} = (const int8_t*) ((uintptr_t) i${M-1} + input_stride);
35    $if M % 2 == 1:
36      if XNN_UNPREDICTABLE(rows < ${M+1}) {
37        i${M} = zero;
38      }
39    $else:
40      if XNN_UNPREDICTABLE(rows <= ${M}) {
41        i${M} = zero;
42      }
43
  // Requantization constants. vshift is a scalar shift count; vzero is an
  // all-zero vector used below to zero-extend 32-bit lanes to 64 bits.
44  const v128_t vbias = wasm_v128_load(params->wasmsimd.bias);
45  const v128_t vmultiplier = wasm_v128_load(params->wasmsimd.multiplier);
46  const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding);
47  const int32_t vshift = params->wasmsimd.shift;
48  const v128_t vzero = wasm_f64x2_splat(0.0);
  // Main loop: CHANNEL_TILE channels per iteration.
49  while (channels >= ${CHANNEL_TILE}) {
    // Load every row in groups of 8 elements into 16-bit lanes and advance the
    // row pointers by CHANNEL_TILE.
50    $for M in range(ROW_TILE):
51      const v128_t vxi${M}x${ABC[0:8]} = wasm_i16x8_load_8x8(i${M});
52      $for C in range(8, CHANNEL_TILE, 8):
53        const v128_t vxi${M}x${ABC[C:C+8]} = wasm_i16x8_load_8x8(i${M} + ${C});
54      i${M} += ${CHANNEL_TILE};
55
    // Seed accumulator A with the sum of rows 2*A and 2*A+1.
56    $for A in range(ACCUMULATORS):
57      $for C in range(0, CHANNEL_TILE, 8):
58        v128_t vacc${A}x${ABC[C:C+8]} = wasm_i16x8_add(vxi${A*2}x${ABC[C:C+8]}, vxi${A*2+1}x${ABC[C:C+8]});
59
    // Fold the remaining rows into the accumulators in round-robin order.
60    $for M in range(ACCUMULATORS * 2, ROW_TILE):
61      $for C in range(0, CHANNEL_TILE, 8):
62        vacc${M % ACCUMULATORS}x${ABC[C:C+8]} = wasm_i16x8_add(vacc${M % ACCUMULATORS}x${ABC[C:C+8]}, vxi${M}x${ABC[C:C+8]});
63
64    $if ACCUMULATORS > 1:
65      // Add up all accumulators to vacc0x${ABC[0:CHANNEL_TILE]}
66      $ACC_SLICE = 1
67      $while ACC_SLICE < ACCUMULATORS:
68        $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
69          $if A + ACC_SLICE < ACCUMULATORS:
70            $for C in range(0, CHANNEL_TILE, 8):
71              vacc${A}x${ABC[C:C+8]} = wasm_i16x8_add(vacc${A}x${ABC[C:C+8]}, vacc${A + ACC_SLICE}x${ABC[C:C+8]});
72        $ACC_SLICE *= 2
73
    // Widen the 16-bit row sums to 32-bit lanes and add the bias.
74    $for C in range(0, CHANNEL_TILE, 8):
75      const v128_t vacc${ABC[C:C+4]} = wasm_i32x4_add(vbias, wasm_i32x4_widen_low_i16x8(vacc0x${ABC[C:C+8]}));
76      const v128_t vacc${ABC[C+4:C+8]} = wasm_i32x4_add(vbias, wasm_i32x4_widen_high_i16x8(vacc0x${ABC[C:C+8]}));
77
    // Split each value into magnitude and sign. The sign mask is all-ones in
    // lanes where |acc| > acc, i.e. where acc is negative.
78    $for C in range(0, CHANNEL_TILE, 4):
79      const v128_t vabsacc${ABC[C:C+4]} = wasm_i32x4_abs(vacc${ABC[C:C+4]});
80
81    $for C in range(0, CHANNEL_TILE, 4):
82      const v128_t vsgnacc${ABC[C:C+4]} = wasm_i32x4_gt(vabsacc${ABC[C:C+4]}, vacc${ABC[C:C+4]});
83
    // Zero-extend each 32-bit magnitude into its own 64-bit lane by
    // interleaving it with lanes taken from vzero.
84    $for C in range(0, CHANNEL_TILE, 4):
85      const v128_t vabsacc${ABC[C:C+2]} = wasm_v32x4_shuffle(vabsacc${ABC[C:C+4]}, vzero, 0, 4, 1, 5);
86      const v128_t vabsacc${ABC[C+2:C+4]} = wasm_v32x4_shuffle(vabsacc${ABC[C:C+4]}, vzero, 2, 6, 3, 7);
87
    // 64-bit product of magnitude and multiplier.
88    $for C in range(0, CHANNEL_TILE, 4):
89      const v128_t vabsprod${ABC[C:C+2]} = wasm_i64x2_mul(vabsacc${ABC[C:C+2]}, vmultiplier);
90      const v128_t vabsprod${ABC[C+2:C+4]} = wasm_i64x2_mul(vabsacc${ABC[C+2:C+4]}, vmultiplier);
91
    // Add the rounding term, then shift right (unsigned) by vshift.
92    $for C in range(0, CHANNEL_TILE, 2):
93      const v128_t vabsout${ABC[C:C+2]} = wasm_u64x2_shr(wasm_i64x2_add(vabsprod${ABC[C:C+2]}, vrounding), vshift);
94
    // Keep the low 32 bits of every 64-bit result, repacking four per vector.
95    $for C in range(0, CHANNEL_TILE, 4):
96      const v128_t vabsout${ABC[C:C+4]} = wasm_v32x4_shuffle(vabsout${ABC[C:C+2]}, vabsout${ABC[C+2:C+4]}, 0, 2, 4, 6);
97
    // Re-apply the sign: (x ^ mask) - mask negates x where mask is all-ones and
    // leaves it unchanged where mask is zero.
98    $for C in range(0, CHANNEL_TILE, 4):
99      const v128_t vout${ABC[C:C+4]} = wasm_i32x4_sub(wasm_v128_xor(vabsout${ABC[C:C+4]}, vsgnacc${ABC[C:C+4]}), vsgnacc${ABC[C:C+4]});
100
    // Narrow to 16-bit lanes and add the output zero point with saturation.
101    const v128_t voutput_zero_point = wasm_v128_load(params->wasmsimd.output_zero_point);
102    $for C in range(0, CHANNEL_TILE, 8):
103      const v128_t vout${ABC[C:C+8]} = wasm_i16x8_add_saturate(wasm_i16x8_narrow_i32x4(vout${ABC[C:C+4]}, vout${ABC[C+4:C+8]}), voutput_zero_point);
104
    // Narrow to 8-bit lanes and clamp to [output_min, output_max]. A lone
    // 8-channel group is narrowed against itself; only its low 8 bytes are
    // stored afterwards.
105    const v128_t voutput_min = wasm_v128_load(params->wasmsimd.output_min);
106    const v128_t voutput_max = wasm_v128_load(params->wasmsimd.output_max);
107    $for C in range(0, CHANNEL_TILE, 16):
108      $if C + 8 < CHANNEL_TILE:
109        const v128_t vout${ABC[C:C+16]} = wasm_i8x16_min(wasm_i8x16_max(wasm_i8x16_narrow_i16x8(vout${ABC[C:C+8]}, vout${ABC[C+8:C+16]}), voutput_min), voutput_max);
110      $else:
111        const v128_t vout${ABC[C:C+8]}${ABC[C:C+8]} = wasm_i8x16_min(wasm_i8x16_max(wasm_i8x16_narrow_i16x8(vout${ABC[C:C+8]}, vout${ABC[C:C+8]}), voutput_min), voutput_max);
112
    // Store the results: 16 bytes at a time, or the low 8 bytes of the vector
    // (written through a double-typed lvalue) for an 8-channel group.
113    $if CHANNEL_TILE > 8:
114      wasm_v128_store(output, vout${ABC[0:16]});
115    $else:
116      *((double*) output) = wasm_f64x2_extract_lane(vout${ABC[0:8]}${ABC[0:8]}, 0);
117    $for C in range(16, CHANNEL_TILE, 16):
118      $if C + 8 < CHANNEL_TILE:
119        wasm_v128_store(output + ${C}, vout${ABC[C:C+16]});
120      $else:
121        *((double*) (output + ${C})) = wasm_f64x2_extract_lane(vout${ABC[C:C+8]}${ABC[C:C+8]}, 0);
122    output += ${CHANNEL_TILE};
123
124    channels -= ${CHANNEL_TILE};
125  }
  // Remainder: fewer than CHANNEL_TILE channels are left. They are processed in
  // groups of 8 with the same arithmetic as the main loop; the block only loops
  // when CHANNEL_TILE > 8, otherwise it runs once.
126  if XNN_UNLIKELY(channels != 0) {
127    ${"do " if CHANNEL_TILE > 8 else ""}{
      // Always loads a full group of 8 elements from each row, regardless of
      // how many channels remain.
128      $for M in range(ROW_TILE):
129        const v128_t vxi${M}x${ABC[0:8]} = wasm_i16x8_load_8x8(i${M});
130        i${M} += 8;
131
      // Row reduction: seed the accumulators pairwise, then fold in the
      // remaining rows round-robin.
132      $for A in range(ACCUMULATORS):
133        v128_t vacc${A}x${ABC[0:8]} = wasm_i16x8_add(vxi${A*2}x${ABC[0:8]}, vxi${A*2+1}x${ABC[0:8]});
134
135      $for M in range(ACCUMULATORS * 2, ROW_TILE):
136        vacc${M % ACCUMULATORS}x${ABC[0:8]} = wasm_i16x8_add(vacc${M % ACCUMULATORS}x${ABC[0:8]}, vxi${M}x${ABC[0:8]});
137
138      $if ACCUMULATORS > 1:
139        // Add up all accumulators to vacc0x${ABC[0:8]}
140        $ACC_SLICE = 1
141        $while ACC_SLICE < ACCUMULATORS:
142          $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
143            $if A + ACC_SLICE < ACCUMULATORS:
144              vacc${A}x${ABC[0:8]} = wasm_i16x8_add(vacc${A}x${ABC[0:8]}, vacc${A + ACC_SLICE}x${ABC[0:8]});
145          $ACC_SLICE *= 2
146
      // Widen to 32-bit lanes and add the bias.
147      const v128_t vacc${ABC[0:4]} = wasm_i32x4_add(vbias, wasm_i32x4_widen_low_i16x8(vacc0x${ABC[0:8]}));
148      const v128_t vacc${ABC[4:8]} = wasm_i32x4_add(vbias, wasm_i32x4_widen_high_i16x8(vacc0x${ABC[0:8]}));
149
      // Magnitude and sign mask (all-ones where the accumulator is negative).
150      const v128_t vabsacc${ABC[0:4]} = wasm_i32x4_abs(vacc${ABC[0:4]});
151      const v128_t vabsacc${ABC[4:8]} = wasm_i32x4_abs(vacc${ABC[4:8]});
152
153      const v128_t vsgnacc${ABC[0:4]} = wasm_i32x4_gt(vabsacc${ABC[0:4]}, vacc${ABC[0:4]});
154      const v128_t vsgnacc${ABC[4:8]} = wasm_i32x4_gt(vabsacc${ABC[4:8]}, vacc${ABC[4:8]});
155
      // Zero-extend the 32-bit magnitudes into 64-bit lanes.
156      const v128_t vabsacc${ABC[0:2]} = wasm_v32x4_shuffle(vabsacc${ABC[0:4]}, vzero, 0, 4, 1, 5);
157      const v128_t vabsacc${ABC[2:4]} = wasm_v32x4_shuffle(vabsacc${ABC[0:4]}, vzero, 2, 6, 3, 7);
158      const v128_t vabsacc${ABC[4:6]} = wasm_v32x4_shuffle(vabsacc${ABC[4:8]}, vzero, 0, 4, 1, 5);
159      const v128_t vabsacc${ABC[6:8]} = wasm_v32x4_shuffle(vabsacc${ABC[4:8]}, vzero, 2, 6, 3, 7);
160
      // Multiply by the multiplier, add rounding, shift right by vshift.
161      const v128_t vabsprod${ABC[0:2]} = wasm_i64x2_mul(vabsacc${ABC[0:2]}, vmultiplier);
162      const v128_t vabsprod${ABC[2:4]} = wasm_i64x2_mul(vabsacc${ABC[2:4]}, vmultiplier);
163      const v128_t vabsprod${ABC[4:6]} = wasm_i64x2_mul(vabsacc${ABC[4:6]}, vmultiplier);
164      const v128_t vabsprod${ABC[6:8]} = wasm_i64x2_mul(vabsacc${ABC[6:8]}, vmultiplier);
165
166      const v128_t vabsout${ABC[0:2]} = wasm_u64x2_shr(wasm_i64x2_add(vabsprod${ABC[0:2]}, vrounding), vshift);
167      const v128_t vabsout${ABC[2:4]} = wasm_u64x2_shr(wasm_i64x2_add(vabsprod${ABC[2:4]}, vrounding), vshift);
168      const v128_t vabsout${ABC[4:6]} = wasm_u64x2_shr(wasm_i64x2_add(vabsprod${ABC[4:6]}, vrounding), vshift);
169      const v128_t vabsout${ABC[6:8]} = wasm_u64x2_shr(wasm_i64x2_add(vabsprod${ABC[6:8]}, vrounding), vshift);
170
      // Repack the low 32 bits of each 64-bit result, then restore the sign.
171      const v128_t vabsout${ABC[0:4]} = wasm_v32x4_shuffle(vabsout${ABC[0:2]}, vabsout${ABC[2:4]}, 0, 2, 4, 6);
172      const v128_t vabsout${ABC[4:8]} = wasm_v32x4_shuffle(vabsout${ABC[4:6]}, vabsout${ABC[6:8]}, 0, 2, 4, 6);
173
174      const v128_t vout${ABC[0:4]} = wasm_i32x4_sub(wasm_v128_xor(vabsout${ABC[0:4]}, vsgnacc${ABC[0:4]}), vsgnacc${ABC[0:4]});
175      const v128_t vout${ABC[4:8]} = wasm_i32x4_sub(wasm_v128_xor(vabsout${ABC[4:8]}, vsgnacc${ABC[4:8]}), vsgnacc${ABC[4:8]});
176
      // Narrow to 16 bits, add the output zero point, narrow to 8 bits, clamp.
177      const v128_t voutput_zero_point = wasm_v128_load(params->wasmsimd.output_zero_point);
178      const v128_t vout${ABC[0:8]} = wasm_i16x8_add_saturate(wasm_i16x8_narrow_i32x4(vout${ABC[0:4]}, vout${ABC[4:8]}), voutput_zero_point);
179
180      const v128_t voutput_min = wasm_v128_load(params->wasmsimd.output_min);
181      const v128_t voutput_max = wasm_v128_load(params->wasmsimd.output_max);
182      v128_t vout${ABC[0:8]}${ABC[0:8]} = wasm_i8x16_min(wasm_i8x16_max(wasm_i8x16_narrow_i16x8(vout${ABC[0:8]}, vout${ABC[0:8]}), voutput_min), voutput_max);
183
      // Store either a full group of 8 bytes or the final 4/2/1 bytes. After
      // each partial store the consumed bytes are shifted out so the next
      // piece sits in lane 0. Stores go through float/double/uint16_t-typed
      // lvalues on the int8_t output pointer.
184      $if CHANNEL_TILE > 8:
185        if XNN_LIKELY(channels >= 8) {
186          *((double*) output) = wasm_f64x2_extract_lane(vout${ABC[0:8]}${ABC[0:8]}, 0);
187          output += 8;
188          channels -= 8;
189        } else {
190          if (channels & 4) {
191            *((float*) output) = wasm_f32x4_extract_lane(vout${ABC[0:8]}${ABC[0:8]}, 0);
192            vout${ABC[0:8]}${ABC[0:8]} = wasm_u64x2_shr(vout${ABC[0:8]}${ABC[0:8]}, 32);
193            output += 4;
194          }
195          if (channels & 2) {
196            *((uint16_t*) output) = (uint16_t) wasm_i16x8_extract_lane(vout${ABC[0:8]}${ABC[0:8]}, 0);
197            vout${ABC[0:8]}${ABC[0:8]} = wasm_u32x4_shr(vout${ABC[0:8]}${ABC[0:8]}, 16);
198            output += 2;
199          }
200          if (channels & 1) {
201            *output = (int8_t) wasm_i8x16_extract_lane(vout${ABC[0:8]}${ABC[0:8]}, 0);
202            output += 1;
203          }
204          channels = 0;
205        }
206      $else:
207        if (channels & 4) {
208          *((float*) output) = wasm_f32x4_extract_lane(vout${ABC[0:8]}${ABC[0:8]}, 0);
209          vout${ABC[0:8]}${ABC[0:8]} = wasm_u64x2_shr(vout${ABC[0:8]}${ABC[0:8]}, 32);
210          output += 4;
211        }
212        if (channels & 2) {
213          *((uint16_t*) output) = (uint16_t) wasm_i16x8_extract_lane(vout${ABC[0:8]}${ABC[0:8]}, 0);
214          vout${ABC[0:8]}${ABC[0:8]} = wasm_u32x4_shr(vout${ABC[0:8]}${ABC[0:8]}, 16);
215          output += 2;
216        }
217        if (channels & 1) {
218          *output = (int8_t) wasm_i8x16_extract_lane(vout${ABC[0:8]}${ABC[0:8]}, 0);
219        }
220    }${" while (channels != 0);" if CHANNEL_TILE > 8 else ""}
221  }
222}
223