// Auto-generated file. Do not edit!
// Template: src/f32-spmm/wasmsimd-pipelined.c.in
// Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <wasm_simd128.h>

#include <xnnpack/spmm.h>

xnn_f32_spmm_minmax_ukernel_16x1__wasmsimd_x86_pipelined_x2(size_t mc,size_t nc,const float * restrict input,const float * restrict weights,const int32_t * restrict widx_dmap,const uint32_t * restrict nidx_nnzmap,float * restrict output,size_t output_stride,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])17 void xnn_f32_spmm_minmax_ukernel_16x1__wasmsimd_x86_pipelined_x2(
18 size_t mc,
19 size_t nc,
20 const float*restrict input,
21 const float*restrict weights,
22 const int32_t*restrict widx_dmap,
23 const uint32_t*restrict nidx_nnzmap,
24 float*restrict output,
25 size_t output_stride,
26 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
27 {
28 assert(mc != 0);
29 assert(mc % sizeof(float) == 0);
30 assert(nc != 0);
31
32 const v128_t vmin = wasm_v32x4_load_splat(¶ms->scalar.min);
33 const v128_t vmax = wasm_v32x4_load_splat(¶ms->scalar.max);
34 size_t output_decrement = output_stride * nc - 16 * sizeof(float);
35 while XNN_LIKELY(mc >= 16 * sizeof(float)) {
36 const float*restrict w = weights;
37 const int32_t* dmap = widx_dmap;
38 const uint32_t* nnzmap = nidx_nnzmap;
39 v128_t vw = wasm_v32x4_load_splat(w); w += 1;
40 intptr_t diff = *dmap++;
41 v128_t vi0123 = wasm_v128_load(input + 0);
42 v128_t vi4567 = wasm_v128_load(input + 4);
43 v128_t vi89AB = wasm_v128_load(input + 8);
44 v128_t viCDEF = wasm_v128_load(input + 12);
45 size_t n = nc;
46 do {
47 uint32_t nnz = *nnzmap++;
48 v128_t vacc0123 = vw;
49 v128_t vacc4567 = vw;
50 v128_t vacc89AB = vw;
51 v128_t vaccCDEF = vw;
52 vw = wasm_v32x4_load_splat(w); w += 1;
53
54 for (; nnz >= 2; nnz -= 2) {
55 vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
56 vacc4567 = wasm_f32x4_add(vacc4567, wasm_f32x4_mul(vi4567, vw));
57 vacc89AB = wasm_f32x4_add(vacc89AB, wasm_f32x4_mul(vi89AB, vw));
58 vaccCDEF = wasm_f32x4_add(vaccCDEF, wasm_f32x4_mul(viCDEF, vw));
59 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
60 diff = *dmap++;
61 vw = wasm_v32x4_load_splat(w); w += 1;
62 vi0123 = wasm_v128_load(input + 0);
63 vi4567 = wasm_v128_load(input + 4);
64 vi89AB = wasm_v128_load(input + 8);
65 viCDEF = wasm_v128_load(input + 12);
66 vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
67 vacc4567 = wasm_f32x4_add(vacc4567, wasm_f32x4_mul(vi4567, vw));
68 vacc89AB = wasm_f32x4_add(vacc89AB, wasm_f32x4_mul(vi89AB, vw));
69 vaccCDEF = wasm_f32x4_add(vaccCDEF, wasm_f32x4_mul(viCDEF, vw));
70 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
71 diff = *dmap++;
72 vw = wasm_v32x4_load_splat(w); w += 1;
73 vi0123 = wasm_v128_load(input + 0);
74 vi4567 = wasm_v128_load(input + 4);
75 vi89AB = wasm_v128_load(input + 8);
76 viCDEF = wasm_v128_load(input + 12);
77 }
78
79 if XNN_LIKELY(nnz != 0) {
80 do {
81 vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
82 vacc4567 = wasm_f32x4_add(vacc4567, wasm_f32x4_mul(vi4567, vw));
83 vacc89AB = wasm_f32x4_add(vacc89AB, wasm_f32x4_mul(vi89AB, vw));
84 vaccCDEF = wasm_f32x4_add(vaccCDEF, wasm_f32x4_mul(viCDEF, vw));
85 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
86
87 diff = *dmap++;
88 vw = wasm_v32x4_load_splat(w); w += 1;
89 vi0123 = wasm_v128_load(input + 0);
90 vi4567 = wasm_v128_load(input + 4);
91 vi89AB = wasm_v128_load(input + 8);
92 viCDEF = wasm_v128_load(input + 12);
93 } while (--nnz != 0);
94 }
95 v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
96 v128_t vout4567 = wasm_v128_bitselect(vacc4567, vmax, wasm_f32x4_le(vacc4567, vmax));
97 v128_t vout89AB = wasm_v128_bitselect(vacc89AB, vmax, wasm_f32x4_le(vacc89AB, vmax));
98 v128_t voutCDEF = wasm_v128_bitselect(vaccCDEF, vmax, wasm_f32x4_le(vaccCDEF, vmax));
99 vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
100 vout4567 = wasm_v128_bitselect(vmin, vout4567, wasm_f32x4_lt(vout4567, vmin));
101 vout89AB = wasm_v128_bitselect(vmin, vout89AB, wasm_f32x4_lt(vout89AB, vmin));
102 voutCDEF = wasm_v128_bitselect(vmin, voutCDEF, wasm_f32x4_lt(voutCDEF, vmin));
103 wasm_v128_store(output, vout0123);
104 wasm_v128_store(output + 4, vout4567);
105 wasm_v128_store(output + 8, vout89AB);
106 wasm_v128_store(output + 12, voutCDEF);
107 output = (float*restrict) ((uintptr_t) output + output_stride);
108 } while (--n != 0);
109 output = (float*restrict) ((uintptr_t) output - output_decrement);
110 input += 16;
111 mc -= 16 * sizeof(float);
112 }
113 if XNN_UNLIKELY(mc != 0) {
114 output_decrement += 8 * sizeof(float);
115 if (mc & (8 * sizeof(float))) {
116 const float*restrict w = weights;
117 const int32_t* dmap = widx_dmap;
118 const uint32_t* nnzmap = nidx_nnzmap;
119 size_t n = nc;
120 do {
121 uint32_t nnz = *nnzmap++;
122 v128_t vacc0123 = wasm_v32x4_load_splat(w); w += 1;
123 v128_t vacc4567 = vacc0123;
124 if XNN_LIKELY(nnz != 0) {
125 do {
126 const intptr_t diff = *dmap++;
127 const v128_t vi0123 = wasm_v128_load(input);
128 const v128_t vi4567 = wasm_v128_load(input + 4);
129 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
130 const v128_t vw = wasm_v32x4_load_splat(w); w += 1;
131 vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
132 vacc4567 = wasm_f32x4_add(vacc4567, wasm_f32x4_mul(vi4567, vw));
133 } while (--nnz != 0);
134 }
135 v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
136 v128_t vout4567 = wasm_v128_bitselect(vacc4567, vmax, wasm_f32x4_le(vacc4567, vmax));
137 vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
138 vout4567 = wasm_v128_bitselect(vmin, vout4567, wasm_f32x4_lt(vout4567, vmin));
139 wasm_v128_store(output, vout0123);
140
141 wasm_v128_store(output + 4, vout4567);
142 output = (float*restrict) ((uintptr_t) output + output_stride);
143 } while (--n != 0);
144 output = (float*restrict) ((uintptr_t) output - output_decrement);
145 input += 8;
146 }
147 output_decrement += 4 * sizeof(float);
148 if (mc & (4 * sizeof(float))) {
149 const float*restrict w = weights;
150 const int32_t* dmap = widx_dmap;
151 const uint32_t* nnzmap = nidx_nnzmap;
152 size_t n = nc;
153 do {
154 uint32_t nnz = *nnzmap++;
155 v128_t vacc0123 = wasm_v32x4_load_splat(w); w += 1;
156 if XNN_LIKELY(nnz != 0) {
157 do {
158 const intptr_t diff = *dmap++;
159 const v128_t vi0123 = wasm_v128_load(input);
160 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
161 const v128_t vw = wasm_v32x4_load_splat(w); w += 1;
162 vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
163 } while (--nnz != 0);
164 }
165 v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
166 vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
167 wasm_v128_store(output, vout0123);
168
169 output = (float*restrict) ((uintptr_t) output + output_stride);
170 } while (--n != 0);
171 output = (float*restrict) ((uintptr_t) output - output_decrement);
172 input += 4;
173 }
174 output_decrement += 2 * sizeof(float);
175 if (mc & (2 * sizeof(float))) {
176 const float*restrict w = weights;
177 const int32_t* dmap = widx_dmap;
178 const uint32_t* nnzmap = nidx_nnzmap;
179 size_t n = nc;
180 do {
181 uint32_t nnz = *nnzmap++;
182 v128_t vacc01 = wasm_v32x4_load_splat(w); w += 1;
183 if XNN_LIKELY(nnz != 0) {
184 do {
185 const intptr_t diff = *dmap++;
186 const v128_t vi01 = wasm_v64x2_load_splat(input);
187 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
188 const v128_t vw = wasm_v32x4_load_splat(w); w += 1;
189 vacc01 = wasm_f32x4_add(vacc01, wasm_f32x4_mul(vi01, vw));
190 } while (--nnz != 0);
191 }
192 v128_t vout01 = wasm_v128_bitselect(vacc01, vmax, wasm_f32x4_le(vacc01, vmax));
193 vout01 = wasm_v128_bitselect(vmin, vout01, wasm_f32x4_lt(vout01, vmin));
194 *((double*) output) = wasm_f64x2_extract_lane(vout01, 0);
195
196 output = (float*restrict) ((uintptr_t) output + output_stride);
197 } while (--n != 0);
198 output = (float*restrict) ((uintptr_t) output - output_decrement);
199 input += 2;
200 }
201 output_decrement += 1 * sizeof(float);
202 if (mc & (1 * sizeof(float))) {
203 const float*restrict w = weights;
204 const int32_t* dmap = widx_dmap;
205 const uint32_t* nnzmap = nidx_nnzmap;
206 size_t n = nc;
207 do {
208 uint32_t nnz = *nnzmap++;
209 v128_t vacc0 = wasm_v32x4_load_splat(w); w += 1;
210 if XNN_LIKELY(nnz != 0) {
211 do {
212 const intptr_t diff = *dmap++;
213 const v128_t vi0 = wasm_v32x4_load_splat(input);
214 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
215 const v128_t vw = wasm_v32x4_load_splat(w); w += 1;
216 vacc0 = wasm_f32x4_add(vacc0, wasm_f32x4_mul(vi0, vw));
217 } while (--nnz != 0);
218 }
219 v128_t vout0 = wasm_v128_bitselect(vacc0, vmax, wasm_f32x4_le(vacc0, vmax));
220 vout0 = wasm_v128_bitselect(vmin, vout0, wasm_f32x4_lt(vout0, vmin));
221 *output = wasm_f32x4_extract_lane(vout0, 0);
222
223 output = (float*restrict) ((uintptr_t) output + output_stride);
224 } while (--n != 0);
225 output = (float*restrict) ((uintptr_t) output - output_decrement);
226 input += 1;
227 }
228 }
229 }