// Auto-generated file. Do not edit!
//   Template: src/f32-spmm/neon-blocked.c.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/spmm.h>

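// SpMM (sparse-matrix times dense-matrix) micro-kernel with min/max clamping.
// Weights are pre-packed per output channel as [bias, nonzero values...];
// widx_dmap holds signed byte deltas that advance the input pointer between
// successive nonzeros, and nidx_nnzmap holds the nonzero count per output
// channel (per block of 4 channels in the blocked loop below). mc is the size
// of the M (batch/pixel) dimension in bytes; nc is the number of output
// channels. A hypothetical invocation (buffer names are illustrative only):
//
//   xnn_f32_spmm_minmax_ukernel_16x4__neonfma(
//       /*mc=*/batch * sizeof(float), /*nc=*/channels,
//       input, packed_weights, input_deltas, nnz_per_channel,
//       output, /*output_stride=*/batch * sizeof(float), &params);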
void xnn_f32_spmm_minmax_ukernel_16x4__neonfma(
    size_t mc,
    size_t nc,
    const float*restrict input,
    const float*restrict weights,
    const int32_t*restrict widx_dmap,
    const uint32_t*restrict nidx_nnzmap,
    float*restrict output,
    size_t output_stride,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(mc != 0);
  assert(mc % sizeof(float) == 0);
  assert(nc != 0);

  const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
  const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
  size_t output_decrement = output_stride * nc - 16 * sizeof(float);
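  // Main loop: process 16 M-dimension elements per iteration, with the
  // N dimension handled 4 output channels at a time.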
  while XNN_LIKELY(mc >= 16 * sizeof(float)) {
    const float*restrict w = weights;
    const int32_t* dmap = widx_dmap;
    const uint32_t* nnzmap = nidx_nnzmap;
    size_t n = nc;
    while (n >= 4) {
      uint32_t nnz = *nnzmap++;
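      // Initialize the accumulators for all 4 output channels with the bias
      // values packed at the front of the weight block.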
      float32x4_t vacc0123n0 = vld1q_dup_f32(w); w += 1;
      float32x4_t vacc4567n0 = vacc0123n0;
      float32x4_t vacc89ABn0 = vacc0123n0;
      float32x4_t vaccCDEFn0 = vacc0123n0;
      float32x4_t vacc0123n1 = vld1q_dup_f32(w); w += 1;
      float32x4_t vacc4567n1 = vacc0123n1;
      float32x4_t vacc89ABn1 = vacc0123n1;
      float32x4_t vaccCDEFn1 = vacc0123n1;
      float32x4_t vacc0123n2 = vld1q_dup_f32(w); w += 1;
      float32x4_t vacc4567n2 = vacc0123n2;
      float32x4_t vacc89ABn2 = vacc0123n2;
      float32x4_t vaccCDEFn2 = vacc0123n2;
      float32x4_t vacc0123n3 = vld1q_dup_f32(w); w += 1;
      float32x4_t vacc4567n3 = vacc0123n3;
      float32x4_t vacc89ABn3 = vacc0123n3;
      float32x4_t vaccCDEFn3 = vacc0123n3;
      if XNN_LIKELY(nnz != 0) {
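        // Loop over the shared nonzeros: each dmap entry is a byte offset
        // that moves the input pointer to the row of the next nonzero.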
        do {
          const intptr_t diff = *dmap++;
          const float32x4_t vi0123 = vld1q_f32(input);
          const float32x4_t vi4567 = vld1q_f32(input + 4);
          const float32x4_t vi89AB = vld1q_f32(input + 8);
          const float32x4_t viCDEF = vld1q_f32(input + 12);
          input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
          __builtin_prefetch(input + 16);
          const float32x4_t vw = vld1q_f32(w); w += 4;
          __builtin_prefetch(w + 32);
          vacc0123n0 = vfmaq_laneq_f32(vacc0123n0, vi0123, vw, 0);
          vacc4567n0 = vfmaq_laneq_f32(vacc4567n0, vi4567, vw, 0);
          vacc89ABn0 = vfmaq_laneq_f32(vacc89ABn0, vi89AB, vw, 0);
          vaccCDEFn0 = vfmaq_laneq_f32(vaccCDEFn0, viCDEF, vw, 0);
          vacc0123n1 = vfmaq_laneq_f32(vacc0123n1, vi0123, vw, 1);
          vacc4567n1 = vfmaq_laneq_f32(vacc4567n1, vi4567, vw, 1);
          vacc89ABn1 = vfmaq_laneq_f32(vacc89ABn1, vi89AB, vw, 1);
          vaccCDEFn1 = vfmaq_laneq_f32(vaccCDEFn1, viCDEF, vw, 1);
          vacc0123n2 = vfmaq_laneq_f32(vacc0123n2, vi0123, vw, 2);
          vacc4567n2 = vfmaq_laneq_f32(vacc4567n2, vi4567, vw, 2);
          vacc89ABn2 = vfmaq_laneq_f32(vacc89ABn2, vi89AB, vw, 2);
          vaccCDEFn2 = vfmaq_laneq_f32(vaccCDEFn2, viCDEF, vw, 2);
          vacc0123n3 = vfmaq_laneq_f32(vacc0123n3, vi0123, vw, 3);
          vacc4567n3 = vfmaq_laneq_f32(vacc4567n3, vi4567, vw, 3);
          vacc89ABn3 = vfmaq_laneq_f32(vacc89ABn3, vi89AB, vw, 3);
          vaccCDEFn3 = vfmaq_laneq_f32(vaccCDEFn3, viCDEF, vw, 3);
        } while (--nnz != 0);
      }
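      // Clamp all accumulators to [min, max] before storing.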
      float32x4_t vout0123n0 = vminq_f32(vacc0123n0, vmax);
      float32x4_t vout4567n0 = vminq_f32(vacc4567n0, vmax);
      float32x4_t vout89ABn0 = vminq_f32(vacc89ABn0, vmax);
      float32x4_t voutCDEFn0 = vminq_f32(vaccCDEFn0, vmax);
      float32x4_t vout0123n1 = vminq_f32(vacc0123n1, vmax);
      float32x4_t vout4567n1 = vminq_f32(vacc4567n1, vmax);
      float32x4_t vout89ABn1 = vminq_f32(vacc89ABn1, vmax);
      float32x4_t voutCDEFn1 = vminq_f32(vaccCDEFn1, vmax);
      float32x4_t vout0123n2 = vminq_f32(vacc0123n2, vmax);
      float32x4_t vout4567n2 = vminq_f32(vacc4567n2, vmax);
      float32x4_t vout89ABn2 = vminq_f32(vacc89ABn2, vmax);
      float32x4_t voutCDEFn2 = vminq_f32(vaccCDEFn2, vmax);
      float32x4_t vout0123n3 = vminq_f32(vacc0123n3, vmax);
      float32x4_t vout4567n3 = vminq_f32(vacc4567n3, vmax);
      float32x4_t vout89ABn3 = vminq_f32(vacc89ABn3, vmax);
      float32x4_t voutCDEFn3 = vminq_f32(vaccCDEFn3, vmax);

      vout0123n0 = vmaxq_f32(vout0123n0, vmin);
      vout4567n0 = vmaxq_f32(vout4567n0, vmin);
      vout89ABn0 = vmaxq_f32(vout89ABn0, vmin);
      voutCDEFn0 = vmaxq_f32(voutCDEFn0, vmin);
      vout0123n1 = vmaxq_f32(vout0123n1, vmin);
      vout4567n1 = vmaxq_f32(vout4567n1, vmin);
      vout89ABn1 = vmaxq_f32(vout89ABn1, vmin);
      voutCDEFn1 = vmaxq_f32(voutCDEFn1, vmin);
      vout0123n2 = vmaxq_f32(vout0123n2, vmin);
      vout4567n2 = vmaxq_f32(vout4567n2, vmin);
      vout89ABn2 = vmaxq_f32(vout89ABn2, vmin);
      voutCDEFn2 = vmaxq_f32(voutCDEFn2, vmin);
      vout0123n3 = vmaxq_f32(vout0123n3, vmin);
      vout4567n3 = vmaxq_f32(vout4567n3, vmin);
      vout89ABn3 = vmaxq_f32(vout89ABn3, vmin);
      voutCDEFn3 = vmaxq_f32(voutCDEFn3, vmin);

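      // Store 16 outputs for each of the 4 channels, stepping the output
      // pointer by the row stride between channels.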
      vst1q_f32(output + 0, vout0123n0);
      vst1q_f32(output + 4, vout4567n0);
      vst1q_f32(output + 8, vout89ABn0);
      vst1q_f32(output + 12, voutCDEFn0);
      output = (float*restrict) ((uintptr_t) output + output_stride);
      vst1q_f32(output + 0, vout0123n1);
      vst1q_f32(output + 4, vout4567n1);
      vst1q_f32(output + 8, vout89ABn1);
      vst1q_f32(output + 12, voutCDEFn1);
      output = (float*restrict) ((uintptr_t) output + output_stride);
      vst1q_f32(output + 0, vout0123n2);
      vst1q_f32(output + 4, vout4567n2);
      vst1q_f32(output + 8, vout89ABn2);
      vst1q_f32(output + 12, voutCDEFn2);
      output = (float*restrict) ((uintptr_t) output + output_stride);
      vst1q_f32(output + 0, vout0123n3);
      vst1q_f32(output + 4, vout4567n3);
      vst1q_f32(output + 8, vout89ABn3);
      vst1q_f32(output + 12, voutCDEFn3);
      output = (float*restrict) ((uintptr_t) output + output_stride);
      n -= 4;
    }

    // clean up loop, fall back to nr=1
    if XNN_UNLIKELY(n != 0) {
      do {
        uint32_t nnz = *nnzmap++;
        float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc4567 = vacc0123;
        float32x4_t vacc89AB = vacc0123;
        float32x4_t vaccCDEF = vacc0123;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x4_t vi0123 = vld1q_f32(input);
            const float32x4_t vi4567 = vld1q_f32(input + 4);
            const float32x4_t vi89AB = vld1q_f32(input + 8);
            const float32x4_t viCDEF = vld1q_f32(input + 12);
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
            __builtin_prefetch(input + 16);
            const float32x4_t vw = vld1q_dup_f32(w); w += 1;
            __builtin_prefetch(w + 32);
            vacc0123 = vfmaq_f32(vacc0123, vi0123, vw);
            vacc4567 = vfmaq_f32(vacc4567, vi4567, vw);
            vacc89AB = vfmaq_f32(vacc89AB, vi89AB, vw);
            vaccCDEF = vfmaq_f32(vaccCDEF, viCDEF, vw);
          } while (--nnz != 0);
        }
        float32x4_t vout0123 = vminq_f32(vacc0123, vmax);
        float32x4_t vout4567 = vminq_f32(vacc4567, vmax);
        float32x4_t vout89AB = vminq_f32(vacc89AB, vmax);
        float32x4_t voutCDEF = vminq_f32(vaccCDEF, vmax);

        vout0123 = vmaxq_f32(vout0123, vmin);
        vout4567 = vmaxq_f32(vout4567, vmin);
        vout89AB = vmaxq_f32(vout89AB, vmin);
        voutCDEF = vmaxq_f32(voutCDEF, vmin);

        vst1q_f32(output + 0, vout0123);
        vst1q_f32(output + 4, vout4567);
        vst1q_f32(output + 8, vout89AB);
        vst1q_f32(output + 12, voutCDEF);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        n -= 1;
      } while (n != 0);
    }
    output = (float*restrict) ((uintptr_t) output - output_decrement);
    input += 16;
    mc -= 16 * sizeof(float);
  }
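  // Remainder: handle the final mc < 16 * sizeof(float) bytes with
  // successively narrower blocks of 8, 4, 2, and 1 M-dimension elements.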
  if XNN_UNLIKELY(mc != 0) {
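    // output_decrement was sized for 16-element stores; each narrower block
    // adds back the difference before rewinding the output pointer.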
    output_decrement += 8 * sizeof(float);
    if (mc & (8 * sizeof(float))) {
      const float*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t n = nc;
      while (n >= 4) {
        uint32_t nnz = *nnzmap++;
        float32x4_t vacc0123n0 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc4567n0 = vacc0123n0;
        float32x4_t vacc0123n1 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc4567n1 = vacc0123n1;
        float32x4_t vacc0123n2 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc4567n2 = vacc0123n2;
        float32x4_t vacc0123n3 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc4567n3 = vacc0123n3;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x4_t vi0123 = vld1q_f32(input);
            const float32x4_t vi4567 = vld1q_f32(input + 4);
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
            const float32x4_t vw = vld1q_f32(w); w += 4;

            vacc0123n0 = vfmaq_laneq_f32(vacc0123n0, vi0123, vw, 0);
            vacc4567n0 = vfmaq_laneq_f32(vacc4567n0, vi4567, vw, 0);
            vacc0123n1 = vfmaq_laneq_f32(vacc0123n1, vi0123, vw, 1);
            vacc4567n1 = vfmaq_laneq_f32(vacc4567n1, vi4567, vw, 1);
            vacc0123n2 = vfmaq_laneq_f32(vacc0123n2, vi0123, vw, 2);
            vacc4567n2 = vfmaq_laneq_f32(vacc4567n2, vi4567, vw, 2);
            vacc0123n3 = vfmaq_laneq_f32(vacc0123n3, vi0123, vw, 3);
            vacc4567n3 = vfmaq_laneq_f32(vacc4567n3, vi4567, vw, 3);
          } while (--nnz != 0);
        }
        float32x4_t vout0123n0 = vminq_f32(vacc0123n0, vmax);
        float32x4_t vout4567n0 = vminq_f32(vacc4567n0, vmax);
        float32x4_t vout0123n1 = vminq_f32(vacc0123n1, vmax);
        float32x4_t vout4567n1 = vminq_f32(vacc4567n1, vmax);
        float32x4_t vout0123n2 = vminq_f32(vacc0123n2, vmax);
        float32x4_t vout4567n2 = vminq_f32(vacc4567n2, vmax);
        float32x4_t vout0123n3 = vminq_f32(vacc0123n3, vmax);
        float32x4_t vout4567n3 = vminq_f32(vacc4567n3, vmax);

        vout0123n0 = vmaxq_f32(vout0123n0, vmin);
        vout4567n0 = vmaxq_f32(vout4567n0, vmin);
        vout0123n1 = vmaxq_f32(vout0123n1, vmin);
        vout4567n1 = vmaxq_f32(vout4567n1, vmin);
        vout0123n2 = vmaxq_f32(vout0123n2, vmin);
        vout4567n2 = vmaxq_f32(vout4567n2, vmin);
        vout0123n3 = vmaxq_f32(vout0123n3, vmin);
        vout4567n3 = vmaxq_f32(vout4567n3, vmin);

        vst1q_f32(output + 0, vout0123n0);
        vst1q_f32(output + 4, vout4567n0);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        vst1q_f32(output + 0, vout0123n1);
        vst1q_f32(output + 4, vout4567n1);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        vst1q_f32(output + 0, vout0123n2);
        vst1q_f32(output + 4, vout4567n2);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        vst1q_f32(output + 0, vout0123n3);
        vst1q_f32(output + 4, vout4567n3);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        n -= 4;
      }

      // clean up loop, fall back to nr=1
      if XNN_UNLIKELY(n != 0) {
        do {
          uint32_t nnz = *nnzmap++;
          float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1;
          float32x4_t vacc4567 = vacc0123;
          if XNN_LIKELY(nnz != 0) {
            do {
              const intptr_t diff = *dmap++;
              const float32x4_t vi0123 = vld1q_f32(input);
              const float32x4_t vi4567 = vld1q_f32(input + 4);
              input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
              const float32x4_t vw = vld1q_dup_f32(w); w += 1;
              vacc0123 = vfmaq_f32(vacc0123, vi0123, vw);
              vacc4567 = vfmaq_f32(vacc4567, vi4567, vw);
            } while (--nnz != 0);
          }
          float32x4_t vout0123 = vminq_f32(vacc0123, vmax);
          float32x4_t vout4567 = vminq_f32(vacc4567, vmax);

          vout0123 = vmaxq_f32(vout0123, vmin);
          vout4567 = vmaxq_f32(vout4567, vmin);

          vst1q_f32(output + 0, vout0123);
          vst1q_f32(output + 4, vout4567);
          output = (float*restrict) ((uintptr_t) output + output_stride);
          n -= 1;
        } while (n != 0);
      }
      output = (float*restrict) ((uintptr_t) output - output_decrement);
      input += 8;
    }
    output_decrement += 4 * sizeof(float);
    if (mc & (4 * sizeof(float))) {
      const float*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t n = nc;
      while (n >= 4) {
        uint32_t nnz = *nnzmap++;
        float32x4_t vacc0123n0 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc0123n1 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc0123n2 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc0123n3 = vld1q_dup_f32(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x4_t vi0123 = vld1q_f32(input);
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
            const float32x4_t vw = vld1q_f32(w); w += 4;

            vacc0123n0 = vfmaq_laneq_f32(vacc0123n0, vi0123, vw, 0);
            vacc0123n1 = vfmaq_laneq_f32(vacc0123n1, vi0123, vw, 1);
            vacc0123n2 = vfmaq_laneq_f32(vacc0123n2, vi0123, vw, 2);
            vacc0123n3 = vfmaq_laneq_f32(vacc0123n3, vi0123, vw, 3);
          } while (--nnz != 0);
        }
        float32x4_t vout0123n0 = vminq_f32(vacc0123n0, vmax);
        float32x4_t vout0123n1 = vminq_f32(vacc0123n1, vmax);
        float32x4_t vout0123n2 = vminq_f32(vacc0123n2, vmax);
        float32x4_t vout0123n3 = vminq_f32(vacc0123n3, vmax);

        vout0123n0 = vmaxq_f32(vout0123n0, vmin);
        vout0123n1 = vmaxq_f32(vout0123n1, vmin);
        vout0123n2 = vmaxq_f32(vout0123n2, vmin);
        vout0123n3 = vmaxq_f32(vout0123n3, vmin);

        vst1q_f32(output + 0, vout0123n0);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        vst1q_f32(output + 0, vout0123n1);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        vst1q_f32(output + 0, vout0123n2);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        vst1q_f32(output + 0, vout0123n3);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        n -= 4;
      }

      // clean up loop, fall back to nr=1
      if XNN_UNLIKELY(n != 0) {
        do {
          uint32_t nnz = *nnzmap++;
          float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1;
          if XNN_LIKELY(nnz != 0) {
            do {
              const intptr_t diff = *dmap++;
              const float32x4_t vi0123 = vld1q_f32(input);
              input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
              const float32x4_t vw = vld1q_dup_f32(w); w += 1;
              vacc0123 = vfmaq_f32(vacc0123, vi0123, vw);
            } while (--nnz != 0);
          }
          float32x4_t vout0123 = vminq_f32(vacc0123, vmax);

          vout0123 = vmaxq_f32(vout0123, vmin);

          vst1q_f32(output + 0, vout0123);
          output = (float*restrict) ((uintptr_t) output + output_stride);
          n -= 1;
        } while (n != 0);
      }
      output = (float*restrict) ((uintptr_t) output - output_decrement);
      input += 4;
    }
    output_decrement += 2 * sizeof(float);
    if (mc & (2 * sizeof(float))) {
      const float*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t n = nc;
      while (n >= 4) {
        uint32_t nnz = *nnzmap++;
        float32x2_t vacc01n0 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc01n1 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc01n2 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc01n3 = vld1_dup_f32(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x2_t vi01 = vld1_f32(input);
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
            const float32x4_t vw = vld1q_f32(w); w += 4;

            vacc01n0 = vfma_laneq_f32(vacc01n0, vi01, vw, 0);
            vacc01n1 = vfma_laneq_f32(vacc01n1, vi01, vw, 1);
            vacc01n2 = vfma_laneq_f32(vacc01n2, vi01, vw, 2);
            vacc01n3 = vfma_laneq_f32(vacc01n3, vi01, vw, 3);
          } while (--nnz != 0);
        }
        float32x2_t vout01n0 = vmin_f32(vacc01n0, vget_low_f32(vmax));
        float32x2_t vout01n1 = vmin_f32(vacc01n1, vget_low_f32(vmax));
        float32x2_t vout01n2 = vmin_f32(vacc01n2, vget_low_f32(vmax));
        float32x2_t vout01n3 = vmin_f32(vacc01n3, vget_low_f32(vmax));

        vout01n0 = vmax_f32(vout01n0, vget_low_f32(vmin));
        vout01n1 = vmax_f32(vout01n1, vget_low_f32(vmin));
        vout01n2 = vmax_f32(vout01n2, vget_low_f32(vmin));
        vout01n3 = vmax_f32(vout01n3, vget_low_f32(vmin));

        vst1_f32(output + 0, vout01n0);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        vst1_f32(output + 0, vout01n1);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        vst1_f32(output + 0, vout01n2);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        vst1_f32(output + 0, vout01n3);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        n -= 4;
      }

      // clean up loop, fall back to nr=1
      if XNN_UNLIKELY(n != 0) {
        do {
          uint32_t nnz = *nnzmap++;
          float32x2_t vacc01 = vld1_dup_f32(w); w += 1;
          if XNN_LIKELY(nnz != 0) {
            do {
              const intptr_t diff = *dmap++;
              const float32x2_t vi01 = vld1_f32(input);
              input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
              const float32x2_t vw = vld1_dup_f32(w); w += 1;
              vacc01 = vfma_f32(vacc01, vi01, vw);
            } while (--nnz != 0);
          }
          float32x2_t vout01 = vmin_f32(vacc01, vget_low_f32(vmax));
          vout01 = vmax_f32(vout01, vget_low_f32(vmin));

          vst1_f32(output, vout01);
          output = (float*restrict) ((uintptr_t) output + output_stride);
          n -= 1;
        } while (n != 0);
      }
      output = (float*restrict) ((uintptr_t) output - output_decrement);
      input += 2;
    }
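    // Scalar tail: a single M-dimension element, stored one lane at a time.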
    output_decrement += 1 * sizeof(float);
    if (mc & (1 * sizeof(float))) {
      const float*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t n = nc;
      while (n >= 4) {
        uint32_t nnz = *nnzmap++;
        float32x2_t vacc0n0 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc0n1 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc0n2 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc0n3 = vld1_dup_f32(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x2_t vi0 = vld1_dup_f32(input);
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
            const float32x4_t vw = vld1q_f32(w); w += 4;

            vacc0n0 = vfma_laneq_f32(vacc0n0, vi0, vw, 0);
            vacc0n1 = vfma_laneq_f32(vacc0n1, vi0, vw, 1);
            vacc0n2 = vfma_laneq_f32(vacc0n2, vi0, vw, 2);
            vacc0n3 = vfma_laneq_f32(vacc0n3, vi0, vw, 3);
          } while (--nnz != 0);
        }
        float32x2_t vout0n0 = vmin_f32(vacc0n0, vget_low_f32(vmax));
        float32x2_t vout0n1 = vmin_f32(vacc0n1, vget_low_f32(vmax));
        float32x2_t vout0n2 = vmin_f32(vacc0n2, vget_low_f32(vmax));
        float32x2_t vout0n3 = vmin_f32(vacc0n3, vget_low_f32(vmax));

        vout0n0 = vmax_f32(vout0n0, vget_low_f32(vmin));
        vout0n1 = vmax_f32(vout0n1, vget_low_f32(vmin));
        vout0n2 = vmax_f32(vout0n2, vget_low_f32(vmin));
        vout0n3 = vmax_f32(vout0n3, vget_low_f32(vmin));

        vst1_lane_f32(output + 0, vout0n0, 0);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        vst1_lane_f32(output + 0, vout0n1, 0);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        vst1_lane_f32(output + 0, vout0n2, 0);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        vst1_lane_f32(output + 0, vout0n3, 0);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        n -= 4;
      }

      // clean up loop, fall back to nr=1
      if XNN_UNLIKELY(n != 0) {
        do {
          uint32_t nnz = *nnzmap++;
          float32x2_t vacc0 = vld1_dup_f32(w); w += 1;
          if XNN_LIKELY(nnz != 0) {
            do {
              const intptr_t diff = *dmap++;
              const float32x2_t vi0 = vld1_dup_f32(input);
              input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
              const float32x2_t vw = vld1_dup_f32(w); w += 1;
              vacc0 = vfma_f32(vacc0, vi0, vw);
            } while (--nnz != 0);
          }
          float32x2_t vout0 = vmin_f32(vacc0, vget_low_f32(vmax));
          vout0 = vmax_f32(vout0, vget_low_f32(vmin));

          vst1_lane_f32(output, vout0, 0);
          output = (float*restrict) ((uintptr_t) output + output_stride);
          n -= 1;
        } while (n != 0);
      }
      output = (float*restrict) ((uintptr_t) output - output_decrement);
      input += 1;
    }
  }
}