// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5 
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#include <xnnpack/conv.h>
#include <xnnpack/math.h>
10 
11 
xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1(size_t input_height,size_t input_width,size_t output_y_start,size_t output_y_end,const float * input,const float * zero,const float * weights,float * output,size_t input_padding_top,size_t output_channels,size_t output_height_stride,size_t output_channel_stride,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])12 void xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1(
13     size_t input_height,
14     size_t input_width,
15     size_t output_y_start,
16     size_t output_y_end,
17     const float* input,
18     const float* zero,
19     const float* weights,
20     float* output,
21     size_t input_padding_top,
22     size_t output_channels,
23     size_t output_height_stride,
24     size_t output_channel_stride,
25     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
26 {
27   assert(input_width != 0);
28   assert(output_y_end > output_y_start);
29   assert(input_padding_top <= 1);
30   assert(output_channels != 0);
31 
32   const size_t input_height_stride = input_width * 3 /* channels */ * sizeof(float);
33   const size_t input_width_decrement = round_down_po2(input_width, 2) * 3 /* channels */ * sizeof(float);
34   const size_t output_width = (input_width + 1) / 2;
35   const size_t output_channel_increment = output_channel_stride * 4 - output_width * sizeof(float);
36 
37   // Adjustment for padding processed below
38   const float* i0 = (const float*) ((uintptr_t) input + input_height_stride * (output_y_start * 2 - input_padding_top));
39   const float* i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
40   const float* i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
41   float* output0 = (float*) ((uintptr_t) output + output_height_stride * output_y_start);
42 
43   if XNN_UNPREDICTABLE(output_y_start < input_padding_top) {
44     i0 = zero;
45   }
46 
47   const float voutput_max = params->scalar.max;
48   const float voutput_min = params->scalar.min;
49 
50   for (size_t output_y = output_y_start; output_y < output_y_end; output_y += 1) {
51     const size_t input_y2 = output_y * 2 + 2 - input_padding_top;
52     if XNN_UNPREDICTABLE(input_y2 >= input_height) {
53       i2 = zero;
54     }
55 
56     const float* w = weights;
57     size_t c = output_channels;
58     float* o0c0 = output0;
59     float* o0c1 = (float*) ((uintptr_t) o0c0 + output_channel_stride);
60     float* o0c2 = (float*) ((uintptr_t) o0c1 + output_channel_stride);
61     float* o0c3 = (float*) ((uintptr_t) o0c2 + output_channel_stride);
62     do {
63       if XNN_UNPREDICTABLE(c < 2) {
64         o0c1 = o0c0;
65       }
66       if XNN_UNPREDICTABLE(c <= 2) {
67         o0c2 = o0c1;
68       }
69       if XNN_UNPREDICTABLE(c < 4) {
70         o0c3 = o0c2;
71       }
72 
73       // Left edge padding
74       float vi00c0 = 0.0f;
75       float vi00c1 = 0.0f;
76       float vi00c2 = 0.0f;
77       float vi10c0 = 0.0f;
78       float vi10c1 = 0.0f;
79       float vi10c2 = 0.0f;
80       float vi20c0 = 0.0f;
81       float vi20c1 = 0.0f;
82       float vi20c2 = 0.0f;
83 
84       size_t iw = input_width;
85       for (; iw >= 2; iw -= 2) {
86         float voc0 = w[0];
87         float voc1 = w[1];
88         float voc2 = w[2];
89         float voc3 = w[3];
90 
91         const float vk00c0x0 = w[4];
92         const float vk00c0x1 = w[5];
93         const float vk00c0x2 = w[6];
94         const float vk00c0x3 = w[7];
95 
96         voc0 += vk00c0x0 * vi00c0;
97         voc1 += vk00c0x1 * vi00c0;
98         voc2 += vk00c0x2 * vi00c0;
99         voc3 += vk00c0x3 * vi00c0;
100 
101         const float vk10c0x0 = w[8];
102         const float vk10c0x1 = w[9];
103         const float vk10c0x2 = w[10];
104         const float vk10c0x3 = w[11];
105 
106         voc0 += vk10c0x0 * vi10c0;
107         voc1 += vk10c0x1 * vi10c0;
108         voc2 += vk10c0x2 * vi10c0;
109         voc3 += vk10c0x3 * vi10c0;
110 
111         const float vk20c0x0 = w[12];
112         const float vk20c0x1 = w[13];
113         const float vk20c0x2 = w[14];
114         const float vk20c0x3 = w[15];
115 
116         voc0 += vk20c0x0 * vi20c0;
117         voc1 += vk20c0x1 * vi20c0;
118         voc2 += vk20c0x2 * vi20c0;
119         voc3 += vk20c0x3 * vi20c0;
120 
121         const float vk00c1x0 = w[16];
122         const float vk00c1x1 = w[17];
123         const float vk00c1x2 = w[18];
124         const float vk00c1x3 = w[19];
125 
126         voc0 += vk00c1x0 * vi00c1;
127         voc1 += vk00c1x1 * vi00c1;
128         voc2 += vk00c1x2 * vi00c1;
129         voc3 += vk00c1x3 * vi00c1;
130 
131         const float vk10c1x0 = w[20];
132         const float vk10c1x1 = w[21];
133         const float vk10c1x2 = w[22];
134         const float vk10c1x3 = w[23];
135 
136         voc0 += vk10c1x0 * vi10c1;
137         voc1 += vk10c1x1 * vi10c1;
138         voc2 += vk10c1x2 * vi10c1;
139         voc3 += vk10c1x3 * vi10c1;
140 
141         const float vk20c1x0 = w[24];
142         const float vk20c1x1 = w[25];
143         const float vk20c1x2 = w[26];
144         const float vk20c1x3 = w[27];
145 
146         voc0 += vk20c1x0 * vi20c1;
147         voc1 += vk20c1x1 * vi20c1;
148         voc2 += vk20c1x2 * vi20c1;
149         voc3 += vk20c1x3 * vi20c1;
150 
151         const float vk00c2x0 = w[28];
152         const float vk00c2x1 = w[29];
153         const float vk00c2x2 = w[30];
154         const float vk00c2x3 = w[31];
155 
156         voc0 += vk00c2x0 * vi00c2;
157         voc1 += vk00c2x1 * vi00c2;
158         voc2 += vk00c2x2 * vi00c2;
159         voc3 += vk00c2x3 * vi00c2;
160 
161         const float vk10c2x0 = w[32];
162         const float vk10c2x1 = w[33];
163         const float vk10c2x2 = w[34];
164         const float vk10c2x3 = w[35];
165 
166         voc0 += vk10c2x0 * vi10c2;
167         voc1 += vk10c2x1 * vi10c2;
168         voc2 += vk10c2x2 * vi10c2;
169         voc3 += vk10c2x3 * vi10c2;
170 
171         const float vk20c2x0 = w[36];
172         const float vk20c2x1 = w[37];
173         const float vk20c2x2 = w[38];
174         const float vk20c2x3 = w[39];
175 
176         voc0 += vk20c2x0 * vi20c2;
177         voc1 += vk20c2x1 * vi20c2;
178         voc2 += vk20c2x2 * vi20c2;
179         voc3 += vk20c2x3 * vi20c2;
180 
181         const float vk01c0x0 = w[40];
182         const float vk01c0x1 = w[41];
183         const float vk01c0x2 = w[42];
184         const float vk01c0x3 = w[43];
185 
186         const float vi01c0 = i0[0];
187 
188         voc0 += vk01c0x0 * vi01c0;
189         voc1 += vk01c0x1 * vi01c0;
190         voc2 += vk01c0x2 * vi01c0;
191         voc3 += vk01c0x3 * vi01c0;
192 
193         const float vk11c0x0 = w[44];
194         const float vk11c0x1 = w[45];
195         const float vk11c0x2 = w[46];
196         const float vk11c0x3 = w[47];
197 
198         const float vi11c0 = i1[0];
199 
200         voc0 += vk11c0x0 * vi11c0;
201         voc1 += vk11c0x1 * vi11c0;
202         voc2 += vk11c0x2 * vi11c0;
203         voc3 += vk11c0x3 * vi11c0;
204 
205         const float vk21c0x0 = w[48];
206         const float vk21c0x1 = w[49];
207         const float vk21c0x2 = w[50];
208         const float vk21c0x3 = w[51];
209 
210         const float vi21c0 = i2[0];
211 
212         voc0 += vk21c0x0 * vi21c0;
213         voc1 += vk21c0x1 * vi21c0;
214         voc2 += vk21c0x2 * vi21c0;
215         voc3 += vk21c0x3 * vi21c0;
216 
217         const float vk01c1x0 = w[52];
218         const float vk01c1x1 = w[53];
219         const float vk01c1x2 = w[54];
220         const float vk01c1x3 = w[55];
221 
222         const float vi01c1 = i0[1];
223 
224         voc0 += vk01c1x0 * vi01c1;
225         voc1 += vk01c1x1 * vi01c1;
226         voc2 += vk01c1x2 * vi01c1;
227         voc3 += vk01c1x3 * vi01c1;
228 
229         const float vk11c1x0 = w[56];
230         const float vk11c1x1 = w[57];
231         const float vk11c1x2 = w[58];
232         const float vk11c1x3 = w[59];
233 
234         const float vi11c1 = i1[1];
235 
236         voc0 += vk11c1x0 * vi11c1;
237         voc1 += vk11c1x1 * vi11c1;
238         voc2 += vk11c1x2 * vi11c1;
239         voc3 += vk11c1x3 * vi11c1;
240 
241         const float vk21c1x0 = w[60];
242         const float vk21c1x1 = w[61];
243         const float vk21c1x2 = w[62];
244         const float vk21c1x3 = w[63];
245 
246         const float vi21c1 = i2[1];
247 
248         voc0 += vk21c1x0 * vi21c1;
249         voc1 += vk21c1x1 * vi21c1;
250         voc2 += vk21c1x2 * vi21c1;
251         voc3 += vk21c1x3 * vi21c1;
252 
253         const float vk01c2x0 = w[64];
254         const float vk01c2x1 = w[65];
255         const float vk01c2x2 = w[66];
256         const float vk01c2x3 = w[67];
257 
258         const float vi01c2 = i0[2];
259 
260         voc0 += vk01c2x0 * vi01c2;
261         voc1 += vk01c2x1 * vi01c2;
262         voc2 += vk01c2x2 * vi01c2;
263         voc3 += vk01c2x3 * vi01c2;
264 
265         const float vk11c2x0 = w[68];
266         const float vk11c2x1 = w[69];
267         const float vk11c2x2 = w[70];
268         const float vk11c2x3 = w[71];
269 
270         const float vi11c2 = i1[2];
271 
272         voc0 += vk11c2x0 * vi11c2;
273         voc1 += vk11c2x1 * vi11c2;
274         voc2 += vk11c2x2 * vi11c2;
275         voc3 += vk11c2x3 * vi11c2;
276 
277         const float vk21c2x0 = w[72];
278         const float vk21c2x1 = w[73];
279         const float vk21c2x2 = w[74];
280         const float vk21c2x3 = w[75];
281 
282         const float vi21c2 = i2[2];
283 
284         voc0 += vk21c2x0 * vi21c2;
285         voc1 += vk21c2x1 * vi21c2;
286         voc2 += vk21c2x2 * vi21c2;
287         voc3 += vk21c2x3 * vi21c2;
288 
289         const float vk02c0x0 = w[76];
290         const float vk02c0x1 = w[77];
291         const float vk02c0x2 = w[78];
292         const float vk02c0x3 = w[79];
293 
294         const float vi02c0 = i0[3];
295 
296         voc0 += vk02c0x0 * vi02c0;
297         voc1 += vk02c0x1 * vi02c0;
298         voc2 += vk02c0x2 * vi02c0;
299         voc3 += vk02c0x3 * vi02c0;
300 
301         const float vk12c0x0 = w[80];
302         const float vk12c0x1 = w[81];
303         const float vk12c0x2 = w[82];
304         const float vk12c0x3 = w[83];
305 
306         const float vi12c0 = i1[3];
307 
308         voc0 += vk12c0x0 * vi12c0;
309         voc1 += vk12c0x1 * vi12c0;
310         voc2 += vk12c0x2 * vi12c0;
311         voc3 += vk12c0x3 * vi12c0;
312 
313         const float vk22c0x0 = w[84];
314         const float vk22c0x1 = w[85];
315         const float vk22c0x2 = w[86];
316         const float vk22c0x3 = w[87];
317 
318         const float vi22c0 = i2[3];
319 
320         voc0 += vk22c0x0 * vi22c0;
321         voc1 += vk22c0x1 * vi22c0;
322         voc2 += vk22c0x2 * vi22c0;
323         voc3 += vk22c0x3 * vi22c0;
324 
325         vi00c0 = vi02c0;
326         vi10c0 = vi12c0;
327         vi20c0 = vi22c0;
328 
329         const float vk02c1x0 = w[88];
330         const float vk02c1x1 = w[89];
331         const float vk02c1x2 = w[90];
332         const float vk02c1x3 = w[91];
333 
334         const float vi02c1 = i0[4];
335 
336         voc0 += vk02c1x0 * vi02c1;
337         voc1 += vk02c1x1 * vi02c1;
338         voc2 += vk02c1x2 * vi02c1;
339         voc3 += vk02c1x3 * vi02c1;
340 
341         const float vk12c1x0 = w[92];
342         const float vk12c1x1 = w[93];
343         const float vk12c1x2 = w[94];
344         const float vk12c1x3 = w[95];
345 
346         const float vi12c1 = i1[4];
347 
348         voc0 += vk12c1x0 * vi12c1;
349         voc1 += vk12c1x1 * vi12c1;
350         voc2 += vk12c1x2 * vi12c1;
351         voc3 += vk12c1x3 * vi12c1;
352 
353         const float vk22c1x0 = w[96];
354         const float vk22c1x1 = w[97];
355         const float vk22c1x2 = w[98];
356         const float vk22c1x3 = w[99];
357 
358         const float vi22c1 = i2[4];
359 
360         voc0 += vk22c1x0 * vi22c1;
361         voc1 += vk22c1x1 * vi22c1;
362         voc2 += vk22c1x2 * vi22c1;
363         voc3 += vk22c1x3 * vi22c1;
364 
365         vi00c1 = vi02c1;
366         vi10c1 = vi12c1;
367         vi20c1 = vi22c1;
368 
369         const float vk02c2x0 = w[100];
370         const float vk02c2x1 = w[101];
371         const float vk02c2x2 = w[102];
372         const float vk02c2x3 = w[103];
373 
374         const float vi02c2 = i0[5];
375 
376         voc0 += vk02c2x0 * vi02c2;
377         voc1 += vk02c2x1 * vi02c2;
378         voc2 += vk02c2x2 * vi02c2;
379         voc3 += vk02c2x3 * vi02c2;
380 
381         const float vk12c2x0 = w[104];
382         const float vk12c2x1 = w[105];
383         const float vk12c2x2 = w[106];
384         const float vk12c2x3 = w[107];
385 
386         const float vi12c2 = i1[5];
387 
388         voc0 += vk12c2x0 * vi12c2;
389         voc1 += vk12c2x1 * vi12c2;
390         voc2 += vk12c2x2 * vi12c2;
391         voc3 += vk12c2x3 * vi12c2;
392 
393         const float vk22c2x0 = w[108];
394         const float vk22c2x1 = w[109];
395         const float vk22c2x2 = w[110];
396         const float vk22c2x3 = w[111];
397 
398         const float vi22c2 = i2[5];
399 
400         voc0 += vk22c2x0 * vi22c2;
401         voc1 += vk22c2x1 * vi22c2;
402         voc2 += vk22c2x2 * vi22c2;
403         voc3 += vk22c2x3 * vi22c2;
404 
405         vi00c2 = vi02c2;
406         vi10c2 = vi12c2;
407         vi20c2 = vi22c2;
408 
409         voc0 = math_min_f32(voc0, voutput_max);
410         voc1 = math_min_f32(voc1, voutput_max);
411         voc2 = math_min_f32(voc2, voutput_max);
412         voc3 = math_min_f32(voc3, voutput_max);
413 
414         voc0 = math_max_f32(voc0, voutput_min);
415         voc1 = math_max_f32(voc1, voutput_min);
416         voc2 = math_max_f32(voc2, voutput_min);
417         voc3 = math_max_f32(voc3, voutput_min);
418 
419         *o0c0++ = voc0;
420         *o0c1++ = voc1;
421         *o0c2++ = voc2;
422         *o0c3++ = voc3;
423 
424         i0 += 6;
425         i1 += 6;
426         i2 += 6;
427       }
428       assert(iw < 2);
429       if XNN_UNLIKELY(iw != 0) {
430         float voc0 = w[0];
431         float voc1 = w[1];
432         float voc2 = w[2];
433         float voc3 = w[3];
434 
435         const float vk00c0x0 = w[4];
436         const float vk00c0x1 = w[5];
437         const float vk00c0x2 = w[6];
438         const float vk00c0x3 = w[7];
439 
440         voc0 += vk00c0x0 * vi00c0;
441         voc1 += vk00c0x1 * vi00c0;
442         voc2 += vk00c0x2 * vi00c0;
443         voc3 += vk00c0x3 * vi00c0;
444 
445         const float vk10c0x0 = w[8];
446         const float vk10c0x1 = w[9];
447         const float vk10c0x2 = w[10];
448         const float vk10c0x3 = w[11];
449 
450         voc0 += vk10c0x0 * vi10c0;
451         voc1 += vk10c0x1 * vi10c0;
452         voc2 += vk10c0x2 * vi10c0;
453         voc3 += vk10c0x3 * vi10c0;
454 
455         const float vk20c0x0 = w[12];
456         const float vk20c0x1 = w[13];
457         const float vk20c0x2 = w[14];
458         const float vk20c0x3 = w[15];
459 
460         voc0 += vk20c0x0 * vi20c0;
461         voc1 += vk20c0x1 * vi20c0;
462         voc2 += vk20c0x2 * vi20c0;
463         voc3 += vk20c0x3 * vi20c0;
464 
465         const float vk00c1x0 = w[16];
466         const float vk00c1x1 = w[17];
467         const float vk00c1x2 = w[18];
468         const float vk00c1x3 = w[19];
469 
470         voc0 += vk00c1x0 * vi00c1;
471         voc1 += vk00c1x1 * vi00c1;
472         voc2 += vk00c1x2 * vi00c1;
473         voc3 += vk00c1x3 * vi00c1;
474 
475         const float vk10c1x0 = w[20];
476         const float vk10c1x1 = w[21];
477         const float vk10c1x2 = w[22];
478         const float vk10c1x3 = w[23];
479 
480         voc0 += vk10c1x0 * vi10c1;
481         voc1 += vk10c1x1 * vi10c1;
482         voc2 += vk10c1x2 * vi10c1;
483         voc3 += vk10c1x3 * vi10c1;
484 
485         const float vk20c1x0 = w[24];
486         const float vk20c1x1 = w[25];
487         const float vk20c1x2 = w[26];
488         const float vk20c1x3 = w[27];
489 
490         voc0 += vk20c1x0 * vi20c1;
491         voc1 += vk20c1x1 * vi20c1;
492         voc2 += vk20c1x2 * vi20c1;
493         voc3 += vk20c1x3 * vi20c1;
494 
495         const float vk00c2x0 = w[28];
496         const float vk00c2x1 = w[29];
497         const float vk00c2x2 = w[30];
498         const float vk00c2x3 = w[31];
499 
500         voc0 += vk00c2x0 * vi00c2;
501         voc1 += vk00c2x1 * vi00c2;
502         voc2 += vk00c2x2 * vi00c2;
503         voc3 += vk00c2x3 * vi00c2;
504 
505         const float vk10c2x0 = w[32];
506         const float vk10c2x1 = w[33];
507         const float vk10c2x2 = w[34];
508         const float vk10c2x3 = w[35];
509 
510         voc0 += vk10c2x0 * vi10c2;
511         voc1 += vk10c2x1 * vi10c2;
512         voc2 += vk10c2x2 * vi10c2;
513         voc3 += vk10c2x3 * vi10c2;
514 
515         const float vk20c2x0 = w[36];
516         const float vk20c2x1 = w[37];
517         const float vk20c2x2 = w[38];
518         const float vk20c2x3 = w[39];
519 
520         voc0 += vk20c2x0 * vi20c2;
521         voc1 += vk20c2x1 * vi20c2;
522         voc2 += vk20c2x2 * vi20c2;
523         voc3 += vk20c2x3 * vi20c2;
524 
525         const float vk01c0x0 = w[40];
526         const float vk01c0x1 = w[41];
527         const float vk01c0x2 = w[42];
528         const float vk01c0x3 = w[43];
529 
530         const float vi01c0 = i0[0];
531 
532         voc0 += vk01c0x0 * vi01c0;
533         voc1 += vk01c0x1 * vi01c0;
534         voc2 += vk01c0x2 * vi01c0;
535         voc3 += vk01c0x3 * vi01c0;
536 
537         const float vk11c0x0 = w[44];
538         const float vk11c0x1 = w[45];
539         const float vk11c0x2 = w[46];
540         const float vk11c0x3 = w[47];
541 
542         const float vi11c0 = i1[0];
543 
544         voc0 += vk11c0x0 * vi11c0;
545         voc1 += vk11c0x1 * vi11c0;
546         voc2 += vk11c0x2 * vi11c0;
547         voc3 += vk11c0x3 * vi11c0;
548 
549         const float vk21c0x0 = w[48];
550         const float vk21c0x1 = w[49];
551         const float vk21c0x2 = w[50];
552         const float vk21c0x3 = w[51];
553 
554         const float vi21c0 = i2[0];
555 
556         voc0 += vk21c0x0 * vi21c0;
557         voc1 += vk21c0x1 * vi21c0;
558         voc2 += vk21c0x2 * vi21c0;
559         voc3 += vk21c0x3 * vi21c0;
560 
561         const float vk01c1x0 = w[52];
562         const float vk01c1x1 = w[53];
563         const float vk01c1x2 = w[54];
564         const float vk01c1x3 = w[55];
565 
566         const float vi01c1 = i0[1];
567 
568         voc0 += vk01c1x0 * vi01c1;
569         voc1 += vk01c1x1 * vi01c1;
570         voc2 += vk01c1x2 * vi01c1;
571         voc3 += vk01c1x3 * vi01c1;
572 
573         const float vk11c1x0 = w[56];
574         const float vk11c1x1 = w[57];
575         const float vk11c1x2 = w[58];
576         const float vk11c1x3 = w[59];
577 
578         const float vi11c1 = i1[1];
579 
580         voc0 += vk11c1x0 * vi11c1;
581         voc1 += vk11c1x1 * vi11c1;
582         voc2 += vk11c1x2 * vi11c1;
583         voc3 += vk11c1x3 * vi11c1;
584 
585         const float vk21c1x0 = w[60];
586         const float vk21c1x1 = w[61];
587         const float vk21c1x2 = w[62];
588         const float vk21c1x3 = w[63];
589 
590         const float vi21c1 = i2[1];
591 
592         voc0 += vk21c1x0 * vi21c1;
593         voc1 += vk21c1x1 * vi21c1;
594         voc2 += vk21c1x2 * vi21c1;
595         voc3 += vk21c1x3 * vi21c1;
596 
597         const float vk01c2x0 = w[64];
598         const float vk01c2x1 = w[65];
599         const float vk01c2x2 = w[66];
600         const float vk01c2x3 = w[67];
601 
602         const float vi01c2 = i0[2];
603 
604         voc0 += vk01c2x0 * vi01c2;
605         voc1 += vk01c2x1 * vi01c2;
606         voc2 += vk01c2x2 * vi01c2;
607         voc3 += vk01c2x3 * vi01c2;
608 
609         const float vk11c2x0 = w[68];
610         const float vk11c2x1 = w[69];
611         const float vk11c2x2 = w[70];
612         const float vk11c2x3 = w[71];
613 
614         const float vi11c2 = i1[2];
615 
616         voc0 += vk11c2x0 * vi11c2;
617         voc1 += vk11c2x1 * vi11c2;
618         voc2 += vk11c2x2 * vi11c2;
619         voc3 += vk11c2x3 * vi11c2;
620 
621         const float vk21c2x0 = w[72];
622         const float vk21c2x1 = w[73];
623         const float vk21c2x2 = w[74];
624         const float vk21c2x3 = w[75];
625 
626         const float vi21c2 = i2[2];
627 
628         voc0 += vk21c2x0 * vi21c2;
629         voc1 += vk21c2x1 * vi21c2;
630         voc2 += vk21c2x2 * vi21c2;
631         voc3 += vk21c2x3 * vi21c2;
632 
633         voc0 = math_min_f32(voc0, voutput_max);
634         voc1 = math_min_f32(voc1, voutput_max);
635         voc2 = math_min_f32(voc2, voutput_max);
636         voc3 = math_min_f32(voc3, voutput_max);
637 
638         voc0 = math_max_f32(voc0, voutput_min);
639         voc1 = math_max_f32(voc1, voutput_min);
640         voc2 = math_max_f32(voc2, voutput_min);
641         voc3 = math_max_f32(voc3, voutput_min);
642 
643         *o0c0++ = voc0;
644         *o0c1++ = voc1;
645         *o0c2++ = voc2;
646         *o0c3++ = voc3;
647       }
648       // Move output pointers back to the position of the first pixel in a row,
649       // and forward to the next block of output channels.
650       o0c0 = (float*) ((uintptr_t) o0c0 + output_channel_increment);
651       o0c1 = (float*) ((uintptr_t) o0c1 + output_channel_increment);
652       o0c2 = (float*) ((uintptr_t) o0c2 + output_channel_increment);
653       o0c3 = (float*) ((uintptr_t) o0c3 + output_channel_increment);
654       // Revert input pointers to the position of the first pixel in a row
655       i0 = (const float*) ((uintptr_t) i0 - input_width_decrement);
656       i1 = (const float*) ((uintptr_t) i1 - input_width_decrement);
657       i2 = (const float*) ((uintptr_t) i2 - input_width_decrement);
658       // Move to the block of weights for the next 4 output channels
659       w += 112;
660       c = doz(c, 4);
661     } while (c != 0);
662     // Move output pointers forward to the next row
663     output0 = (float*) ((uintptr_t) output0 + output_height_stride);
664     // Move input pointers forward to the next row
665     i0 = i2;
666     i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
667     i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
668   }
669 }
670