// Auto-generated file. Do not edit!
//   Template: src/f32-dwconv2d-chw/3x3s2p1-neon.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
9 
10 #include <assert.h>
11 
12 #include <arm_neon.h>
13 
14 #include <xnnpack/dwconv.h>
15 #include <xnnpack/math.h>
16 
17 
xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_1x4(size_t input_height,size_t input_width,const float * input,const float * weights,const float * zero,float * output,uint32_t padding_top,const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS (1)])18 void xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_1x4(
19     size_t input_height,
20     size_t input_width,
21     const float* input,
22     const float* weights,
23     const float* zero,
24     float* output,
25     uint32_t padding_top,
26     const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
27 {
28   assert(input_height != 0);
29   assert(input_width != 0);
30   assert(input_width % sizeof(float) == 0);
31   assert(padding_top >= 0);
32   assert(padding_top <= 1);
33 
34   const uint32x4_t vmask_even = vld1q_u32(params->neon.mask_even);
35   const uint32x4_t vmask_odd  = vld1q_u32(params->neon.mask_odd);
36   const float32x4_t vmax = vld1q_dup_f32(&params->neon.max);
37   const float32x4_t vmin = vld1q_dup_f32(&params->neon.min);
38 
39   const float32x4_t vw0123 = vld1q_f32(weights);
40   const float32x4_t vw4567 = vld1q_f32(weights + 4);
41   const float32x2_t vw89 = vld1_f32(weights + 8);
42 
43   const size_t input_decrement = round_down_po2(input_width, 4 /* SIMD output width */ * 2 /* subsampling */ * sizeof(float));
44 
45   const float* i0 = (const float*) ((uintptr_t) input - ((-padding_top) & input_width));
46   const float* i1 = (const float*) ((uintptr_t) i0 + input_width);
47   if XNN_UNPREDICTABLE(padding_top != 0) {
48     i0 = zero;
49   }
50   const float* i2 = (const float*) ((uintptr_t) i1 + input_width);
51 
52   float* o0 = output;
53 
54   size_t padded_input_height = input_height + padding_top + 1 /* padding bottom */;
55   size_t output_height = (padded_input_height - 3 /* kernel size */ + 2 /* subsampling */) / 2;
56   do {
57     if XNN_UNPREDICTABLE(padded_input_height < 4) {
58       i2 = zero;
59     }
60 
61     float32x4_t vi0x1357 = vmovq_n_f32(0.0f);
62     float32x4_t vi1x1357 = vmovq_n_f32(0.0f);
63     float32x4_t vi2x1357 = vmovq_n_f32(0.0f);
64 
65     size_t w = input_width;
66     for (; w >= 8 * sizeof(float); w -= 8 * sizeof(float)) {
67       float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
68 
69       const float32x4x2_t vi0x8ACE9BDF = vld2q_f32(i0); i0 += 8;
70       const float32x4x2_t vi1x8ACE9BDF = vld2q_f32(i1); i1 += 8;
71       const float32x4x2_t vi2x8ACE9BDF = vld2q_f32(i2); i2 += 8;
72 
73       vo0p0 = vfmaq_lane_f32(vo0p0, vi0x8ACE9BDF.val[0], vget_high_f32(vw0123), 0);
74 
75       vo0p0 = vfmaq_lane_f32(vo0p0, vi1x8ACE9BDF.val[0], vget_low_f32(vw4567), 1);
76 
77       vo0p0 = vfmaq_lane_f32(vo0p0, vi2x8ACE9BDF.val[0], vw89, 0);
78 
79       const float32x4_t vi0x7BDF = vextq_f32(vi0x1357, vi0x8ACE9BDF.val[1], 3);
80       vi0x1357 = vi0x8ACE9BDF.val[1];
81       const float32x4_t vi1x7BDF = vextq_f32(vi1x1357, vi1x8ACE9BDF.val[1], 3);
82       vi1x1357 = vi1x8ACE9BDF.val[1];
83       const float32x4_t vi2x7BDF = vextq_f32(vi2x1357, vi2x8ACE9BDF.val[1], 3);
84       vi2x1357 = vi2x8ACE9BDF.val[1];
85 
86       vo0p0 = vfmaq_lane_f32(vo0p0, vi0x7BDF, vget_low_f32(vw0123), 1);
87 
88       vo0p0 = vfmaq_lane_f32(vo0p0, vi1x7BDF, vget_low_f32(vw4567), 0);
89 
90       vo0p0 = vfmaq_lane_f32(vo0p0, vi2x7BDF, vget_high_f32(vw4567), 1);
91 
92       vo0p0 = vfmaq_lane_f32(vo0p0, vi0x8ACE9BDF.val[1], vget_high_f32(vw0123), 1);
93 
94       vo0p0 = vfmaq_lane_f32(vo0p0, vi1x8ACE9BDF.val[1], vget_high_f32(vw4567), 0);
95 
96       vo0p0 = vfmaq_lane_f32(vo0p0, vi2x8ACE9BDF.val[1], vw89, 1);
97 
98 
99       float32x4_t vo0 = vmaxq_f32(vo0p0, vmin);
100 
101       vo0 = vminq_f32(vo0, vmax);
102 
103       vst1q_f32(o0, vo0); o0 += 4;
104     }
105     // Last block has 0-7 pixels to process.
106     assert(w < 8 * sizeof(float));
107     if XNN_LIKELY(w != 0) {
108       float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
109 
110       const float32x4x2_t vi0x8ACE9BDF = vld2q_f32(i0);
111       const float32x4x2_t vi1x8ACE9BDF = vld2q_f32(i1);
112       const float32x4x2_t vi2x8ACE9BDF = vld2q_f32(i2);
113 
114       const float32x4_t vi0x8ACE = vreinterpretq_f32_u32(vandq_u32(vmask_even, vreinterpretq_u32_f32(vi0x8ACE9BDF.val[0])));
115       const float32x4_t vi0x9BDF = vreinterpretq_f32_u32(vandq_u32(vmask_odd,  vreinterpretq_u32_f32(vi0x8ACE9BDF.val[1])));
116       const float32x4_t vi1x8ACE = vreinterpretq_f32_u32(vandq_u32(vmask_even, vreinterpretq_u32_f32(vi1x8ACE9BDF.val[0])));
117       const float32x4_t vi1x9BDF = vreinterpretq_f32_u32(vandq_u32(vmask_odd,  vreinterpretq_u32_f32(vi1x8ACE9BDF.val[1])));
118       const float32x4_t vi2x8ACE = vreinterpretq_f32_u32(vandq_u32(vmask_even, vreinterpretq_u32_f32(vi2x8ACE9BDF.val[0])));
119       const float32x4_t vi2x9BDF = vreinterpretq_f32_u32(vandq_u32(vmask_odd,  vreinterpretq_u32_f32(vi2x8ACE9BDF.val[1])));
120 
121       vo0p0 = vfmaq_lane_f32(vo0p0, vi0x8ACE, vget_high_f32(vw0123), 0);
122 
123       vo0p0 = vfmaq_lane_f32(vo0p0, vi1x8ACE, vget_low_f32(vw4567), 1);
124 
125       vo0p0 = vfmaq_lane_f32(vo0p0, vi2x8ACE, vw89, 0);
126 
127       const float32x4_t vi0x7BDF = vextq_f32(vi0x1357, vi0x9BDF, 3);
128       const float32x4_t vi1x7BDF = vextq_f32(vi1x1357, vi1x9BDF, 3);
129       const float32x4_t vi2x7BDF = vextq_f32(vi2x1357, vi2x9BDF, 3);
130 
131       vo0p0 = vfmaq_lane_f32(vo0p0, vi0x7BDF, vget_low_f32(vw0123), 1);
132 
133       vo0p0 = vfmaq_lane_f32(vo0p0, vi1x7BDF, vget_low_f32(vw4567), 0);
134 
135       vo0p0 = vfmaq_lane_f32(vo0p0, vi2x7BDF, vget_high_f32(vw4567), 1);
136 
137       vo0p0 = vfmaq_lane_f32(vo0p0, vi0x9BDF, vget_high_f32(vw0123), 1);
138 
139       vo0p0 = vfmaq_lane_f32(vo0p0, vi1x9BDF, vget_high_f32(vw4567), 0);
140 
141       vo0p0 = vfmaq_lane_f32(vo0p0, vi2x9BDF, vw89, 1);
142 
143 
144       float32x4_t vo0 = vmaxq_f32(vo0p0, vmin);
145 
146       vo0 = vminq_f32(vo0, vmax);
147 
148       w += 1 * sizeof(float);
149       if (w & (8 * sizeof(float))) {
150         vst1q_f32(o0, vo0); o0 += 4;
151       } else {
152         float32x2_t vo0_lo = vget_low_f32(vo0);
153         if (w & (4 * sizeof(float))) {
154           vst1_f32(o0, vo0_lo); o0 += 2;
155 
156           vo0_lo = vget_high_f32(vo0);
157         }
158         if (w & (2 * sizeof(float))) {
159           vst1_lane_f32(o0, vo0_lo, 0); o0 += 1;
160         }
161       }
162     }
163 
164     i0 = (const float*) ((uintptr_t) i2 - input_decrement);
165     i1 = (const float*) ((uintptr_t) i0 + input_width);
166     i2 = (const float*) ((uintptr_t) i1 + input_width);
167 
168 
169     output_height -= 1;
170     padded_input_height -= 2;
171   } while (output_height != 0);
172 }
173