1 // Auto-generated file. Do not edit!
2 //   Template: src/f32-gemm/scalar.c.in
3 //   Generator: tools/xngen
4 //
5 // Copyright 2019 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9 
10 #include <assert.h>
11 
12 #include <xnnpack/gemm.h>
13 #include <xnnpack/math.h>
14 
15 
xnn_f32_gemminc_minmax_ukernel_4x4__scalar(size_t mr,size_t nc,size_t kc,const float * restrict a,size_t a_stride,const float * restrict w,float * restrict c,size_t cm_stride,size_t cn_stride,const float * restrict acc,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])16 void xnn_f32_gemminc_minmax_ukernel_4x4__scalar(
17     size_t mr,
18     size_t nc,
19     size_t kc,
20     const float* restrict a,
21     size_t a_stride,
22     const float* restrict w,
23     float* restrict c,
24     size_t cm_stride,
25     size_t cn_stride,
26     const float*restrict acc,
27     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
28 {
29   assert(mr != 0);
30   assert(mr <= 4);
31   assert(nc != 0);
32   assert(kc != 0);
33   assert(kc % sizeof(float) == 0);
34   assert(a != NULL);
35   assert(w != NULL);
36   assert(c != NULL);
37   assert(acc != NULL);
38 
39   const float* a0 = a;
40   float* c0 = c;
41   const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
42   float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
43   if XNN_UNPREDICTABLE(mr < 2) {
44     a1 = a0;
45     c1 = c0;
46   }
47   const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
48   float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
49   if XNN_UNPREDICTABLE(mr <= 2) {
50     a2 = a1;
51     c2 = c1;
52   }
53   const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
54   float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
55   if XNN_UNPREDICTABLE(mr != 4) {
56     a3 = a2;
57     c3 = c2;
58   }
59 
60   const float vmin = params->scalar.min;
61   const float vmax = params->scalar.max;
62   do {
63     float vacc00 = acc[0];
64     float vacc01 = acc[1];
65     float vacc02 = acc[2];
66     float vacc03 = acc[3];
67     float vacc10 = acc[4];
68     float vacc11 = acc[5];
69     float vacc12 = acc[6];
70     float vacc13 = acc[7];
71     float vacc20 = acc[8];
72     float vacc21 = acc[9];
73     float vacc22 = acc[10];
74     float vacc23 = acc[11];
75     float vacc30 = acc[12];
76     float vacc31 = acc[13];
77     float vacc32 = acc[14];
78     float vacc33 = acc[15];
79     acc += 16;
80 
81     size_t k = kc;
82     do {
83       const float va0 = *a0++;
84       const float va1 = *a1++;
85       const float va2 = *a2++;
86       const float va3 = *a3++;
87 
88       const float vb0 = w[0];
89       const float vb1 = w[1];
90       const float vb2 = w[2];
91       const float vb3 = w[3];
92       w += 4;
93 
94       vacc00 += va0 * vb0;
95       vacc01 += va0 * vb1;
96       vacc02 += va0 * vb2;
97       vacc03 += va0 * vb3;
98       vacc10 += va1 * vb0;
99       vacc11 += va1 * vb1;
100       vacc12 += va1 * vb2;
101       vacc13 += va1 * vb3;
102       vacc20 += va2 * vb0;
103       vacc21 += va2 * vb1;
104       vacc22 += va2 * vb2;
105       vacc23 += va2 * vb3;
106       vacc30 += va3 * vb0;
107       vacc31 += va3 * vb1;
108       vacc32 += va3 * vb2;
109       vacc33 += va3 * vb3;
110 
111       k -= sizeof(float);
112     } while (k != 0);
113 
114     vacc00 = math_max_f32(vacc00, vmin);
115     vacc01 = math_max_f32(vacc01, vmin);
116     vacc02 = math_max_f32(vacc02, vmin);
117     vacc03 = math_max_f32(vacc03, vmin);
118     vacc10 = math_max_f32(vacc10, vmin);
119     vacc11 = math_max_f32(vacc11, vmin);
120     vacc12 = math_max_f32(vacc12, vmin);
121     vacc13 = math_max_f32(vacc13, vmin);
122     vacc20 = math_max_f32(vacc20, vmin);
123     vacc21 = math_max_f32(vacc21, vmin);
124     vacc22 = math_max_f32(vacc22, vmin);
125     vacc23 = math_max_f32(vacc23, vmin);
126     vacc30 = math_max_f32(vacc30, vmin);
127     vacc31 = math_max_f32(vacc31, vmin);
128     vacc32 = math_max_f32(vacc32, vmin);
129     vacc33 = math_max_f32(vacc33, vmin);
130 
131     vacc00 = math_min_f32(vacc00, vmax);
132     vacc01 = math_min_f32(vacc01, vmax);
133     vacc02 = math_min_f32(vacc02, vmax);
134     vacc03 = math_min_f32(vacc03, vmax);
135     vacc10 = math_min_f32(vacc10, vmax);
136     vacc11 = math_min_f32(vacc11, vmax);
137     vacc12 = math_min_f32(vacc12, vmax);
138     vacc13 = math_min_f32(vacc13, vmax);
139     vacc20 = math_min_f32(vacc20, vmax);
140     vacc21 = math_min_f32(vacc21, vmax);
141     vacc22 = math_min_f32(vacc22, vmax);
142     vacc23 = math_min_f32(vacc23, vmax);
143     vacc30 = math_min_f32(vacc30, vmax);
144     vacc31 = math_min_f32(vacc31, vmax);
145     vacc32 = math_min_f32(vacc32, vmax);
146     vacc33 = math_min_f32(vacc33, vmax);
147 
148     if XNN_LIKELY(nc >= 4) {
149       c3[0] = vacc30;
150       c3[1] = vacc31;
151       c3[2] = vacc32;
152       c3[3] = vacc33;
153       c3 = (float*) ((uintptr_t) c3 + cn_stride);
154       c2[0] = vacc20;
155       c2[1] = vacc21;
156       c2[2] = vacc22;
157       c2[3] = vacc23;
158       c2 = (float*) ((uintptr_t) c2 + cn_stride);
159       c1[0] = vacc10;
160       c1[1] = vacc11;
161       c1[2] = vacc12;
162       c1[3] = vacc13;
163       c1 = (float*) ((uintptr_t) c1 + cn_stride);
164       c0[0] = vacc00;
165       c0[1] = vacc01;
166       c0[2] = vacc02;
167       c0[3] = vacc03;
168       c0 = (float*) ((uintptr_t) c0 + cn_stride);
169 
170       a3 = (const void*) ((uintptr_t) a3 - kc);
171       a2 = (const void*) ((uintptr_t) a2 - kc);
172       a1 = (const void*) ((uintptr_t) a1 - kc);
173       a0 = (const void*) ((uintptr_t) a0 - kc);
174 
175       nc -= 4;
176     } else {
177       if (nc & 2) {
178         c3[0] = vacc30;
179         c3[1] = vacc31;
180         vacc30 = vacc32;
181         c3 += 2;
182         c2[0] = vacc20;
183         c2[1] = vacc21;
184         vacc20 = vacc22;
185         c2 += 2;
186         c1[0] = vacc10;
187         c1[1] = vacc11;
188         vacc10 = vacc12;
189         c1 += 2;
190         c0[0] = vacc00;
191         c0[1] = vacc01;
192         vacc00 = vacc02;
193         c0 += 2;
194       }
195       if (nc & 1) {
196         c3[0] = vacc30;
197         c2[0] = vacc20;
198         c1[0] = vacc10;
199         c0[0] = vacc00;
200       }
201 
202       nc = 0;
203     }
204   } while (nc != 0);
205 }
206