1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8 
9 #include <stdint.h>
10 #include <stddef.h>
11 
12 #include <xnnpack/math.h>
13 #include <xnnpack/pack.h>
14 
15 
xnn_pack_f32_gemm_goi_w(size_t g,size_t nc,size_t kc,size_t nr,size_t kr,size_t sr,const float * k,const float * b,float * packed_w,const void * params)16 void xnn_pack_f32_gemm_goi_w(
17   size_t g,
18   size_t nc,
19   size_t kc,
20   size_t nr,
21   size_t kr,
22   size_t sr,
23   const float* k,
24   const float* b,
25   float* packed_w,
26   const void* params)
27 {
28   const size_t skr = sr * kr;
29   const size_t skc = round_down_po2(kc, skr);
30   const size_t sr_mask = (sr - 1) * kr;
31   do {
32     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
33       const size_t nr_block_size = min(nc - nr_block_start, nr);
34       if XNN_LIKELY(b != NULL) {
35         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
36           packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
37         }
38       }
39       packed_w += nr;
40 
41       for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
42         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
43           for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
44             *packed_w++ =
45               k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
46           }
47         }
48         packed_w += (nr - nr_block_size) * kr;
49       }
50 
51       for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
52         const size_t kr_block_size = min(kc - kr_block_start, kr);
53         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
54           for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
55             *packed_w++ =
56               k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
57           }
58           packed_w += kr - kr_block_size;
59         }
60         packed_w += (nr - nr_block_size) * kr;
61       }
62     }
63     k += nc * kc;
64     if XNN_UNPREDICTABLE(b != NULL) {
65       b += nc;
66     }
67   } while (--g != 0);
68 }
69 
xnn_pack_f16_gemm_goi_w(size_t g,size_t nc,size_t kc,size_t nr,size_t kr,size_t sr,const uint16_t * k,const uint16_t * b,uint16_t * packed_w,const void * params)70 void xnn_pack_f16_gemm_goi_w(
71   size_t g,
72   size_t nc,
73   size_t kc,
74   size_t nr,
75   size_t kr,
76   size_t sr,
77   const uint16_t* k,
78   const uint16_t* b,
79   uint16_t* packed_w,
80   const void* params)
81 {
82   const size_t skr = sr * kr;
83   const size_t skc = round_down_po2(kc, skr);
84   const size_t sr_mask = (sr - 1) * kr;
85   do {
86     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
87       const size_t nr_block_size = min(nc - nr_block_start, nr);
88       if XNN_LIKELY(b != NULL) {
89         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
90           packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
91         }
92       }
93       packed_w += nr;
94 
95       for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
96         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
97           for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
98             *packed_w++ =
99               k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
100           }
101         }
102         packed_w += (nr - nr_block_size) * kr;
103       }
104 
105       for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
106         const size_t kr_block_size = min(kc - kr_block_start, kr);
107         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
108           for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
109             *packed_w++ =
110               k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
111           }
112           packed_w += kr - kr_block_size;
113         }
114         packed_w += (nr - nr_block_size) * kr;
115       }
116     }
117     k += nc * kc;
118     if XNN_UNPREDICTABLE(b != NULL) {
119       b += nc;
120     }
121   } while (--g != 0);
122 }
123 
xnn_pack_qu8_gemm_goi_w(size_t g,size_t nc,size_t kc,size_t nr,size_t kr,size_t sr,const uint8_t * k,const int32_t * b,void * packed_w,const struct xnn_qu8_packing_params * params)124 void xnn_pack_qu8_gemm_goi_w(
125   size_t g,
126   size_t nc,
127   size_t kc,
128   size_t nr,
129   size_t kr,
130   size_t sr,
131   const uint8_t* k,
132   const int32_t* b,
133   void* packed_w,
134   const struct xnn_qu8_packing_params* params)
135 {
136   assert(sr == 1);
137   const int32_t izp = (int32_t) params->input_zero_point;
138   const int32_t boff = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
139   do {
140     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
141       const size_t nr_block_size = min(nc - nr_block_start, nr);
142       int32_t* packed_b = (int32_t*) packed_w;
143       if XNN_LIKELY(b != NULL) {
144         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
145           *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
146           packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
147         }
148       } else {
149         size_t n = nr_block_size;
150         do {
151           *((int32_t*) packed_w) = boff;
152           packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
153         } while (--n != 0);
154       }
155       packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
156       for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
157         const size_t kr_block_size = min(kc - kr_block_start, kr);
158         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
159           int32_t ksum = 0;
160           for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
161             const uint8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
162             ksum += (int32_t) kv;
163             *((uint8_t*) packed_w) = kv;
164             packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
165           }
166           packed_b[nr_block_offset] -= ksum * izp;
167           packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
168         }
169         packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
170       }
171     }
172     k += nc * kc;
173     if XNN_UNPREDICTABLE(b != NULL) {
174       b += nc;
175     }
176   } while (--g != 0);
177 }
178 
xnn_pack_qs8_gemm_goi_w(size_t g,size_t nc,size_t kc,size_t nr,size_t kr,size_t sr,const int8_t * k,const int32_t * b,void * packed_w,const struct xnn_qs8_packing_params * params)179 void xnn_pack_qs8_gemm_goi_w(
180   size_t g,
181   size_t nc,
182   size_t kc,
183   size_t nr,
184   size_t kr,
185   size_t sr,
186   const int8_t* k,
187   const int32_t* b,
188   void* packed_w,
189   const struct xnn_qs8_packing_params* params)
190 {
191   assert(sr == 1);
192   const int32_t izp = (int32_t) params->input_zero_point;
193   do {
194     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
195       const size_t nr_block_size = min(nc - nr_block_start, nr);
196       int32_t* packed_b = (int32_t*) packed_w;
197       if XNN_LIKELY(b != NULL) {
198         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
199           *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
200           packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
201         }
202       } else {
203         size_t n = nr_block_size;
204         do {
205           *((int32_t*) packed_w) = 0;
206           packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
207         } while (--n != 0);
208       }
209       packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
210       for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
211         const size_t kr_block_size = min(kc - kr_block_start, kr);
212         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
213           int32_t ksum = 0;
214           for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
215             const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
216             ksum += (int32_t) kv;
217             *((int8_t*) packed_w) = kv;
218             packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
219           }
220           packed_b[nr_block_offset] -= ksum * izp;
221           packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int8_t));
222         }
223         packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
224       }
225     }
226     k += nc * kc;
227     if XNN_UNPREDICTABLE(b != NULL) {
228       b += nc;
229     }
230   } while (--g != 0);
231 }
232 
xnn_pack_qs8_gemm_xw_goi_w(size_t g,size_t nc,size_t kc,size_t nr,size_t kr,size_t sr,const int8_t * k,const int32_t * b,void * packed_w,const struct xnn_qs8_packing_params * params)233 void xnn_pack_qs8_gemm_xw_goi_w(
234   size_t g,
235   size_t nc,
236   size_t kc,
237   size_t nr,
238   size_t kr,
239   size_t sr,
240   const int8_t* k,
241   const int32_t* b,
242   void* packed_w,
243   const struct xnn_qs8_packing_params* params)
244 {
245   assert(sr == 1);
246   const int32_t izp = (int32_t) params->input_zero_point;
247   do {
248     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
249       const size_t nr_block_size = min(nc - nr_block_start, nr);
250       int32_t* packed_b = (int32_t*) packed_w;
251       if XNN_LIKELY(b != NULL) {
252         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
253           *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
254           packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
255         }
256       } else {
257         size_t n = nr_block_size;
258         do {
259           *((int32_t*) packed_w) = 0;
260           packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
261         } while (--n != 0);
262       }
263       packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
264       for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
265         const size_t kr_block_size = min(kc - kr_block_start, kr);
266         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
267           int32_t ksum = 0;
268           for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
269             const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
270             ksum += (int32_t) kv;
271             *((int16_t*) packed_w) = (int16_t) kv;
272             packed_w = (void*) ((uintptr_t) packed_w + sizeof(int16_t));
273           }
274           packed_b[nr_block_offset] -= ksum * izp;
275           packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int16_t));
276         }
277         packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int16_t));
278       }
279     }
280     k += nc * kc;
281     if XNN_UNPREDICTABLE(b != NULL) {
282       b += nc;
283     }
284   } while (--g != 0);
285 }
286 
xnn_pack_f32_gemm_io_w(size_t nc,size_t kc,size_t nr,size_t kr,size_t sr,const float * k,const float * b,float * packed_w,const void * params)287 void xnn_pack_f32_gemm_io_w(
288   size_t nc,
289   size_t kc,
290   size_t nr,
291   size_t kr,
292   size_t sr,
293   const float* k,
294   const float* b,
295   float* packed_w,
296   const void* params)
297 {
298   const size_t skr = sr * kr;
299   const size_t skc = round_down_po2(kc, skr);
300   const size_t sr_mask = (sr - 1) * kr;
301   for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
302     const size_t nr_block_size = min(nc - nr_block_start, nr);
303     if XNN_LIKELY(b != NULL) {
304       for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
305         packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
306       }
307     }
308     packed_w += nr;
309 
310     for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
311       for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
312         for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
313           *packed_w++ =
314             k[(round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
315         }
316       }
317       packed_w += (nr - nr_block_size) * kr;
318     }
319 
320     for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
321       const size_t kr_block_size = min(kc - kr_block_start, kr);
322       for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
323         for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
324           *packed_w++ =
325             k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
326         }
327         packed_w += kr - kr_block_size;
328       }
329       packed_w += (nr - nr_block_size) * kr;
330     }
331   }
332 }
333 
xnn_pack_f16_gemm_io_w(size_t nc,size_t kc,size_t nr,size_t kr,size_t sr,const uint16_t * k,const uint16_t * b,uint16_t * packed_w,const void * params)334 void xnn_pack_f16_gemm_io_w(
335   size_t nc,
336   size_t kc,
337   size_t nr,
338   size_t kr,
339   size_t sr,
340   const uint16_t* k,
341   const uint16_t* b,
342   uint16_t* packed_w,
343   const void* params)
344 {
345   const size_t skr = sr * kr;
346   const size_t skc = round_down_po2(kc, skr);
347   const size_t sr_mask = (sr - 1) * kr;
348   for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
349     const size_t nr_block_size = min(nc - nr_block_start, nr);
350     if XNN_LIKELY(b != NULL) {
351       for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
352         packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
353       }
354     }
355     packed_w += nr;
356 
357     for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
358       for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
359         for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
360           *packed_w++ =
361             k[(round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
362         }
363       }
364       packed_w += (nr - nr_block_size) * kr;
365     }
366 
367     for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
368       const size_t kr_block_size = min(kc - kr_block_start, kr);
369       for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
370         for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
371           *packed_w++ =
372             k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
373         }
374         packed_w += kr - kr_block_size;
375       }
376       packed_w += (nr - nr_block_size) * kr;
377     }
378   }
379 }
380 
xnn_pack_qu8_gemm_io_w(size_t nc,size_t kc,size_t nr,size_t kr,size_t sr,const uint8_t * k,const int32_t * b,void * packed_w,const struct xnn_qu8_packing_params * params)381 void xnn_pack_qu8_gemm_io_w(
382   size_t nc,
383   size_t kc,
384   size_t nr,
385   size_t kr,
386   size_t sr,
387   const uint8_t* k,
388   const int32_t* b,
389   void* packed_w,
390   const struct xnn_qu8_packing_params* params)
391 {
392   assert(sr == 1);
393   const int32_t izp = (int32_t) params->input_zero_point;
394   const int32_t boff = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
395   for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
396     const size_t nr_block_size = min(nc - nr_block_start, nr);
397     int32_t* packed_b = (int32_t*) packed_w;
398     if XNN_LIKELY(b != NULL) {
399       for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
400         *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
401         packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
402       }
403     } else {
404       size_t n = nr_block_size;
405       do {
406         *((int32_t*) packed_w) = boff;
407         packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
408       } while (--n != 0);
409     }
410     packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
411     for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
412       const size_t kr_block_size = min(kc - kr_block_start, kr);
413       for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
414         int32_t ksum = 0;
415         for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
416           const uint8_t kv = k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
417           ksum += (int32_t) kv;
418           *((uint8_t*) packed_w) = kv;
419           packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
420         }
421         packed_b[nr_block_offset] -= ksum * izp;
422         packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
423       }
424       packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
425     }
426   }
427 }
428 
xnn_pack_f32_conv_goki_w(size_t g,size_t nc,size_t ks,size_t kc,size_t nr,size_t kr,size_t sr,const float * k,const float * b,float * packed_w,const void * params)429 void xnn_pack_f32_conv_goki_w(
430   size_t g,
431   size_t nc,
432   size_t ks,
433   size_t kc,
434   size_t nr,
435   size_t kr,
436   size_t sr,
437   const float* k,
438   const float* b,
439   float* packed_w,
440   const void* params)
441 {
442   const size_t skr = sr * kr;
443   const size_t skc = round_down_po2(kc, skr);
444   const size_t sr_mask = (sr - 1) * kr;
445   do {
446     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
447       const size_t nr_block_size = min(nc - nr_block_start, nr);
448       if XNN_LIKELY(b != NULL) {
449         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
450           packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
451         }
452       }
453       packed_w += nr;
454 
455       for (size_t ki = 0; ki < ks; ki++) {
456         for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
457           for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
458             for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
459               *packed_w++ =
460                 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
461             }
462           }
463           packed_w += (nr - nr_block_size) * kr;
464         }
465 
466         for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
467           const size_t kr_block_size = min(kc - kr_block_start, kr);
468           for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
469             for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
470               *packed_w++ =
471                 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
472             }
473             packed_w += kr - kr_block_size;
474           }
475           packed_w += (nr - nr_block_size) * kr;
476         }
477       }
478     }
479     k += ks * kc * nc;
480     if XNN_UNPREDICTABLE(b != NULL) {
481       b += nc;
482     }
483   } while (--g != 0);
484 }
485 
xnn_pack_f16_conv_goki_w(size_t g,size_t nc,size_t ks,size_t kc,size_t nr,size_t kr,size_t sr,const uint16_t * k,const uint16_t * b,uint16_t * packed_w,const void * params)486 void xnn_pack_f16_conv_goki_w(
487   size_t g,
488   size_t nc,
489   size_t ks,
490   size_t kc,
491   size_t nr,
492   size_t kr,
493   size_t sr,
494   const uint16_t* k,
495   const uint16_t* b,
496   uint16_t* packed_w,
497   const void* params)
498 {
499   const size_t skr = sr * kr;
500   const size_t skc = round_down_po2(kc, skr);
501   const size_t sr_mask = (sr - 1) * kr;
502   do {
503     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
504       const size_t nr_block_size = min(nc - nr_block_start, nr);
505       if XNN_LIKELY(b != NULL) {
506         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
507           packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
508         }
509       }
510       packed_w += nr;
511 
512       for (size_t ki = 0; ki < ks; ki++) {
513         for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
514           for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
515             for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
516               *packed_w++ =
517                 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
518             }
519           }
520           packed_w += (nr - nr_block_size) * kr;
521         }
522 
523         for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
524           const size_t kr_block_size = min(kc - kr_block_start, kr);
525           for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
526             for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
527               *packed_w++ =
528                 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
529             }
530             packed_w += kr - kr_block_size;
531           }
532           packed_w += (nr - nr_block_size) * kr;
533         }
534       }
535     }
536     k += ks * kc * nc;
537     if XNN_UNPREDICTABLE(b != NULL) {
538       b += nc;
539     }
540   } while (--g != 0);
541 }
542 
xnn_pack_qu8_conv_goki_w(size_t g,size_t nc,size_t ks,size_t kc,size_t nr,size_t kr,size_t sr,const uint8_t * k,const int32_t * b,void * packed_w,const struct xnn_qu8_packing_params * params)543 void xnn_pack_qu8_conv_goki_w(
544   size_t g,
545   size_t nc,
546   size_t ks,
547   size_t kc,
548   size_t nr,
549   size_t kr,
550   size_t sr,
551   const uint8_t* k,
552   const int32_t* b,
553   void* packed_w,
554   const struct xnn_qu8_packing_params* params)
555 {
556   assert(sr == 1);
557   const int32_t izp = (int32_t) params->input_zero_point;
558   const int32_t boff = (int32_t) ks * (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
559   do {
560     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
561       const size_t nr_block_size = min(nc - nr_block_start, nr);
562       int32_t* packed_b = (int32_t*) packed_w;
563       if XNN_LIKELY(b != NULL) {
564         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
565           *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
566           packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
567         }
568       } else {
569         size_t n = nr_block_size;
570         do {
571           *((int32_t*) packed_w) = boff;
572           packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
573         } while (--n != 0);
574       }
575       packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
576       for (size_t ki = 0; ki < ks; ki++) {
577         for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
578           const size_t kr_block_size = min(kc - kr_block_start, kr);
579           for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
580             int32_t ksum = 0;
581             for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
582               const uint8_t kv =
583                 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
584               ksum += (int32_t) kv;
585               *((uint8_t*) packed_w) = kv;
586               packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
587             }
588             packed_b[nr_block_offset] -= ksum * izp;
589             packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
590           }
591           packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
592         }
593       }
594     }
595     k += ks * kc * nc;
596     if XNN_UNPREDICTABLE(b != NULL) {
597       b += nc;
598     }
599   } while (--g != 0);
600 }
601 
xnn_pack_qs8_conv_goki_w(size_t g,size_t nc,size_t ks,size_t kc,size_t nr,size_t kr,size_t sr,const int8_t * k,const int32_t * b,void * packed_w,const struct xnn_qs8_packing_params * params)602 void xnn_pack_qs8_conv_goki_w(
603   size_t g,
604   size_t nc,
605   size_t ks,
606   size_t kc,
607   size_t nr,
608   size_t kr,
609   size_t sr,
610   const int8_t* k,
611   const int32_t* b,
612   void* packed_w,
613   const struct xnn_qs8_packing_params* params)
614 {
615   assert(sr == 1);
616   const int32_t izp = (int32_t) params->input_zero_point;
617   do {
618     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
619       const size_t nr_block_size = min(nc - nr_block_start, nr);
620       int32_t* packed_b = (int32_t*) packed_w;
621       if XNN_LIKELY(b != NULL) {
622         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
623           *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
624           packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
625         }
626       } else {
627         size_t n = nr_block_size;
628         do {
629           *((int32_t*) packed_w) = 0;
630           packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
631         } while (--n != 0);
632       }
633       packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
634       for (size_t ki = 0; ki < ks; ki++) {
635         for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
636           const size_t kr_block_size = min(kc - kr_block_start, kr);
637           for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
638             int32_t ksum = 0;
639             for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
640               const int8_t kv =
641                 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
642               ksum += (int32_t) kv;
643               *((int8_t*) packed_w) = kv;
644               packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
645             }
646             packed_b[nr_block_offset] -= ksum * izp;
647             packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int8_t));
648           }
649           packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
650         }
651       }
652     }
653     k += ks * kc * nc;
654     if XNN_UNPREDICTABLE(b != NULL) {
655       b += nc;
656     }
657   } while (--g != 0);
658 }
659 
xnn_pack_f32_conv_kgo_w(size_t g,size_t nc,size_t ks,size_t nr,size_t kr,const float * k,const float * b,float * packed_w,const void * params)660 void xnn_pack_f32_conv_kgo_w(
661   size_t g,
662   size_t nc,
663   size_t ks,
664   size_t nr,
665   size_t kr,
666   const float* k,
667   const float* b,
668   float* packed_w,
669   const void* params)
670 {
671   for (size_t i = 0; i < g; i++) {
672     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
673       const size_t nr_block_size = min(nc - nr_block_start, nr);
674       if XNN_LIKELY(b != NULL) {
675         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
676           packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
677         }
678       }
679       packed_w += nr;
680       for (size_t ki = 0; ki < ks; ki++) {
681         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
682           *packed_w =
683             k[ki * g * nc + (nr_block_start + nr_block_offset)];
684           packed_w += kr;
685         }
686         packed_w += (nr - nr_block_size) * kr;
687       }
688     }
689     k += nc;
690     if XNN_UNPREDICTABLE(b != NULL) {
691       b += nc;
692     }
693   }
694 }
695 
xnn_pack_f16_conv_kgo_w(size_t g,size_t nc,size_t ks,size_t nr,size_t kr,const uint16_t * k,const uint16_t * b,uint16_t * packed_w,const void * params)696 void xnn_pack_f16_conv_kgo_w(
697   size_t g,
698   size_t nc,
699   size_t ks,
700   size_t nr,
701   size_t kr,
702   const uint16_t* k,
703   const uint16_t* b,
704   uint16_t* packed_w,
705   const void* params)
706 {
707   for (size_t i = 0; i < g; i++) {
708     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
709       const size_t nr_block_size = min(nc - nr_block_start, nr);
710       if XNN_LIKELY(b != NULL) {
711         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
712           packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
713         }
714       }
715       packed_w += nr;
716       for (size_t ki = 0; ki < ks; ki++) {
717         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
718           *packed_w =
719             k[ki * g * nc + (nr_block_start + nr_block_offset)];
720           packed_w += kr;
721         }
722         packed_w += (nr - nr_block_size) * kr;
723       }
724     }
725     k += nc;
726     if XNN_UNPREDICTABLE(b != NULL) {
727       b += nc;
728     }
729   }
730 }
731 
xnn_pack_qu8_conv_kgo_w(size_t g,size_t nc,size_t ks,size_t nr,size_t kr,const uint8_t * k,const int32_t * b,void * packed_w,const struct xnn_qu8_packing_params * params)732 void xnn_pack_qu8_conv_kgo_w(
733   size_t g,
734   size_t nc,
735   size_t ks,
736   size_t nr,
737   size_t kr,
738   const uint8_t* k,
739   const int32_t* b,
740   void* packed_w,
741   const struct xnn_qu8_packing_params* params)
742 {
743   const int32_t izp = (int32_t) params->input_zero_point;
744   const int32_t boff = (int32_t) ks * izp * (int32_t) params->kernel_zero_point;
745   for (size_t i = 0; i < g; i++) {
746     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
747       const size_t nr_block_size = min(nc - nr_block_start, nr);
748       int32_t* packed_b = (int32_t*) packed_w;
749       if XNN_LIKELY(b != NULL) {
750         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
751           *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
752           packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
753         }
754       } else {
755         size_t n = nr_block_size;
756         do {
757           *((int32_t*) packed_w) = boff;
758           packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
759         } while (--n != 0);
760       }
761       packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
762       for (size_t ki = 0; ki < ks; ki++) {
763         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
764           const uint8_t kv =
765             k[ki * g * nc + (nr_block_start + nr_block_offset)];
766           *((uint8_t*) packed_w) = kv;
767           packed_b[nr_block_offset] -= (int32_t) kv * izp;
768           packed_w = (void*) ((uintptr_t) packed_w + kr * sizeof(uint8_t));
769         }
770         packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
771       }
772     }
773     k += nc;
774     if XNN_UNPREDICTABLE(b != NULL) {
775       b += nc;
776     }
777   }
778 }
779 
xnn_pack_qs8_conv_kgo_w(size_t g,size_t nc,size_t ks,size_t nr,size_t kr,const int8_t * k,const int32_t * b,void * packed_w,const struct xnn_qs8_packing_params * params)780 void xnn_pack_qs8_conv_kgo_w(
781   size_t g,
782   size_t nc,
783   size_t ks,
784   size_t nr,
785   size_t kr,
786   const int8_t* k,
787   const int32_t* b,
788   void* packed_w,
789   const struct xnn_qs8_packing_params* params)
790 {
791   const int32_t izp = (int32_t) params->input_zero_point;
792   for (size_t i = 0; i < g; i++) {
793     for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
794       const size_t nr_block_size = min(nc - nr_block_start, nr);
795       int32_t* packed_b = (int32_t*) packed_w;
796       if XNN_LIKELY(b != NULL) {
797         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
798           *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
799           packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
800         }
801       } else {
802         size_t n = nr_block_size;
803         do {
804           *((int32_t*) packed_w) = 0;
805           packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
806         } while (--n != 0);
807       }
808       packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
809       for (size_t ki = 0; ki < ks; ki++) {
810         for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
811           const int8_t kv =
812             k[ki * g * nc + (nr_block_start + nr_block_offset)];
813           *((int8_t*) packed_w) = kv;
814           packed_b[nr_block_offset] -= (int32_t) kv * izp;
815           packed_w = (void*) ((uintptr_t) packed_w + kr * sizeof(int8_t));
816         }
817         packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
818       }
819     }
820     k += nc;
821     if XNN_UNPREDICTABLE(b != NULL) {
822       b += nc;
823     }
824   }
825 }
826 
xnn_pack_f32_deconv_goki_w(size_t g,size_t nc,size_t kh,size_t kw,size_t kc,size_t sh,size_t sw,size_t nr,size_t kr,size_t sr,const float * k,const float * b,float * packed_w,struct subconvolution_params * subconv_params,const void * params)827 void xnn_pack_f32_deconv_goki_w(
828   size_t g,
829   size_t nc,
830   size_t kh,
831   size_t kw,
832   size_t kc,
833   size_t sh,
834   size_t sw,
835   size_t nr,
836   size_t kr,
837   size_t sr,
838   const float* k,
839   const float* b,
840   float* packed_w,
841   struct subconvolution_params* subconv_params,
842   const void* params)
843 {
844   const size_t skr = sr * kr;
845   const size_t skc = round_down_po2(kc, skr);
846   const size_t sr_mask = (sr - 1) * kr;
847   for (size_t i = 0; i < g; i++) {
848     for (size_t oy = 0; oy < sh; oy++) {
849       for (size_t ox = 0; ox < sw; ox++) {
850         if (i == 0) {
851           (*subconv_params++).weights = packed_w;
852         }
853         for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
854           const size_t nr_block_size = min(nc - nr_block_start, nr);
855           if XNN_LIKELY(b != NULL) {
856             for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
857               packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
858             }
859           }
860           packed_w += nr;
861           for (size_t ky = oy; ky < kh; ky += sh) {
862             for (size_t kx = ox; kx < kw; kx += sw) {
863               for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
864                 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
865                   for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
866                     *packed_w++ =
867                       k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
868                   }
869                 }
870                 packed_w += (nr - nr_block_size) * kr;
871               }
872 
873               for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
874                 const size_t kr_block_size = min(kc - kr_block_start, kr);
875                 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
876                   for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
877                     *packed_w++ =
878                       k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
879                   }
880                   packed_w += kr - kr_block_size;
881                 }
882                 packed_w += (nr - nr_block_size) * kr;
883               }
884             }
885           }
886         }
887       }
888     }
889     k += kh * kw * kc * nc;
890     if XNN_UNPREDICTABLE(b != NULL) {
891       b += nc;
892     }
893   }
894 }
895 
xnn_pack_f16_deconv_goki_w(size_t g,size_t nc,size_t kh,size_t kw,size_t kc,size_t sh,size_t sw,size_t nr,size_t kr,size_t sr,const uint16_t * k,const uint16_t * b,uint16_t * packed_w,struct subconvolution_params * subconv_params,const void * params)896 void xnn_pack_f16_deconv_goki_w(
897   size_t g,
898   size_t nc,
899   size_t kh,
900   size_t kw,
901   size_t kc,
902   size_t sh,
903   size_t sw,
904   size_t nr,
905   size_t kr,
906   size_t sr,
907   const uint16_t* k,
908   const uint16_t* b,
909   uint16_t* packed_w,
910   struct subconvolution_params* subconv_params,
911   const void* params)
912 {
913   const size_t skr = sr * kr;
914   const size_t skc = round_down_po2(kc, skr);
915   const size_t sr_mask = (sr - 1) * kr;
916   for (size_t i = 0; i < g; i++) {
917     for (size_t oy = 0; oy < sh; oy++) {
918       for (size_t ox = 0; ox < sw; ox++) {
919         if (i == 0) {
920           (*subconv_params++).weights = packed_w;
921         }
922         for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
923           const size_t nr_block_size = min(nc - nr_block_start, nr);
924           if XNN_LIKELY(b != NULL) {
925             for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
926               packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
927             }
928           }
929           packed_w += nr;
930           for (size_t ky = oy; ky < kh; ky += sh) {
931             for (size_t kx = ox; kx < kw; kx += sw) {
932               for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
933                 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
934                   for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
935                     *packed_w++ =
936                       k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
937                   }
938                 }
939                 packed_w += (nr - nr_block_size) * kr;
940               }
941 
942               for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
943                 const size_t kr_block_size = min(kc - kr_block_start, kr);
944                 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
945                   for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
946                     *packed_w++ =
947                       k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
948                   }
949                   packed_w += kr - kr_block_size;
950                 }
951                 packed_w += (nr - nr_block_size) * kr;
952               }
953             }
954           }
955         }
956       }
957     }
958     k += kh * kw * kc * nc;
959     if XNN_UNPREDICTABLE(b != NULL) {
960       b += nc;
961     }
962   }
963 }
964 
xnn_pack_qu8_deconv_goki_w(size_t g,size_t nc,size_t kh,size_t kw,size_t kc,size_t sh,size_t sw,size_t nr,size_t kr,size_t sr,const uint8_t * k,const int32_t * b,void * packed_w,struct subconvolution_params * subconv_params,const struct xnn_qu8_packing_params * params)965 void xnn_pack_qu8_deconv_goki_w(
966   size_t g,
967   size_t nc,
968   size_t kh,
969   size_t kw,
970   size_t kc,
971   size_t sh,
972   size_t sw,
973   size_t nr,
974   size_t kr,
975   size_t sr,
976   const uint8_t* k,
977   const int32_t* b,
978   void* packed_w,
979   struct subconvolution_params* subconv_params,
980   const struct xnn_qu8_packing_params* params)
981 {
982   assert(sr == 1);
983   const int32_t izp = (int32_t) params->input_zero_point;
984   const int32_t kzp = (int32_t) params->kernel_zero_point;
985   for (size_t i = 0; i < g; i++) {
986     for (size_t oy = 0; oy < sh; oy++) {
987       for (size_t ox = 0; ox < sw; ox++) {
988         if (i == 0) {
989           (*subconv_params++).weights = packed_w;
990         }
991         const int32_t boff = (int32_t) divide_round_up(kh - oy, sh) * (int32_t) divide_round_up(kw - ox, sw) * (int32_t) kc * izp * kzp;
992         for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
993           const size_t nr_block_size = min(nc - nr_block_start, nr);
994           int32_t* packed_b = (int32_t*) packed_w;
995           if XNN_LIKELY(b != 0) {
996             for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
997               *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
998               packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
999             }
1000           } else {
1001             size_t n = nr_block_size;
1002             do {
1003               *((int32_t*) packed_w) = boff;
1004               packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1005             } while (--n != 0);
1006           }
1007           packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
1008           for (size_t ky = oy; ky < kh; ky += sh) {
1009             for (size_t kx = ox; kx < kw; kx += sw) {
1010               for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
1011                 const size_t kr_block_size = min(kc - kr_block_start, kr);
1012                 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1013                   int32_t ksum = 0;
1014                   for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
1015                     const uint8_t kv =
1016                       k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
1017                     ksum += (int32_t) kv;
1018                     *((uint8_t*) packed_w) = kv;
1019                     packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
1020                   }
1021                   packed_b[nr_block_offset] -= ksum * izp;
1022                   packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
1023                 }
1024                 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
1025               }
1026             }
1027           }
1028         }
1029       }
1030     }
1031     k += kh * kw * kc * nc;
1032     if XNN_UNPREDICTABLE(b != NULL) {
1033       b += nc;
1034     }
1035   }
1036 }
1037 
xnn_pack_f32_dwconv_ghw_w(size_t h,size_t w,size_t c,size_t cr,const float * k,const float * b,float * packed_w,const void * params)1038 void xnn_pack_f32_dwconv_ghw_w(
1039   size_t h,
1040   size_t w,
1041   size_t c,
1042   size_t cr,
1043   const float* k,
1044   const float* b,
1045   float* packed_w,
1046   const void* params)
1047 {
1048   for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1049     const size_t cr_block_size = min(c - cr_block_start, cr);
1050     if XNN_LIKELY(b != NULL) {
1051       for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1052         *packed_w++ = b[cr_block_start + cr_block_offset];
1053       }
1054     } else {
1055       size_t n = cr_block_size;
1056       do {
1057         *packed_w++ = 0.0f;
1058       } while (--n != 0);
1059     }
1060     packed_w += cr - cr_block_size;
1061     for (size_t x = 0; x < w; x++) {
1062       for (size_t y = 0; y < h; y++) {
1063         for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1064           const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1065           *packed_w++ = kv;
1066         }
1067         packed_w += cr - cr_block_size;
1068       }
1069     }
1070   }
1071 }
1072 
xnn_pack_f16_dwconv_ghw_w(size_t h,size_t w,size_t c,size_t cr,const uint16_t * k,const uint16_t * b,uint16_t * packed_w,const void * params)1073 void xnn_pack_f16_dwconv_ghw_w(
1074   size_t h,
1075   size_t w,
1076   size_t c,
1077   size_t cr,
1078   const uint16_t* k,
1079   const uint16_t* b,
1080   uint16_t* packed_w,
1081   const void* params)
1082 {
1083   for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1084     const size_t cr_block_size = min(c - cr_block_start, cr);
1085     if XNN_LIKELY(b != NULL) {
1086       for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1087         *packed_w++ = b[cr_block_start + cr_block_offset];
1088       }
1089     } else {
1090       size_t n = cr_block_size;
1091       do {
1092         *packed_w++ = 0;
1093       } while (--n != 0);
1094     }
1095     packed_w += cr - cr_block_size;
1096     for (size_t x = 0; x < w; x++) {
1097       for (size_t y = 0; y < h; y++) {
1098         for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1099           const uint16_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1100           *packed_w++ = kv;
1101         }
1102         packed_w += cr - cr_block_size;
1103       }
1104     }
1105   }
1106 }
1107 
xnn_pack_qu8_dwconv_ghw_w(size_t h,size_t w,size_t c,size_t cr,const uint8_t * k,const int32_t * b,void * packed_w,const struct xnn_qu8_packing_params * params)1108 void xnn_pack_qu8_dwconv_ghw_w(
1109   size_t h,
1110   size_t w,
1111   size_t c,
1112   size_t cr,
1113   const uint8_t* k,
1114   const int32_t* b,
1115   void* packed_w,
1116   const struct xnn_qu8_packing_params* params)
1117 {
1118   const int32_t izp = (int32_t) params->input_zero_point;
1119   const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
1120   for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1121     const size_t cr_block_size = min(c - cr_block_start, cr);
1122     int32_t* packed_b = (int32_t*) packed_w;
1123     if XNN_LIKELY(b != NULL) {
1124       for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1125         *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
1126         packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1127       }
1128     } else {
1129       size_t n = cr_block_size;
1130       do {
1131         *((int32_t*) packed_w) = boff;
1132         packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1133       } while (--n != 0);
1134     }
1135     packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1136     for (size_t x = 0; x < w; x++) {
1137       for (size_t y = 0; y < h; y++) {
1138         for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1139           const uint8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1140           packed_b[cr_block_offset] -= (int32_t) kv * izp;
1141           *((uint8_t*) packed_w) = kv;
1142           packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
1143         }
1144         packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
1145       }
1146     }
1147   }
1148 }
1149 
xnn_pack_qs8_dwconv_ghw_w(size_t h,size_t w,size_t c,size_t cr,const int8_t * k,const int32_t * b,void * packed_w,const struct xnn_qs8_packing_params * params)1150 void xnn_pack_qs8_dwconv_ghw_w(
1151   size_t h,
1152   size_t w,
1153   size_t c,
1154   size_t cr,
1155   const int8_t* k,
1156   const int32_t* b,
1157   void* packed_w,
1158   const struct xnn_qs8_packing_params* params)
1159 {
1160   const int32_t izp = (int32_t) params->input_zero_point;
1161   for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1162     const size_t cr_block_size = min(c - cr_block_start, cr);
1163     int32_t* packed_b = (int32_t*) packed_w;
1164     if XNN_LIKELY(b != NULL) {
1165       for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1166         *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset];
1167         packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1168       }
1169     } else {
1170       size_t n = cr_block_size;
1171       do {
1172         *((int32_t*) packed_w) = 0;
1173         packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1174       } while (--n != 0);
1175     }
1176     packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1177     for (size_t x = 0; x < w; x++) {
1178       for (size_t y = 0; y < h; y++) {
1179         for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1180           const int8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1181           packed_b[cr_block_offset] -= (int32_t) kv * izp;
1182           *((int8_t*) packed_w) = kv;
1183           packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
1184         }
1185         packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int8_t));
1186       }
1187     }
1188   }
1189 }
1190 
xnn_pack_f32_dwconv_hwg_w(size_t h,size_t w,size_t c,size_t cr,const float * k,const float * b,float * packed_w,const void * params)1191 void xnn_pack_f32_dwconv_hwg_w(
1192   size_t h,
1193   size_t w,
1194   size_t c,
1195   size_t cr,
1196   const float* k,
1197   const float* b,
1198   float* packed_w,
1199   const void* params)
1200 {
1201   for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1202     const size_t cr_block_size = min(c - cr_block_start, cr);
1203     if XNN_LIKELY(b != NULL) {
1204       for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1205         *packed_w++ = b[cr_block_start + cr_block_offset];
1206       }
1207     } else {
1208       size_t n = cr_block_size;
1209       do {
1210         *packed_w++ = 0.0f;
1211       } while (--n != 0);
1212     }
1213     packed_w += cr - cr_block_size;
1214     for (size_t x = 0; x < w; x++) {
1215       for (size_t y = 0; y < h; y++) {
1216         for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1217           const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
1218           *packed_w++ = kv;
1219         }
1220         packed_w += cr - cr_block_size;
1221       }
1222     }
1223   }
1224 }
1225 
xnn_pack_f16_dwconv_hwg_w(size_t h,size_t w,size_t c,size_t cr,const uint16_t * k,const uint16_t * b,uint16_t * packed_w,const void * params)1226 void xnn_pack_f16_dwconv_hwg_w(
1227   size_t h,
1228   size_t w,
1229   size_t c,
1230   size_t cr,
1231   const uint16_t* k,
1232   const uint16_t* b,
1233   uint16_t* packed_w,
1234   const void* params)
1235 {
1236   for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1237     const size_t cr_block_size = min(c - cr_block_start, cr);
1238     if XNN_LIKELY(b != NULL) {
1239       for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1240         *packed_w++ = b[cr_block_start + cr_block_offset];
1241       }
1242     } else {
1243       size_t n = cr_block_size;
1244       do {
1245         *packed_w++ = 0;
1246       } while (--n != 0);
1247     }
1248     packed_w += cr - cr_block_size;
1249     for (size_t x = 0; x < w; x++) {
1250       for (size_t y = 0; y < h; y++) {
1251         for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1252           const uint16_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
1253           *packed_w++ = kv;
1254         }
1255         packed_w += cr - cr_block_size;
1256       }
1257     }
1258   }
1259 }
1260 
xnn_pack_qu8_dwconv_hwg_w(size_t h,size_t w,size_t c,size_t cr,const uint8_t * k,const int32_t * b,void * packed_w,const struct xnn_qu8_packing_params * params)1261 void xnn_pack_qu8_dwconv_hwg_w(
1262   size_t h,
1263   size_t w,
1264   size_t c,
1265   size_t cr,
1266   const uint8_t* k,
1267   const int32_t* b,
1268   void* packed_w,
1269   const struct xnn_qu8_packing_params* params)
1270 {
1271   const int32_t izp = (int32_t) params->input_zero_point;
1272   const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
1273   for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1274     const size_t cr_block_size = min(c - cr_block_start, cr);
1275     int32_t* packed_b = (int32_t*) packed_w;
1276     if XNN_LIKELY(b != NULL) {
1277       for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1278         *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
1279         packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1280       }
1281     } else {
1282       size_t n = cr_block_size;
1283       do {
1284         *((int32_t*) packed_w) = boff;
1285         packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1286       } while (--n != 0);
1287     }
1288     packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1289     for (size_t x = 0; x < w; x++) {
1290       for (size_t y = 0; y < h; y++) {
1291         for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1292           const uint8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
1293           packed_b[cr_block_offset] -= (int32_t) kv * izp;
1294           *((uint8_t*) packed_w) = kv;
1295           packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
1296         }
1297         packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
1298       }
1299     }
1300   }
1301 }
1302 
xnn_pack_qs8_dwconv_hwg_w(size_t h,size_t w,size_t c,size_t cr,const int8_t * k,const int32_t * b,void * packed_w,const struct xnn_qs8_packing_params * params)1303 void xnn_pack_qs8_dwconv_hwg_w(
1304   size_t h,
1305   size_t w,
1306   size_t c,
1307   size_t cr,
1308   const int8_t* k,
1309   const int32_t* b,
1310   void* packed_w,
1311   const struct xnn_qs8_packing_params* params)
1312 {
1313   const int32_t izp = (int32_t) params->input_zero_point;
1314   for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1315     const size_t cr_block_size = min(c - cr_block_start, cr);
1316     int32_t* packed_b = (int32_t*) packed_w;
1317     if XNN_LIKELY(b != NULL) {
1318       for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1319         *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset];
1320         packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1321       }
1322     } else {
1323       size_t n = cr_block_size;
1324       do {
1325         *((int32_t*) packed_w) = 0;
1326         packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1327       } while (--n != 0);
1328     }
1329     packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1330     for (size_t x = 0; x < w; x++) {
1331       for (size_t y = 0; y < h; y++) {
1332         for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1333           const int8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
1334           packed_b[cr_block_offset] -= (int32_t) kv * izp;
1335           *((int8_t*) packed_w) = kv;
1336           packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
1337         }
1338         packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int8_t));
1339       }
1340     }
1341   }
1342 }
1343 
// Packs float GEMM weights (no bias) for g groups of nc output channels with
// kc input channels each.
//
// Source: the weight for output channel n and input channel ci of the current
// group is k[n * kc + ci]; k advances by nc * kc after each group.
//
// Destination, per group and per block of nr output channels: one slice of
// nr * kr elements per kr-sized step over the input channels. In the first
// skc = round_down_po2(kc, sr * kr) input channels, channel n of the block
// takes its kr values from input-channel offset
//   round_down_po2(step, sr * kr) + ((step + n * kr) & ((sr - 1) * kr));
// past skc the values are taken in order and a short final chunk leaves its
// trailing slots unwritten. Slots of channels beyond nc are skipped as well.
//
// The group loop is a do/while, so the body runs once before g is tested:
// callers must pass g >= 1. params is not used.
void xnn_pack_f32_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  float* packed_w,
  const void* params)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;
  // Elements occupied in the destination by one kr-step of a full channel block.
  const size_t slice_size = nr * kr;

  do {
    for (size_t n_start = 0; n_start < nc; n_start += nr) {
      const size_t n_count = min(nc - n_start, nr);

      // Input channels below skc: per-channel permuted chunk order.
      for (size_t k_start = 0; k_start < skc; k_start += kr) {
        const size_t span_base = round_down_po2(k_start, skr);
        for (size_t n = 0; n < n_count; n++) {
          const size_t src = (n_start + n) * kc + span_base + ((k_start + n * kr) & sr_mask);
          for (size_t i = 0; i < kr; i++) {
            packed_w[n * kr + i] = k[src + i];
          }
        }
        packed_w += slice_size;
      }

      // Remaining input channels: straight copy, possibly a short chunk.
      for (size_t k_start = skc; k_start < kc; k_start += kr) {
        const size_t k_count = min(kc - k_start, kr);
        for (size_t n = 0; n < n_count; n++) {
          const size_t src = (n_start + n) * kc + k_start;
          for (size_t i = 0; i < k_count; i++) {
            packed_w[n * kr + i] = k[src + i];
          }
        }
        packed_w += slice_size;
      }
    }
    k += nc * kc;
  } while (--g != 0);
}
1387 
// Packs 16-bit GEMM weights (no bias, copied bit-for-bit) for g groups of nc
// output channels with kc input channels each.
//
// Source: the weight for output channel n and input channel ci of the current
// group is k[n * kc + ci]; k advances by nc * kc after each group.
//
// Destination, per group and per block of nr output channels: one slice of
// nr * kr elements per kr-sized step over the input channels. In the first
// skc = round_down_po2(kc, sr * kr) input channels, channel n of the block
// takes its kr values from input-channel offset
//   round_down_po2(step, sr * kr) + ((step + n * kr) & ((sr - 1) * kr));
// past skc the values are taken in order and a short final chunk leaves its
// trailing slots unwritten. Slots of channels beyond nc are skipped as well.
//
// The group loop is a do/while, so the body runs once before g is tested:
// callers must pass g >= 1. params is not used.
void xnn_pack_f16_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  uint16_t* packed_w,
  const void* params)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;
  // Elements occupied in the destination by one kr-step of a full channel block.
  const size_t slice_size = nr * kr;

  do {
    for (size_t n_start = 0; n_start < nc; n_start += nr) {
      const size_t n_count = min(nc - n_start, nr);

      // Input channels below skc: per-channel permuted chunk order.
      for (size_t k_start = 0; k_start < skc; k_start += kr) {
        const size_t span_base = round_down_po2(k_start, skr);
        for (size_t n = 0; n < n_count; n++) {
          const size_t src = (n_start + n) * kc + span_base + ((k_start + n * kr) & sr_mask);
          for (size_t i = 0; i < kr; i++) {
            packed_w[n * kr + i] = k[src + i];
          }
        }
        packed_w += slice_size;
      }

      // Remaining input channels: straight copy, possibly a short chunk.
      for (size_t k_start = skc; k_start < kc; k_start += kr) {
        const size_t k_count = min(kc - k_start, kr);
        for (size_t n = 0; n < n_count; n++) {
          const size_t src = (n_start + n) * kc + k_start;
          for (size_t i = 0; i < k_count; i++) {
            packed_w[n * kr + i] = k[src + i];
          }
        }
        packed_w += slice_size;
      }
    }
    k += nc * kc;
  } while (--g != 0);
}
1431 
xnn_pack_f32_dconv_oki_w(size_t nc,size_t kc,size_t nr,size_t kh,size_t kw,const float * k,const float * b,float * packed_w,const void * params)1432 void xnn_pack_f32_dconv_oki_w(
1433   size_t nc,
1434   size_t kc,
1435   size_t nr,
1436   size_t kh,
1437   size_t kw,
1438   const float* k,
1439   const float* b,
1440   float* packed_w,
1441   const void* params)
1442 {
1443   for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1444     const size_t nr_block_size = min(nc - nr_block_start, nr);
1445     if XNN_LIKELY(b != NULL) {
1446       for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1447         *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
1448       }
1449     } else {
1450       size_t n = nr;
1451       do {
1452         *packed_w++ = 0.0f;
1453       } while (--n != 0);
1454     }
1455 
1456     for (size_t kx = 0; kx < kw; kx++) {
1457       for (size_t c = 0; c < kc; c++) {
1458         for (size_t ky = 0; ky < kh; ky++) {
1459           for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1460             *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
1461           }
1462         }
1463       }
1464     }
1465     if XNN_UNPREDICTABLE(b != NULL) {
1466       b += nr;
1467     }
1468   }
1469 }
1470 
xnn_pack_f16_dconv_oki_w(size_t nc,size_t kc,size_t nr,size_t kh,size_t kw,const uint16_t * k,const uint16_t * b,uint16_t * packed_w,const void * params)1471 void xnn_pack_f16_dconv_oki_w(
1472   size_t nc,
1473   size_t kc,
1474   size_t nr,
1475   size_t kh,
1476   size_t kw,
1477   const uint16_t* k,
1478   const uint16_t* b,
1479   uint16_t* packed_w,
1480   const void* params)
1481 {
1482   for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1483     const size_t nr_block_size = min(nc - nr_block_start, nr);
1484     if XNN_LIKELY(b != NULL) {
1485       for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1486         *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
1487       }
1488     } else {
1489       size_t n = nr;
1490       do {
1491         *packed_w++ = 0;
1492       } while (--n != 0);
1493     }
1494 
1495     for (size_t kx = 0; kx < kw; kx++) {
1496       for (size_t c = 0; c < kc; c++) {
1497         for (size_t ky = 0; ky < kh; ky++) {
1498           for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1499             *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
1500           }
1501         }
1502       }
1503     }
1504     if XNN_UNPREDICTABLE(b != NULL) {
1505       b += nr;
1506     }
1507   }
1508 }
1509 
xnn_pack_f32_chw_dwconv_ghw_w(size_t kernel_size,size_t groups,const float * kernel,const float * bias,float * packed_weights,const void * params)1510 void xnn_pack_f32_chw_dwconv_ghw_w(
1511   size_t kernel_size,
1512   size_t groups,
1513   const float* kernel,
1514   const float* bias,
1515   float* packed_weights,
1516   const void* params)
1517 {
1518   for (size_t g = 0; g < groups; g++) {
1519     if XNN_LIKELY(bias != NULL) {
1520       *packed_weights = *bias++;
1521     } else {
1522       *packed_weights = 0.0f;
1523     }
1524     packed_weights += 1;
1525     for (size_t i = 0; i < kernel_size; i++) {
1526       *packed_weights++ = kernel[g * kernel_size + i];
1527     }
1528   }
1529 }
1530 
xnn_pack_f16_chw_dwconv_ghw_w(size_t kernel_size,size_t groups,const uint16_t * kernel,const uint16_t * bias,uint16_t * packed_weights,const void * params)1531 void xnn_pack_f16_chw_dwconv_ghw_w(
1532   size_t kernel_size,
1533   size_t groups,
1534   const uint16_t* kernel,
1535   const uint16_t* bias,
1536   uint16_t* packed_weights,
1537   const void* params)
1538 {
1539   for (size_t g = 0; g < groups; g++) {
1540     if XNN_LIKELY(bias != NULL) {
1541       *packed_weights = *bias++;
1542     } else {
1543       *packed_weights = 0;
1544     }
1545     packed_weights += 1;
1546     for (size_t i = 0; i < kernel_size; i++) {
1547       *packed_weights++ = kernel[g * kernel_size + i];
1548     }
1549   }
1550 }
1551 
xnn_pack_f32_chw_dwconv_hwg_w(size_t kernel_size,size_t groups,const float * kernel,const float * bias,float * packed_weights,const void * params)1552 void xnn_pack_f32_chw_dwconv_hwg_w(
1553   size_t kernel_size,
1554   size_t groups,
1555   const float* kernel,
1556   const float* bias,
1557   float* packed_weights,
1558   const void* params)
1559 {
1560   for (size_t g = 0; g < groups; g++) {
1561     if XNN_LIKELY(bias != NULL) {
1562       *packed_weights = *bias++;
1563     } else {
1564       *packed_weights = 0.0f;
1565     }
1566     packed_weights += 1;
1567     for (size_t i = 0; i < kernel_size; i++) {
1568       *packed_weights++ = kernel[i * groups + g];
1569     }
1570   }
1571 }
1572 
xnn_pack_f32_vmulcaddc_w(size_t c,size_t cr,const float * s,const float * b,float * packed_w,const void * params)1573 void xnn_pack_f32_vmulcaddc_w(
1574   size_t c,
1575   size_t cr,
1576   const float* s,
1577   const float* b,
1578   float* packed_w,
1579   const void* params)
1580 {
1581   for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1582     const size_t cr_block_size = min(c - cr_block_start, cr);
1583     for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1584       *packed_w++ = s[cr_block_start + cr_block_offset];
1585     }
1586     packed_w += cr - cr_block_size;
1587     if XNN_LIKELY(b != NULL) {
1588       for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1589         *packed_w++ = b[cr_block_start + cr_block_offset];
1590       }
1591     } else {
1592       size_t n = cr_block_size;
1593       do {
1594         *packed_w++ = 0.0f;
1595       } while (--n != 0);
1596     }
1597     packed_w += cr - cr_block_size;
1598   }
1599 }
1600 
xnn_pack_f16_vmulcaddc_w(size_t c,size_t cr,const uint16_t * s,const uint16_t * b,uint16_t * packed_w,const void * params)1601 void xnn_pack_f16_vmulcaddc_w(
1602   size_t c,
1603   size_t cr,
1604   const uint16_t* s,
1605   const uint16_t* b,
1606   uint16_t* packed_w,
1607   const void* params)
1608 {
1609   for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1610     const size_t cr_block_size = min(c - cr_block_start, cr);
1611     for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1612       *packed_w++ = s[cr_block_start + cr_block_offset];
1613     }
1614     packed_w += cr - cr_block_size;
1615     if XNN_LIKELY(b != NULL) {
1616       for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1617         *packed_w++ = b[cr_block_start + cr_block_offset];
1618       }
1619     } else {
1620       size_t n = cr_block_size;
1621       do {
1622         *packed_w++ = 0;
1623       } while (--n != 0);
1624     }
1625     packed_w += cr - cr_block_size;
1626   }
1627 }
1628