1 // Auto-generated file. Do not edit!
2 // Template: src/qs8-gemm/c2-neon-mull-padal-dup.c.in
3 // Generator: tools/xngen
4 //
5 // Copyright 2021 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9
10 #include <assert.h>
11
12 #include <arm_neon.h>
13
14 #include <xnnpack/gemm.h>
15 #include <xnnpack/math.h>
16
17
xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup(size_t mr,size_t nc,size_t kc,const int8_t * restrict a,size_t a_stride,const void * restrict w,int8_t * restrict c,size_t cm_stride,size_t cn_stride,const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS (1)])18 void xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup(
19 size_t mr,
20 size_t nc,
21 size_t kc,
22 const int8_t* restrict a,
23 size_t a_stride,
24 const void* restrict w,
25 int8_t* restrict c,
26 size_t cm_stride,
27 size_t cn_stride,
28 const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
29 {
30 assert(mr != 0);
31 assert(mr <= 4);
32 assert(nc != 0);
33 assert(kc != 0);
34 assert(kc % sizeof(int8_t) == 0);
35 assert(a != NULL);
36 assert(w != NULL);
37 assert(c != NULL);
38
39 kc = round_up_po2(kc, 2);
40 const int8_t* a0 = a;
41 int8_t* c0 = c;
42 const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
43 int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
44 if XNN_UNPREDICTABLE(mr < 2) {
45 a1 = a0;
46 c1 = c0;
47 }
48 const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
49 int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
50 if XNN_UNPREDICTABLE(mr <= 2) {
51 a2 = a1;
52 c2 = c1;
53 }
54 const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
55 int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
56 if XNN_UNPREDICTABLE(mr != 4) {
57 a3 = a2;
58 c3 = c2;
59 }
60
61 do {
62 int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
63 int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
64 int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
65 int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
66 int32x4_t vacc1x0123 = vacc0x0123;
67 int32x4_t vacc1x4567 = vacc0x4567;
68 int32x4_t vacc1x89AB = vacc0x89AB;
69 int32x4_t vacc1xCDEF = vacc0xCDEF;
70 int32x4_t vacc2x0123 = vacc0x0123;
71 int32x4_t vacc2x4567 = vacc0x4567;
72 int32x4_t vacc2x89AB = vacc0x89AB;
73 int32x4_t vacc2xCDEF = vacc0xCDEF;
74 int32x4_t vacc3x0123 = vacc0x0123;
75 int32x4_t vacc3x4567 = vacc0x4567;
76 int32x4_t vacc3x89AB = vacc0x89AB;
77 int32x4_t vacc3xCDEF = vacc0xCDEF;
78
79 size_t k = kc;
80
81 while (k >= 16 * sizeof(int8_t)) {
82 const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
83 const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
84 const int8x8_t va1x0 = vld1_s8(a1); a1 += 8;
85 const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;
86 const int8x8_t va2x0 = vld1_s8(a2); a2 += 8;
87 const int8x8_t va2x1 = vld1_s8(a2); a2 += 8;
88 const int8x8_t va3x0 = vld1_s8(a3); a3 += 8;
89 const int8x8_t va3x1 = vld1_s8(a3); a3 += 8;
90
91 const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
92 const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
93 const int8x8_t vb89ABc0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
94 const int8x8_t vbCDEFc0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
95 const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
96 const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
97 const int8x8_t vb89ABc1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
98 const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
99 const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
100 const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
101 const int8x8_t vb89ABc2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
102 const int8x8_t vbCDEFc2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
103 const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
104 const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
105 const int8x8_t vb89ABc3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
106 const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
107
108 int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
109 int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
110 int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
111 int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 0)));
112 const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
113 vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
114 vprod1x0123c0 = vmlal_s8(vprod1x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
115 vprod2x0123c0 = vmlal_s8(vprod2x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 0)));
116 vprod3x0123c0 = vmlal_s8(vprod3x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 0)));
117 vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
118 vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
119 vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
120 vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c0);
121 int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
122 int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
123 int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
124 int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 0)));
125 const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
126 vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
127 vprod1x4567c0 = vmlal_s8(vprod1x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
128 vprod2x4567c0 = vmlal_s8(vprod2x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 0)));
129 vprod3x4567c0 = vmlal_s8(vprod3x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 0)));
130 vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
131 vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
132 vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
133 vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c0);
134 int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
135 int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
136 int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
137 int16x8_t vprod3x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 0)));
138 const int8x8_t vb89ABc0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
139 vprod0x89ABc0 = vmlal_s8(vprod0x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
140 vprod1x89ABc0 = vmlal_s8(vprod1x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
141 vprod2x89ABc0 = vmlal_s8(vprod2x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 0)));
142 vprod3x89ABc0 = vmlal_s8(vprod3x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 0)));
143 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
144 vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
145 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
146 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0);
147 int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
148 int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
149 int16x8_t vprod2xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
150 int16x8_t vprod3xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 0)));
151 const int8x8_t vbCDEFc0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
152 vprod0xCDEFc0 = vmlal_s8(vprod0xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
153 vprod1xCDEFc0 = vmlal_s8(vprod1xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
154 vprod2xCDEFc0 = vmlal_s8(vprod2xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 0)));
155 vprod3xCDEFc0 = vmlal_s8(vprod3xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 0)));
156 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
157 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
158 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0);
159 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0);
160 int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
161 int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
162 int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
163 int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 1)));
164 const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
165 vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
166 vprod1x0123c1 = vmlal_s8(vprod1x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
167 vprod2x0123c1 = vmlal_s8(vprod2x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 1)));
168 vprod3x0123c1 = vmlal_s8(vprod3x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 1)));
169 vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
170 vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
171 vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
172 vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c1);
173 int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
174 int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
175 int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
176 int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 1)));
177 const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
178 vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
179 vprod1x4567c1 = vmlal_s8(vprod1x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
180 vprod2x4567c1 = vmlal_s8(vprod2x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 1)));
181 vprod3x4567c1 = vmlal_s8(vprod3x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 1)));
182 vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
183 vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
184 vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
185 vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c1);
186 int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
187 int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
188 int16x8_t vprod2x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
189 int16x8_t vprod3x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 1)));
190 const int8x8_t vb89ABc1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
191 vprod0x89ABc1 = vmlal_s8(vprod0x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
192 vprod1x89ABc1 = vmlal_s8(vprod1x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
193 vprod2x89ABc1 = vmlal_s8(vprod2x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 1)));
194 vprod3x89ABc1 = vmlal_s8(vprod3x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 1)));
195 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
196 vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
197 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
198 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc1);
199 int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
200 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
201 int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
202 int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 1)));
203 const int8x8_t vbCDEFc1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
204 vprod0xCDEFc1 = vmlal_s8(vprod0xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
205 vprod1xCDEFc1 = vmlal_s8(vprod1xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
206 vprod2xCDEFc1 = vmlal_s8(vprod2xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 1)));
207 vprod3xCDEFc1 = vmlal_s8(vprod3xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 1)));
208 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
209 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
210 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1);
211 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc1);
212 int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
213 int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
214 int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
215 int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 2)));
216 const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
217 vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
218 vprod1x0123c2 = vmlal_s8(vprod1x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
219 vprod2x0123c2 = vmlal_s8(vprod2x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 2)));
220 vprod3x0123c2 = vmlal_s8(vprod3x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 2)));
221 vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
222 vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
223 vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
224 vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
225 int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
226 int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
227 int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
228 int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 2)));
229 const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
230 vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
231 vprod1x4567c2 = vmlal_s8(vprod1x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
232 vprod2x4567c2 = vmlal_s8(vprod2x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 2)));
233 vprod3x4567c2 = vmlal_s8(vprod3x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 2)));
234 vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
235 vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
236 vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
237 vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
238 int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
239 int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
240 int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
241 int16x8_t vprod3x89ABc2 = vmull_s8(vb89ABc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 2)));
242 const int8x8_t vb89ABc2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
243 vprod0x89ABc2 = vmlal_s8(vprod0x89ABc2, vb89ABc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
244 vprod1x89ABc2 = vmlal_s8(vprod1x89ABc2, vb89ABc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
245 vprod2x89ABc2 = vmlal_s8(vprod2x89ABc2, vb89ABc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 2)));
246 vprod3x89ABc2 = vmlal_s8(vprod3x89ABc2, vb89ABc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 2)));
247 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
248 vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
249 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
250 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2);
251 int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
252 int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
253 int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
254 int16x8_t vprod3xCDEFc2 = vmull_s8(vbCDEFc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 2)));
255 const int8x8_t vbCDEFc2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
256 vprod0xCDEFc2 = vmlal_s8(vprod0xCDEFc2, vbCDEFc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
257 vprod1xCDEFc2 = vmlal_s8(vprod1xCDEFc2, vbCDEFc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
258 vprod2xCDEFc2 = vmlal_s8(vprod2xCDEFc2, vbCDEFc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 2)));
259 vprod3xCDEFc2 = vmlal_s8(vprod3xCDEFc2, vbCDEFc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 2)));
260 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
261 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
262 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
263 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2);
264 int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
265 int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
266 int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
267 int16x8_t vprod3x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 3)));
268 const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
269 vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
270 vprod1x0123c3 = vmlal_s8(vprod1x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
271 vprod2x0123c3 = vmlal_s8(vprod2x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 3)));
272 vprod3x0123c3 = vmlal_s8(vprod3x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 3)));
273 vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
274 vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
275 vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
276 vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c3);
277 int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
278 int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
279 int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
280 int16x8_t vprod3x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 3)));
281 const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
282 vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
283 vprod1x4567c3 = vmlal_s8(vprod1x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
284 vprod2x4567c3 = vmlal_s8(vprod2x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 3)));
285 vprod3x4567c3 = vmlal_s8(vprod3x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 3)));
286 vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
287 vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
288 vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
289 vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c3);
290 int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
291 int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
292 int16x8_t vprod2x89ABc3 = vmull_s8(vb89ABc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
293 int16x8_t vprod3x89ABc3 = vmull_s8(vb89ABc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 3)));
294 const int8x8_t vb89ABc3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
295 vprod0x89ABc3 = vmlal_s8(vprod0x89ABc3, vb89ABc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
296 vprod1x89ABc3 = vmlal_s8(vprod1x89ABc3, vb89ABc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
297 vprod2x89ABc3 = vmlal_s8(vprod2x89ABc3, vb89ABc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 3)));
298 vprod3x89ABc3 = vmlal_s8(vprod3x89ABc3, vb89ABc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 3)));
299 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
300 vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
301 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
302 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc3);
303 int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
304 int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
305 int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
306 int16x8_t vprod3xCDEFc3 = vmull_s8(vbCDEFc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 3)));
307 const int8x8_t vbCDEFc3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
308 vprod0xCDEFc3 = vmlal_s8(vprod0xCDEFc3, vbCDEFc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
309 vprod1xCDEFc3 = vmlal_s8(vprod1xCDEFc3, vbCDEFc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
310 vprod2xCDEFc3 = vmlal_s8(vprod2xCDEFc3, vbCDEFc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 3)));
311 vprod3xCDEFc3 = vmlal_s8(vprod3xCDEFc3, vbCDEFc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 3)));
312 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
313 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
314 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3);
315 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc3);
316
317 k -= 16 * sizeof(int8_t);
318 }
319
320 if (k >= 8 * sizeof(int8_t)) {
321 const int8x8_t va0 = vld1_s8(a0); a0 += 8;
322 const int8x8_t va1 = vld1_s8(a1); a1 += 8;
323 const int8x8_t va2 = vld1_s8(a2); a2 += 8;
324 const int8x8_t va3 = vld1_s8(a3); a3 += 8;
325
326 const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
327 const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
328 const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
329 const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
330 const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
331 const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
332 const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
333 const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
334 const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
335 const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
336 const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
337 const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
338 const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
339 const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
340 const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
341 const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
342
343 const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
344 const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
345 const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
346 const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
347 vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
348 vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
349 vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
350 vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
351 const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
352 const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
353 const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
354 const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
355 vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
356 vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
357 vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
358 vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
359 const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
360 const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
361 const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
362 const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
363 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
364 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
365 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
366 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
367 const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
368 const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
369 const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
370 const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
371 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
372 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
373 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
374 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
375 const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
376 const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
377 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
378 const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
379 vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
380 vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
381 vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
382 vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
383 const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
384 const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
385 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
386 const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
387 vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
388 vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
389 vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
390 vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
391 const int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
392 const int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
393 const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
394 const int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
395 vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
396 vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
397 vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
398 vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
399 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
400 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
401 const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
402 const int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
403 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
404 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
405 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
406 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
407 const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
408 const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
409 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
410 const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
411 vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
412 vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
413 vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
414 vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
415 const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
416 const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
417 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
418 const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
419 vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
420 vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
421 vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
422 vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
423 const int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
424 const int16x8_t vprod2x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
425 const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
426 const int16x8_t vprod2x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
427 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
428 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
429 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
430 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
431 const int16x8_t vprod2xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
432 const int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
433 const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
434 const int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
435 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0);
436 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1);
437 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
438 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3);
439 const int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
440 const int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
441 const int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
442 const int16x8_t vprod3x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
443 vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c0);
444 vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c1);
445 vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
446 vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c3);
447 const int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
448 const int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
449 const int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
450 const int16x8_t vprod3x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
451 vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c0);
452 vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c1);
453 vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
454 vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c3);
455 const int16x8_t vprod3x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
456 const int16x8_t vprod3x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
457 const int16x8_t vprod3x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
458 const int16x8_t vprod3x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
459 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0);
460 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc1);
461 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2);
462 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc3);
463 const int16x8_t vprod3xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
464 const int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
465 const int16x8_t vprod3xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
466 const int16x8_t vprod3xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
467 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0);
468 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc1);
469 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2);
470 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc3);
471
472 k -= 8 * sizeof(int8_t);
473 }
474
475 if XNN_UNLIKELY(k != 0) {
476 const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
477 const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
478 const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
479 const int8x8_t va3 = vld1_s8(a3); a3 = (const int8_t*) ((uintptr_t) a3 + k);
480
481 const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
482 const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
483 const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
484 const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
485
486 const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
487 vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
488 const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
489 vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
490 const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
491 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
492 const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
493 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
494 const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
495 vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
496 const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
497 vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
498 const int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
499 vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
500 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
501 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
502 const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
503 vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
504 const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
505 vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
506 const int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
507 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
508 const int16x8_t vprod2xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
509 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0);
510 const int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
511 vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c0);
512 const int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
513 vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c0);
514 const int16x8_t vprod3x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
515 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0);
516 const int16x8_t vprod3xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
517 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0);
518
519 if (k > 2 * sizeof(int8_t)) {
520 const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
521 const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
522 const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
523 const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
524
525 const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
526 vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
527 const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
528 vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
529 const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
530 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
531 const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
532 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
533 const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
534 vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
535 const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
536 vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
537 const int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
538 vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
539 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
540 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
541 const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
542 vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
543 const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
544 vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
545 const int16x8_t vprod2x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
546 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
547 const int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
548 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1);
549 const int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
550 vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c1);
551 const int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
552 vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c1);
553 const int16x8_t vprod3x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
554 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc1);
555 const int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
556 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc1);
557
558 if (k > 4 * sizeof(int8_t)) {
559 const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
560 const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
561 const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
562 const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
563
564 const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
565 vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
566 const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
567 vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
568 const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
569 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
570 const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
571 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
572 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
573 vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
574 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
575 vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
576 const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
577 vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
578 const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
579 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
580 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
581 vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
582 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
583 vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
584 const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
585 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
586 const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
587 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
588 const int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
589 vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
590 const int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
591 vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
592 const int16x8_t vprod3x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
593 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2);
594 const int16x8_t vprod3xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
595 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2);
596 }
597 }
598 }
599 const int32x4_t vmultiplier = vld1q_dup_s32(¶ms->neon.multiplier);
600 vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
601 vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
602 vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
603 vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
604 vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
605 vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
606 vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
607 vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
608 vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
609 vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
610 vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
611 vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier);
612 vacc3x0123 = vqrdmulhq_s32(vacc3x0123, vmultiplier);
613 vacc3x4567 = vqrdmulhq_s32(vacc3x4567, vmultiplier);
614 vacc3x89AB = vqrdmulhq_s32(vacc3x89AB, vmultiplier);
615 vacc3xCDEF = vqrdmulhq_s32(vacc3xCDEF, vmultiplier);
616
617 const int32x4_t vright_shift = vld1q_dup_s32(¶ms->neon.right_shift);
618 const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
619 vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
620 vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
621 vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
622 vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
623 vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
624 vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
625 vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
626 vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
627 vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
628 vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
629 vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
630 vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31);
631 vacc3x0123 = vsraq_n_s32(vacc3x0123, vbicq_s32(vacc3x0123, vzero_shift_mask), 31);
632 vacc3x4567 = vsraq_n_s32(vacc3x4567, vbicq_s32(vacc3x4567, vzero_shift_mask), 31);
633 vacc3x89AB = vsraq_n_s32(vacc3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31);
634 vacc3xCDEF = vsraq_n_s32(vacc3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31);
635
636 vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
637 vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
638 vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
639 vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
640 vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
641 vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
642 vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
643 vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
644 vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
645 vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
646 vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);
647 vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift);
648 vacc3x0123 = vrshlq_s32(vacc3x0123, vright_shift);
649 vacc3x4567 = vrshlq_s32(vacc3x4567, vright_shift);
650 vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_shift);
651 vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_shift);
652
653 const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->neon.output_zero_point);
654 #if XNN_ARCH_ARM64
655 const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
656 const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
657 const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
658 const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
659 const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
660 const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
661 const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
662 const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), voutput_zero_point);
663
664 int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
665 int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
666 int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
667 int8x16_t vout3x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc3x01234567), vacc3x89ABCDEF);
668 #else
669 const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
670 const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
671 const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
672 const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
673 const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
674 const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
675 const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
676 const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_point);
677
678 int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
679 int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
680 int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc2x89ABCDEF));
681 int8x16_t vout3x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc3x01234567), vqmovn_s16(vacc3x89ABCDEF));
682 #endif
683 const int8x16_t voutput_min = vld1q_dup_s8(¶ms->neon.output_min);
684 const int8x16_t voutput_max = vld1q_dup_s8(¶ms->neon.output_max);
685
686 vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
687 vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
688 vout2x0123456789ABCDEF = vmaxq_s8(vout2x0123456789ABCDEF, voutput_min);
689 vout3x0123456789ABCDEF = vmaxq_s8(vout3x0123456789ABCDEF, voutput_min);
690
691 vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
692 vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
693 vout2x0123456789ABCDEF = vminq_s8(vout2x0123456789ABCDEF, voutput_max);
694 vout3x0123456789ABCDEF = vminq_s8(vout3x0123456789ABCDEF, voutput_max);
695
696 if (nc >= 16) {
697 vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
698 vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
699 vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
700 vst1q_s8(c3 + 0, vout3x0123456789ABCDEF);
701
702 c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
703 c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
704 c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
705 c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
706
707 a0 = (const int8_t*) ((uintptr_t) a0 - kc);
708 a1 = (const int8_t*) ((uintptr_t) a1 - kc);
709 a2 = (const int8_t*) ((uintptr_t) a2 - kc);
710 a3 = (const int8_t*) ((uintptr_t) a3 - kc);
711
712 nc -= 16;
713 } else {
714 int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
715 int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vget_low_s8(vout2x0123456789ABCDEF), vget_low_s8(vout3x0123456789ABCDEF));
716 if (nc & 8) {
717 vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
718 vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
719 vst1_s8(c2, vget_low_s8(vout2x01234567_3x01234567)); c2 += 8;
720 vst1_s8(c3, vget_high_s8(vout2x01234567_3x01234567)); c3 += 8;
721 vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
722 vout2x01234567_3x01234567 = vcombine_s8(vget_high_s8(vout2x0123456789ABCDEF), vget_high_s8(vout3x0123456789ABCDEF));
723 }
724 if (nc & 4) {
725 vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
726 vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
727 vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
728 vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
729 vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
730 vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
731 }
732 if (nc & 2) {
733 vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
734 vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
735 vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
736 vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
737 vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
738 vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
739 }
740 if (nc & 1) {
741 vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
742 vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
743 vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
744 vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
745 }
746
747 nc = 0;
748 }
749 } while (nc != 0);
750 }
751