1 // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // kernel_neon.h: a collection of NEON optimized kernels.
16 // Check in kernel_default.h which one(s) are actually used by default.
17 // Others are mere experiments; they are still covered by tests
18 // in case they might be useful some day.
19 
20 #ifndef GEMMLOWP_INTERNAL_KERNEL_NEON_H_
21 #define GEMMLOWP_INTERNAL_KERNEL_NEON_H_
22 
23 #include "kernel.h"
24 
25 #include <arm_neon.h>
26 #include <cassert>
27 
28 namespace gemmlowp {
29 
30 // The kernels here are specifically arm 32bit assembly, not arm 64bit.
31 #ifdef GEMMLOWP_NEON_32
32 
33 // Our main GEMM kernel.
34 struct NEON_32_Kernel12x4Depth2 : KernelBase {
35   typedef KernelFormat<KernelSideFormat<CellFormat<4, 2>, 3>,
36                        KernelSideFormat<CellFormat<4, 2>, 1> >
37       Format;
38 
NameNEON_32_Kernel12x4Depth239   const char* Name() const override { return "NEON, 12x4, depth 2"; }
40 
41   // TODO(benoitjacob): reorder function arguments so dst comes last
RunNEON_32_Kernel12x4Depth242   void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride,
43            std::size_t dst_col_stride, const std::uint8_t* lhs_ptr,
44            const std::uint8_t* rhs_ptr, std::size_t start_depth,
45            std::size_t run_depth) const override {
46     ScopedProfilingLabel label("optimized kernel (NEON 12x4)");
47 
48 // For iOS assembler, the %= style of local labels cause compilation errors,
49 //  so use numerical ones instead. See
50 // http://stackoverflow.com/questions/3898435/labels-in-gcc-inline-assembly
51 // If you add any labels, remember to undef them at the end.
52 #define GEMMLOWP_LABEL_CLEAR_ACCUMULATORS "1"
53 #define GEMMLOWP_LABEL_BEFORE_LOOP "2"
54 #define GEMMLOWP_LABEL_LOOP "3"
55 #define GEMMLOWP_LABEL_AFTER_LOOP "4"
56 
57     assert(dst_row_stride == 1);
58     (void)dst_row_stride;
59     asm volatile(
60         // Overview of register layout:
61         //
62         // A 2x4 cell of Rhs is stored in 16bit in d0--d1 (q0).
63         // A 12x2 block of 3 4x2 cells Lhs is stored in 16bit in d2--d7
64         // (q1--q3).
65         // A 12x4 block of accumulators is stored in 32bit in q4--q15.
66         //
67         //                   +-----+-----+-----+-----+
68         //                   |d0[0]|d0[1]|d0[2]|d0[3]|
69         //              Rhs  +-----+-----+-----+-----+
70         //                   |d1[0]|d1[1]|d1[2]|d1[3]|
71         //                   +-----+-----+-----+-----+
72         //
73         //                   |     |     |     |     |
74         //
75         //    Lhs            |     |     |     |     |
76         //
77         //  +--+--+ - - - -  +-----+-----+-----+-----+
78         //  |d2|d3|          | q4  | q5  | q6  | q7  |
79         //  |d2|d3|          | q4  | q5  | q6  | q7  |
80         //  |d2|d3|          | q4  | q5  | q6  | q7  |
81         //  |d2|d3|          | q4  | q5  | q6  | q7  |
82         //  +--+--+ - - - -  +-----+-----+-----+-----+
83         //  |d4|d5|          | q8  | q9  | q10 | q11 |
84         //  |d4|d5|          | q8  | q9  | q10 | q11 |
85         //  |d4|d5|          | q8  | q9  | q10 | q11 |
86         //  |d4|d5|          | q8  | q9  | q10 | q11 |
87         //  +--+--+ - - - -  +-----+-----+-----+-----+
88         //  |d6|d7|          | q12 | q13 | q14 | q15 |
89         //  |d6|d7|          | q12 | q13 | q14 | q15 |
90         //  |d6|d7|          | q12 | q13 | q14 | q15 |
91         //  |d6|d7|          | q12 | q13 | q14 | q15 |
92         //  +--+--+ - - - -  +-----+-----+-----+-----+
93         //
94         //                            Accumulator
95 
96         // Load 1 Rhs cell of size 2x4
97         "vld1.8 {d0}, [%[rhs_ptr]]!\n"
98         // Load 3 Lhs cells of size 4x2 each
99         "vld1.8 {d2}, [%[lhs_ptr]]!\n"
100         "vld1.8 {d4}, [%[lhs_ptr]]!\n"
101         "vld1.8 {d6}, [%[lhs_ptr]]!\n"
102 
103         // Check if start_depth==0 to decide whether we will clear
104         // accumulators or load existing accumulators.
105         "cmp %[start_depth], #0\n"
106 
107         // Multiply dst_col_stride by 4 == sizeof(int32) to use
108         // it as a byte offset below.
109         "lsl %[dst_col_stride], #2\n"
110 
111         "beq " GEMMLOWP_LABEL_CLEAR_ACCUMULATORS
112         "f\n"
113 
114         // Load accumulators (start_depth != 0)
115         "mov r1, %[dst_ptr]\n"
116         "subs %[run_depth], #2\n"
117         "mov r0, r1\n"
118         "vld1.32 {d8, d9},   [r0]!\n"
119         "add r1, %[dst_col_stride]\n"
120         "vld1.32 {d16, d17}, [r0]!\n"
121         "vld1.32 {d24, d25}, [r0]\n"
122         "mov r0, r1\n"
123         "vld1.32 {d10, d11}, [r0]!\n"
124         "add r1, %[dst_col_stride]\n"
125         "vld1.32 {d18, d19}, [r0]!\n"
126         "vld1.32 {d26, d27}, [r0]\n"
127         "mov r0, r1\n"
128         "vld1.32 {d12, d13}, [r0]!\n"
129         "add r1, %[dst_col_stride]\n"
130         "vld1.32 {d20, d21}, [r0]!\n"
131         "vld1.32 {d28, d29}, [r0]\n"
132         "mov r0, r1\n"
133         "vld1.32 {d14, d15}, [r0]!\n"
134         "vld1.32 {d22, d23}, [r0]!\n"
135         "vld1.32 {d30, d31}, [r0]\n"
136 
137         "b " GEMMLOWP_LABEL_BEFORE_LOOP "f\n"
138 
139         GEMMLOWP_LABEL_CLEAR_ACCUMULATORS
140         ":\n"
141 
142         // Clear accumulators (start_depth == 0)
143         "vmov.s32 q4, #0\n"
144         "subs %[run_depth], #2\n"
145         "vmov.s32 q8, q4\n"
146         "vmov.s32 q12, q4\n"
147         "vmov.s32 q5, q4\n"
148         "vmov.s32 q9, q4\n"
149         "vmov.s32 q13, q4\n"
150         "vmov.s32 q6, q4\n"
151         "vmov.s32 q10, q4\n"
152         "vmov.s32 q14, q4\n"
153         "vmov.s32 q7, q4\n"
154         "vmov.s32 q11, q4\n"
155         "vmov.s32 q15, q4\n"
156 
157         GEMMLOWP_LABEL_BEFORE_LOOP
158         ":\n"
159 
160         // If there are only two levels of depth, skip the loop.
161         "beq " GEMMLOWP_LABEL_AFTER_LOOP "f\n"
162 
163         GEMMLOWP_LABEL_LOOP
164         ":\n"
165         // Expand Lhs/Rhs cells to 16 bit.
166         // Note: moving theses vmovls further down to allow for
167         // longer data pipelining helps a little on A57 but is
168         // harmful on A53 --- It looks as if A53 doesn't like
169         // interleaving vmovl's into the vmlal's.
170         "vmovl.u8 q0, d0\n"
171         "vmovl.u8 q1, d2\n"
172         "vmovl.u8 q2, d4\n"
173         "vmovl.u8 q3, d6\n"
174 
175         // Multiply-accumulate, level of depth 0
176         "vmlal.u16 q4, d2, d0[0]\n"
177         "vmlal.u16 q5, d2, d0[1]\n"
178         "vmlal.u16 q6, d2, d0[2]\n"
179         "vmlal.u16 q7, d2, d0[3]\n"
180         "vldr d2, [%[lhs_ptr]]\n"
181         "vmlal.u16 q8, d4, d0[0]\n"
182         "vmlal.u16 q9, d4, d0[1]\n"
183         "vmlal.u16 q10, d4, d0[2]\n"
184         "vmlal.u16 q11, d4, d0[3]\n"
185         "vldr d4, [%[lhs_ptr], #8]\n"
186         "vmlal.u16 q12, d6, d0[0]\n"
187         "vmlal.u16 q13, d6, d0[1]\n"
188         "vmlal.u16 q14, d6, d0[2]\n"
189         "vmlal.u16 q15, d6, d0[3]\n"
190         "vldr d6, [%[lhs_ptr], #16]\n"
191         "vldr d0, [%[rhs_ptr]]\n"
192 
193         // Multiply-accumulate, level of depth 1
194         "vmlal.u16 q4, d3, d1[0]\n"
195         "vmlal.u16 q5, d3, d1[1]\n"
196         "add %[lhs_ptr], #24\n"
197         "vmlal.u16 q6, d3, d1[2]\n"
198         "vmlal.u16 q7, d3, d1[3]\n"
199         "add %[rhs_ptr], #8\n"
200         "vmlal.u16 q8, d5, d1[0]\n"
201         "vmlal.u16 q9, d5, d1[1]\n"
202         "subs %[run_depth], #2\n"
203         "vmlal.u16 q10, d5, d1[2]\n"
204         "vmlal.u16 q11, d5, d1[3]\n"
205         "vmlal.u16 q12, d7, d1[0]\n"
206         "vmlal.u16 q13, d7, d1[1]\n"
207         "vmlal.u16 q14, d7, d1[2]\n"
208         "vmlal.u16 q15, d7, d1[3]\n"
209 
210         "bne " GEMMLOWP_LABEL_LOOP "b\n"
211 
212         GEMMLOWP_LABEL_AFTER_LOOP
213         ":\n"
214 
215         // Do remaining arithmetic for the last 2 levels of depth.
216 
217         // Expand Lhs/Rhs cells to 16 bit.
218         "vmovl.u8 q0, d0\n"
219         "vmovl.u8 q1, d2\n"
220         "vmovl.u8 q2, d4\n"
221         "vmovl.u8 q3, d6\n"
222 
223         // Multiply-accumulate, level of depth 0
224         "vmlal.u16 q4, d2, d0[0]\n"
225         "vmlal.u16 q5, d2, d0[1]\n"
226         "vmlal.u16 q6, d2, d0[2]\n"
227         "vmlal.u16 q7, d2, d0[3]\n"
228         "vmlal.u16 q8, d4, d0[0]\n"
229         "vmlal.u16 q9, d4, d0[1]\n"
230         "vmlal.u16 q10, d4, d0[2]\n"
231         "vmlal.u16 q11, d4, d0[3]\n"
232         "vmlal.u16 q12, d6, d0[0]\n"
233         "vmlal.u16 q13, d6, d0[1]\n"
234         "vmlal.u16 q14, d6, d0[2]\n"
235         "vmlal.u16 q15, d6, d0[3]\n"
236 
237         // Multiply-accumulate, level of depth 1
238         "vmlal.u16 q4, d3, d1[0]\n"
239         "vmlal.u16 q5, d3, d1[1]\n"
240         "vmlal.u16 q6, d3, d1[2]\n"
241         "vmlal.u16 q7, d3, d1[3]\n"
242         "vmlal.u16 q8, d5, d1[0]\n"
243         "vmlal.u16 q9, d5, d1[1]\n"
244         "vmlal.u16 q10, d5, d1[2]\n"
245         "vmlal.u16 q11, d5, d1[3]\n"
246         "vmlal.u16 q12, d7, d1[0]\n"
247         "vmlal.u16 q13, d7, d1[1]\n"
248         "vmlal.u16 q14, d7, d1[2]\n"
249         "vmlal.u16 q15, d7, d1[3]\n"
250 
251         // Store accumulators
252         "mov r1, %[dst_ptr]\n"
253         "mov r0, r1\n"
254         "vst1.32 {d8, d9},   [r0]!\n"
255         "add r1, %[dst_col_stride]\n"
256         "vst1.32 {d16, d17}, [r0]!\n"
257         "vst1.32 {d24, d25}, [r0]\n"
258         "mov r0, r1\n"
259         "vst1.32 {d10, d11}, [r0]!\n"
260         "add r1, %[dst_col_stride]\n"
261         "vst1.32 {d18, d19}, [r0]!\n"
262         "vst1.32 {d26, d27}, [r0]\n"
263         "mov r0, r1\n"
264         "vst1.32 {d12, d13}, [r0]!\n"
265         "add r1, %[dst_col_stride]\n"
266         "vst1.32 {d20, d21}, [r0]!\n"
267         "vst1.32 {d28, d29}, [r0]\n"
268         "mov r0, r1\n"
269         "vst1.32 {d14, d15}, [r0]!\n"
270         "vst1.32 {d22, d23}, [r0]!\n"
271         "vst1.32 {d30, d31}, [r0]\n"
272         :  // outputs
273         [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
274         [dst_ptr] "+r"(dst_ptr),
275         [run_depth] "+r"(run_depth)
276         :  // inputs
277         [start_depth] "r"(start_depth),
278         [dst_col_stride] "r"(dst_col_stride)
279         :  // clobbers
280         "cc", "memory", "r0", "r1",
281         // note: someone on internet says that quad registers are
282         // unsupported in the clobber list!
283         "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
284         "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
285         "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30",
286         "d31");
287 #undef GEMMLOWP_LABEL_CLEAR_ACCUMULATORS
288 #undef GEMMLOWP_LABEL_BEFORE_LOOP
289 #undef GEMMLOWP_LABEL_LOOP
290 #undef GEMMLOWP_LABEL_AFTER_LOOP
291   }
292 };
293 
294 struct NEON_32_Kernel12x4Depth2Assuming12BitProducts : KernelBase {
295   typedef KernelFormat<
296       KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 3>,
297       KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 1> >
298       Format;
299 
NameNEON_32_Kernel12x4Depth2Assuming12BitProducts300   const char* Name() const override {
301     return "NEON, 12x4, depth 2, assuming 12-bit products";
302   }
303 
304   // TODO(benoitjacob): reorder function arguments so dst comes last
RunNEON_32_Kernel12x4Depth2Assuming12BitProducts305   void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride,
306            std::size_t dst_col_stride, const std::uint8_t* lhs_ptr,
307            const std::uint8_t* rhs_ptr, std::size_t start_depth,
308            std::size_t run_depth) const override {
309     ScopedProfilingLabel label(
310         "optimized kernel (NEON 12x4, assuming 12-bit products)");
311     assert(dst_row_stride == 1);
312     (void)dst_row_stride;
313 
314 // See comments above for why we need local numerical labels in our asm.
315 #define GEMMLOWP_LOOP_NEON_32_KERNEL_12X4_DEPTH2_ASSUMING_12BIT_PRODUCTS "1"
316 #define GEMMLOWP_LOAD_GLOBAL_ACCUMULATORS_NEON_32_KERNEL_12X4_DEPTH2_12BIT "2"
317 #define GEMMLOWP_LABEL_32 "3"
318 #define GEMMLOWP_LABEL_24 "4"
319 #define GEMMLOWP_LABEL_16 "5"
320 #define GEMMLOWP_LABEL_8 "6"
321 #define GEMMLOWP_LABEL_2 "7"
322 
323     // This kernel is special in that it uses local 16-bit accumulators.
324     // Because it assumes that each product fits in 12 bits, it can accumulate
325     // 16 products into a local 16-bit accumulator without risking overflow.
326     // At that point, it must accumulate these local 16-bit accumulators back
327     // into global 32-bit accumulators, which have to be stored in memory for
328     // lack of register space.
329     // This 12x4 block of global accumulators is laid out as 3 cells of size 4x4
330     // stored in diagonal-major order like this for the first 4x4 cell:
331     //
332     //   0   4   8  12
333     //  13   1   5   9
334     //  10  14   2   6
335     //   7  11  15   3
336     //
337     // and likewise for the 2nd  cell (16--31) and 3rd cell (32--47)
338     std::int32_t global_accumulators[3 * 4 * 4];
339     asm volatile(
340         // Compute stride between consecutive columns, in bytes
341         "mov r0, #4\n"  // multiply by 4 = sizeof(int32)
342         "mul %[dst_col_stride], r0\n"
343 
344         "cmp %[start_depth], #0\n"
345         "bne"
346         " " GEMMLOWP_LOAD_GLOBAL_ACCUMULATORS_NEON_32_KERNEL_12X4_DEPTH2_12BIT
347         "f\n"
348 
349         // If start_depth==0, we need to clear our global accumulators
350         "mov r0, %[global_accumulators]\n"
351         "vmov.s32 q8, #0\n"
352         "vmov.s32 q9, q8\n"
353         "vst1.32 {d16,d17,d18,d19}, [r0]!\n"
354         "vst1.32 {d16,d17,d18,d19}, [r0]!\n"
355         "vst1.32 {d16,d17,d18,d19}, [r0]!\n"
356         "vst1.32 {d16,d17,d18,d19}, [r0]!\n"
357         "vst1.32 {d16,d17,d18,d19}, [r0]!\n"
358         "vst1.32 {d16,d17,d18,d19}, [r0]!\n"
359         "b " GEMMLOWP_LOOP_NEON_32_KERNEL_12X4_DEPTH2_ASSUMING_12BIT_PRODUCTS
360         "f\n"
361 
362         // If start_depth!=0, we need to load our existing global accumulators
363         GEMMLOWP_LOAD_GLOBAL_ACCUMULATORS_NEON_32_KERNEL_12X4_DEPTH2_12BIT
364         ":\n"
365         // Load global accumulators from destination matrix, column-major
366         "mov r1, %[dst_ptr]\n"
367         "mov r0, %[dst_col_stride]\n"
368         "sub r0, #32\n"
369         "vld1.32 {d0,d1}, [r1]!\n"
370         "vld1.32 {d8,d9}, [r1]!\n"
371         "vld1.32 {d16,d17}, [r1], r0\n"
372         "vld1.32 {d2,d3}, [r1]!\n"
373         "vld1.32 {d10,d11}, [r1]!\n"
374         "vld1.32 {d18,d19}, [r1], r0\n"
375         "vld1.32 {d4,d5}, [r1]!\n"
376         "vld1.32 {d12,d13}, [r1]!\n"
377         "vld1.32 {d20,d21}, [r1], r0\n"
378         "vld1.32 {d6,d7}, [r1]!\n"
379         "vld1.32 {d14,d15}, [r1]!\n"
380         "vld1.32 {d22,d23}, [r1], r0\n"
381         // Now we need to convert the global accumulator registers to
382         // 4x4-block-wise diagonal-major order. What we effectively want to do
383         // is to rotate the rows, however the accumulators are stored in
384         // column-major order in registers. So we achieve this by
385         // transposing, rotating the registers, and transposing again each
386         // 4x4 block.
387         //
388         // Transpose 3 4x4 blocks separately
389         "vtrn.32 q0, q1\n"
390         "vtrn.32 q2, q3\n"
391         "vswp d1, d4\n"
392         "vswp d3, d6\n"
393         "vtrn.32 q4, q5\n"
394         "vtrn.32 q6, q7\n"
395         "vswp d9, d12\n"
396         "vswp d11, d14\n"
397         "vtrn.32 q8, q9\n"
398         "vtrn.32 q10, q11\n"
399         "vswp d17, d20\n"
400         "vswp d19, d22\n"
401         // Rotate the registers
402         "vext.32 q1, q1, q1, #1\n"
403         "vext.32 q2, q2, q2, #2\n"
404         "vext.32 q3, q3, q3, #3\n"
405         "vext.32 q5, q5, q5, #1\n"
406         "vext.32 q6, q6, q6, #2\n"
407         "vext.32 q7, q7, q7, #3\n"
408         "vext.32 q9, q9, q9, #1\n"
409         "vext.32 q10, q10, q10, #2\n"
410         "vext.32 q11, q11, q11, #3\n"
411         // Transpose again and store into our global accumulators
412         // buffer. These two operations are done at once using vst4.
413         "mov r0, %[global_accumulators]\n"
414         "vst4.32 {d0,d2,d4,d6}, [r0]!\n"
415         "vst4.32 {d1,d3,d5,d7}, [r0]!\n"
416         "vst4.32 {d8,d10,d12,d14}, [r0]!\n"
417         "vst4.32 {d9,d11,d13,d15}, [r0]!\n"
418         "vst4.32 {d16,d18,d20,d22}, [r0]!\n"
419         "vst4.32 {d17,d19,d21,d23}, [r0]!\n"
420 
421         /* Main loop */
422 
423         GEMMLOWP_LOOP_NEON_32_KERNEL_12X4_DEPTH2_ASSUMING_12BIT_PRODUCTS
424         ":\n"
425 
426     // Overview of register layout:
427     //
428     // Registers q4--q16 are the local 16-bit accumulators.
429     // However, each entry in the result matrix is represented
430     // by *two* local 16-bit accumulators: one for even levels
431     // of depth and one for odd levels of depth. These correspond
432     // to the scalars at even and odd indices within each q-register.
433     // Thus we effectively use 32 bits of register space for each
434     // entry in the result matrix. The accumulators register layout
435     // is the same as was described above for the global 32-bit
436     // accumulators (3 cells of size 4x4 in diagonal-major order)
437     // with the only difference that instead of 32bit values we have
438     // pairs of 16bit values.
439     //
440     // A 2x4 cell of Rhs is stored in 8bit in d0.
441     // A 12x2 block of 3 4x2 cells Lhs is stored in 8bit in d1--d3.
442     //
443     //                      +--------+--------+--------+--------+
444     //                      |d0[0]   |d0[2]   |d0[4]   |d0[6]   |
445     //                 Rhs  +--------+--------+--------+--------+
446     //                      |d0[1]   |d0[3]   |d0[5]   |d0[7]   |
447     //                      +--------+--------+--------+--------+
448     //
449     //                      |        |        |        |        |
450     //
451     //    Lhs               |        |        |        |        |
452     //
453     //  +-----+-----+ - - - +--------+--------+--------+--------+
454     //  |d1[0]|d1[1]|       |q4[0,1] |q5[0,1] |q6[0,1] |q7[0,1] |
455     //  |d1[2]|d1[3]|       |q7[2,3] |q4[2,3] |q5[2,3] |q6[2,3] |
456     //  |d1[4]|d1[5]|       |q6[4,5] |q7[4,5] |q4[4,5] |q5[4,5] |
457     //  |d1[6]|d1[7]|       |q5[6,7] |q6[6,7] |q7[6,7] |q4[6,7] |
458     //  +-----+-----+ - - - +--------+--------+--------+--------+
459     //  |d2[0]|d2[1]|       |q8[0,1] |q8[0,1] |q8[0,1] |q8[0,1] |
460     //  |d2[2]|d2[3]|       |q9[2,3] |q9[2,3] |q9[2,3] |q9[2,3] |
461     //  |d2[4]|d2[5]|       |q10[4,5]|q10[4,5]|q10[4,5]|q10[4,5]|
462     //  |d2[6]|d2[7]|       |q11[6,7]|q11[6,7]|q11[6,7]|q11[6,7]|
463     //  +-----+-----+ - - - +--------+--------+--------+--------+
464     //  |d3[0]|d3[1]|       |q12[0,1]|q12[0,1]|q12[0,1]|q12[0,1]|
465     //  |d3[2]|d3[3]|       |q13[2,3]|q13[2,3]|q13[2,3]|q13[2,3]|
466     //  |d3[4]|d3[5]|       |q14[4,5]|q14[4,5]|q14[4,5]|q14[4,5]|
467     //  |d3[6]|d3[7]|       |q15[6,7]|q15[6,7]|q15[6,7]|q15[6,7]|
468     //  +-----+-----+ - - - +--------+--------+--------+--------+
469     //
470     //                            Local 16-bit accumulators
471     //                         Note: 2 scalars per matrix entry
472 
473 #define GEMMLOWP_ACCUMULATE_2_LEVELS_OF_DEPTH \
474   /* Load 3 Lhs cells of size 4x2 */          \
475   "vld1.8 {d1,d2,d3}, [%[lhs_ptr]:64]!\n"     \
476                                               \
477   /* Load 1 Rhs cell of size 2x4 */           \
478   "vld1.8 {d0}, [%[rhs_ptr]:64]!\n"           \
479                                               \
480   /* Multiply-accumulate */                   \
481   "vmlal.u8 q4, d1, d0\n"                     \
482   "vmlal.u8 q8, d2, d0\n"                     \
483   "vmlal.u8 q12, d3, d0\n"                    \
484   "vext.8 d0, d0, d0, #2\n"                   \
485   "vmlal.u8 q5, d1, d0\n"                     \
486   "vmlal.u8 q9, d2, d0\n"                     \
487   "vmlal.u8 q13, d3, d0\n"                    \
488   "vext.8 d0, d0, d0, #2\n"                   \
489   "vmlal.u8 q6, d1, d0\n"                     \
490   "vmlal.u8 q10, d2, d0\n"                    \
491   "vmlal.u8 q14, d3, d0\n"                    \
492   "vext.8 d0, d0, d0, #2\n"                   \
493   "vmlal.u8 q7, d1, d0\n"                     \
494   "vmlal.u8 q11, d2, d0\n"                    \
495   "vmlal.u8 q15, d3, d0\n"                    \
496                                               \
497   "sub %[run_depth], #2\n"
498 
499 #define GEMMLOWP_ACCUMULATE_8_LEVELS_OF_DEPTH \
500   GEMMLOWP_ACCUMULATE_2_LEVELS_OF_DEPTH       \
501   GEMMLOWP_ACCUMULATE_2_LEVELS_OF_DEPTH       \
502   GEMMLOWP_ACCUMULATE_2_LEVELS_OF_DEPTH       \
503   GEMMLOWP_ACCUMULATE_2_LEVELS_OF_DEPTH
504 
505         // Clear local 16-bit accumulators
506         "vmov.s32 q4, #0\n"
507         "vmov.s32 q5, q4\n"
508         "vmov.s32 q6, q4\n"
509         "vmov.s32 q7, q4\n"
510         "vmov.s32 q8, q4\n"
511         "vmov.s32 q9, q4\n"
512         "vmov.s32 q10, q4\n"
513         "vmov.s32 q11, q4\n"
514         "vmov.s32 q12, q4\n"
515         "vmov.s32 q13, q4\n"
516         "vmov.s32 q14, q4\n"
517         "vmov.s32 q15, q4\n"
518 
519         // Select a suitable number of depth levels
520         // to process at this iteration. TODO (benoitjacob) I guess that
521         // someone who really knows asm should make this a jump table.
522         "cmp %[run_depth], #32\n"
523         "bge " GEMMLOWP_LABEL_32
524         "f\n"
525         "cmp %[run_depth], #24\n"
526         "bge " GEMMLOWP_LABEL_24
527         "f\n"
528         "cmp %[run_depth], #16\n"
529         "bge " GEMMLOWP_LABEL_16
530         "f\n"
531         "cmp %[run_depth], #8\n"
532         "bge " GEMMLOWP_LABEL_8
533         "f\n"
534         "b " GEMMLOWP_LABEL_2 "f\n"
535 
536         GEMMLOWP_LABEL_32
537         ":\n" GEMMLOWP_ACCUMULATE_8_LEVELS_OF_DEPTH GEMMLOWP_LABEL_24
538         ":\n" GEMMLOWP_ACCUMULATE_8_LEVELS_OF_DEPTH GEMMLOWP_LABEL_16
539         ":\n" GEMMLOWP_ACCUMULATE_8_LEVELS_OF_DEPTH GEMMLOWP_LABEL_8
540         ":\n" GEMMLOWP_ACCUMULATE_2_LEVELS_OF_DEPTH
541             GEMMLOWP_ACCUMULATE_2_LEVELS_OF_DEPTH
542                 GEMMLOWP_ACCUMULATE_2_LEVELS_OF_DEPTH GEMMLOWP_LABEL_2
543         ":\n" GEMMLOWP_ACCUMULATE_2_LEVELS_OF_DEPTH
544 
545         // Accumulate the local accumulators into the global accumulators.
546         // This is about summing adjacent pairs of 16-bit scalars into
547         // single 32-bit scalars, so we use pairwise long addition (vpadal).
548         "mov r0, %[global_accumulators]\n"
549         "mov r1, %[global_accumulators]\n"
550         "vld1.32 {d0,d1,d2,d3}, [r0]!\n"
551         "vld1.32 {d4,d5,d6,d7}, [r0]!\n"
552         "vpadal.u16 q0, q4\n"
553         "vpadal.u16 q1, q5\n"
554         "vpadal.u16 q2, q6\n"
555         "vpadal.u16 q3, q7\n"
556         "vst1.32 {d0,d1,d2,d3}, [r1]!\n"
557         "vst1.32 {d4,d5,d6,d7}, [r1]!\n"
558         "vld1.32 {d0,d1,d2,d3}, [r0]!\n"
559         "vld1.32 {d4,d5,d6,d7}, [r0]!\n"
560         "vpadal.u16 q0, q8\n"
561         "vpadal.u16 q1, q9\n"
562         "vpadal.u16 q2, q10\n"
563         "vpadal.u16 q3, q11\n"
564         "vst1.32 {d0,d1,d2,d3}, [r1]!\n"
565         "vst1.32 {d4,d5,d6,d7}, [r1]!\n"
566         "vld1.32 {d0,d1,d2,d3}, [r0]!\n"
567         "vld1.32 {d4,d5,d6,d7}, [r0]!\n"
568         "vpadal.u16 q0, q12\n"
569         "vpadal.u16 q1, q13\n"
570         "vpadal.u16 q2, q14\n"
571         "vpadal.u16 q3, q15\n"
572         "vst1.32 {d0,d1,d2,d3}, [r1]!\n"
573         "vst1.32 {d4,d5,d6,d7}, [r1]!\n"
574 
575         // Loop.
576         "cmp %[run_depth], #0\n"
577         "bne " GEMMLOWP_LOOP_NEON_32_KERNEL_12X4_DEPTH2_ASSUMING_12BIT_PRODUCTS
578         "b\n"
579 
580 #undef GEMMLOWP_CLEAR_LOCAL_ACCUMULATORS
581 #undef GEMMLOWP_ACCUMULATE_8_LEVELS_OF_DEPTH
582 #undef GEMMLOWP_ACCUMULATE_2_LEVELS_OF_DEPTH
583 #undef GEMMLOWP_ADD_TO_GLOBAL_ACCUMULATORS
584 
585         /* end of main loop */
586 
587         // Store the global accumulators to the destination matrix
588         // (column-major)
589         // This is the reverse of the steps that we followed at the beginning
590         // when we load the global accumulators from the destination matrix.
591         // The problem is the same: how to convert 4x4 blocks
592         // between column-major and diagonal-major orders.
593         // Like above, we do this by rotating rows, and we achieve that by
594         // tranposing, rotating columns, and transposing again.
595         //
596         // Load and transpose 4x4 blocks of global accumulators
597         // These two steps are done at once by the vld4 instruction.
598         "mov r0, %[global_accumulators]\n"
599         "vld4.32 {d0,d2,d4,d6}, [r0]!\n"
600         "vld4.32 {d1,d3,d5,d7}, [r0]!\n"
601         "vld4.32 {d8,d10,d12,d14}, [r0]!\n"
602         "vld4.32 {d9,d11,d13,d15}, [r0]!\n"
603         "vld4.32 {d16,d18,d20,d22}, [r0]!\n"
604         "vld4.32 {d17,d19,d21,d23}, [r0]!\n"
605         // Rotate the rows of each 4x4 block
606         "vext.32 q1, q1, q1, #3\n"
607         "vext.32 q2, q2, q2, #2\n"
608         "vext.32 q3, q3, q3, #1\n"
609         "vext.32 q5, q5, q5, #3\n"
610         "vext.32 q6, q6, q6, #2\n"
611         "vext.32 q7, q7, q7, #1\n"
612         "vext.32 q9, q9, q9, #3\n"
613         "vext.32 q10, q10, q10, #2\n"
614         "vext.32 q11, q11, q11, #1\n"
615         // Transpose again each 4x4 block
616         "vtrn.32 q0, q1\n"
617         "vtrn.32 q2, q3\n"
618         "vswp d1, d4\n"
619         "vswp d3, d6\n"
620         "vtrn.32 q4, q5\n"
621         "vtrn.32 q6, q7\n"
622         "vswp d9, d12\n"
623         "vswp d11, d14\n"
624         "vtrn.32 q8, q9\n"
625         "vtrn.32 q10, q11\n"
626         "vswp d17, d20\n"
627         "vswp d19, d22\n"
628         // Store into the column-major destination matrix
629         "mov r1, %[dst_ptr]\n"
630         "mov r0, %[dst_col_stride]\n"
631         "sub r0, #32\n"
632         "vst1.32 {d0,d1}, [r1]!\n"
633         "vst1.32 {d8,d9}, [r1]!\n"
634         "vst1.32 {d16,d17}, [r1], r0\n"
635         "vst1.32 {d2,d3}, [r1]!\n"
636         "vst1.32 {d10,d11}, [r1]!\n"
637         "vst1.32 {d18,d19}, [r1], r0\n"
638         "vst1.32 {d4,d5}, [r1]!\n"
639         "vst1.32 {d12,d13}, [r1]!\n"
640         "vst1.32 {d20,d21}, [r1], r0\n"
641         "vst1.32 {d6,d7}, [r1]!\n"
642         "vst1.32 {d14,d15}, [r1]!\n"
643         "vst1.32 {d22,d23}, [r1], r0\n"
644         :  // outputs
645         [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
646         [dst_ptr] "+r"(dst_ptr),
647         [run_depth] "+r"(run_depth)
648         :  // inputs
649         [start_depth] "r"(start_depth), [dst_col_stride] "r"(dst_col_stride),
650         [global_accumulators] "r"(&global_accumulators[0])
651         :  // clobbers
652         "cc", "memory", "r0", "r1",
653         // note: someone on internet says that quad registers are
654         // unsupported in the clobber list!
655         "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
656         "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
657         "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30",
658         "d31");
659 #undef GEMMLOWP_LOOP_NEON_32_KERNEL_12X4_DEPTH2_ASSUMING_12BIT_PRODUCTS
660 #undef GEMMLOWP_LOAD_GLOBAL_ACCUMULATORS_NEON_32_KERNEL_12X4_DEPTH2_12BIT
661 #undef GEMMLOWP_LABEL_32
662 #undef GEMMLOWP_LABEL_24
663 #undef GEMMLOWP_LABEL_16
664 #undef GEMMLOWP_LABEL_8
665 #undef GEMMLOWP_LABEL_2
666   }
667 };
668 
669 struct NEON_32bit_GEMM_Int8Operands_LhsNonzero : KernelBase {
670   typedef KernelFormat<
671       KernelSideFormatInt8<CellFormat<4, 16, CellOrder::WidthMajor>, 1>,
672       KernelSideFormatInt8<CellFormat<2, 16, CellOrder::WidthMajor>, 1> >
673       Format;
NameNEON_32bit_GEMM_Int8Operands_LhsNonzero674   const char* Name() const override {
675     return "NEON, 4x2, depth 16, accumulating two within signed int16";
676   }
677 
678   // TODO(benoitjacob): reorder function arguments so dst comes last
RunNEON_32bit_GEMM_Int8Operands_LhsNonzero679   void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride,
680            std::size_t dst_col_stride, const std::uint8_t* lhs_ptr,
681            const std::uint8_t* rhs_ptr, std::size_t start_depth,
682            std::size_t run_depth) const override {
683     (void)dst_row_stride;
684 #define GEMMLOWP_LABEL_AFTER_LOOP "1"
685 #define GEMMLOWP_LABEL_LOOP "2"
686 #define GEMMLOWP_LABEL_ACCUMULATE_EXISTING_DST_VALUES "3"
687 #define GEMMLOWP_LABEL_STORE "4"
688     asm volatile(
689         // Multiply dst_col_stride by 4 == sizeof(int32) to use
690         // it as a byte offset below.
691         "lsl %[dst_col_stride], %[dst_col_stride], #2\n"
692 
693         // Overview of register layout:
694         //
695         // A 2x16 block of Rhs is stored in 8 bit in d0--d3.
696         // A 4x16 block of Lhs is stored in 8 bit in d4--d7. That is only
697         // half of the register space required, so we loop over these registers
698         // twice. Only half of it, a 2x16 block, is stored in d4--d7 at
699         // any given time.
700         //
701         // A 4x2 block of accumulators is stored in q8--q15 (as 4x32 bit
702         // components which need to be horizontally-added at the end)
703         //
704         // The Lhs vectors are multiplied by the Rhs vectors with a widening
705         // multiply over the 8 first levels of depth, producing int16x8
706         // vectors of products for each position in the accumulator matrix.
707         // Here comes the special trick: since the operands are signed int8,
708         // their range being [ -2^7 , 2^7 ), their products are in range
709         // [ -2^14 , 2^14 - 1 ), meaning that we can add two such values
710         // without any risk of overflowing int16.
711         // We thus proceed with the 8 next levels of depth, multiplying
712         // again Lhs by Rhs, accumulating into this existing int16x8 vector.
713         //
714         // Only then, having processed 16 levels of depth, do we need to
715         // horizontally add these int16x8 accumulators into the final
716         // int32x4 accumulators.
717         //
718         // As we do not have enough registers to store all 16 int16x8
719         // temporary-16bit-accumulators, we have them cycle through q4--q7.
720         //
721         //
722         // Register layout (ignoring the q4--q7 temporary 16bit accumulators):
723         //
724         //                               +----+----+
725         //                               | d0 | d2 |
726         //                               | .  | .  |
727         //                               | .  | .  |
728         //                               | .  | .  |
729         //                       Rhs     +----+----+
730         //                               | d1 | d3 |
731         //                               | .  | .  |
732         //                               | .  | .  |
733         //                               | .  | .  |
734         //                               +----+----+
735         //
736         //                               |    |    |
737         //
738         //    Lhs                        |    |    |
739         //
740         //  +--------+--------+ - - - -  +----+----+
741         //  | d4 ... | d5 ... |          | q8 | q9 |
742         //  | d6 ... | d7 ... |          | q10| q11|
743         //  | d4 ... | d5 ... |          | q12| q13|
744         //  | d6 ... | d7 ... |          | q14| q15|
745         //  +--------+--------+ - - - -  +----+----+
746         //
747         //                               Accumulator
748         //
749 
750         // Clear accumulators, and, interleaved with it,
751         // initial loads of the first loop iteration,
752         // taken out of the loop so that in the loop itself we have
753         // optimal streaming of data from memory.
754         "vldr d0, [%[rhs_ptr], #0]\n"
755         "vmov.i32 q8, #0\n"
756         "vldr d4, [%[lhs_ptr], #0]\n"
757         "vmov.i32 q9, #0\n"
758         "vldr d2, [%[rhs_ptr], #16]\n"
759         "vmov.i32 q10, q8\n"
760         "vldr d6, [%[lhs_ptr], #16]\n"
761         "vmov.i32 q11, q8\n"
762         "vldr d1, [%[rhs_ptr], #8]\n"
763         "vmov.i32 q12, q8\n"
764         "vldr d5, [%[lhs_ptr], #8]\n"
765         "vmov.i32 q13, q8\n"
766         "vldr d3, [%[rhs_ptr], #24]\n"
767         "vmov.i32 q14, q8\n"
768         "vldr d7, [%[lhs_ptr], #24]\n"
769         "vmov.i32 q15, q8\n"
770 
771         // General loop.
772         GEMMLOWP_LABEL_LOOP
773         ":\n"
774 
775         // Multiply 8 first levels of depth.
776         "vmull.s8    q4,  d0,  d4\n"
777         "add %[rhs_ptr], %[rhs_ptr], #32\n"
778         "vmull.s8    q5,  d2,  d4\n"
779         "vldr d4, [%[lhs_ptr], #32]\n"
780         "vmull.s8    q6,  d0,  d6\n"
781         "vmull.s8    q7,  d2,  d6\n"
782         "vldr d6, [%[lhs_ptr], #48]\n"
783 
784         // Multiply-accumulate second-half, again into the same
785         // 16bit local accumulator registers. This is where we
786         // take advantage of having int8 instead of uint8 and therefore
787         // being able to accumulate two products into int16.
788         "vmlal.s8    q4,  d1,  d5\n"
789         "vmlal.s8    q5,  d3,  d5\n"
790         "vldr d5, [%[lhs_ptr], #40]\n"
791         "vmlal.s8    q6,  d1,  d7\n"
792         "vmlal.s8    q7,  d3,  d7\n"
793         "vldr d7, [%[lhs_ptr], #56]\n"
794 
795         // Add pairwise, accumulate into 32-bit accumulators.
796         "vpadal.s16   q8,  q4\n"
797         "add %[lhs_ptr], %[lhs_ptr], #64\n"
798         "vpadal.s16   q9,  q5\n"
799         "subs %[run_depth], %[run_depth], #16\n"
800         "vpadal.s16   q10, q6\n"
801         "vpadal.s16   q11, q7\n"
802 
803         "beq " GEMMLOWP_LABEL_AFTER_LOOP
804         "f\n"
805 
806         // Multiply first half.
807         "vmull.s8    q4,  d0,  d4\n"
808         "vmull.s8    q5,  d2,  d4\n"
809         "vldr d4, [%[lhs_ptr], #0]\n"
810         "vmull.s8    q6,  d0,  d6\n"
811         "vldr d0, [%[rhs_ptr], #0]\n"
812         "vmull.s8    q7,  d2,  d6\n"
813         "vldr d2, [%[rhs_ptr], #16]\n"
814 
815         // Multiply-accumulate second-half, again into the same
816         // 16bit local accumulator registers. This is where we
817         // take advantage of having int8 instead of uint8 and therefore
818         // being able to accumulate two products into int16.
819         "vmlal.s8    q4,  d1,  d5\n"
820         "vldr d6, [%[lhs_ptr], #16]\n"
821         "vmlal.s8    q5,  d3,  d5\n"
822         "vldr d5, [%[lhs_ptr], #8]\n"
823         "vmlal.s8    q6,  d1,  d7\n"
824         "vldr d1, [%[rhs_ptr], #8]\n"
825         "vmlal.s8    q7,  d3,  d7\n"
826         "vldr d3, [%[rhs_ptr], #24]\n"
827 
828         // Add pairwise, accumulate into 32-bit accumulators.
829         "vpadal.s16   q12, q4\n"
830         "vldr d7, [%[lhs_ptr], #24]\n"
831         "vpadal.s16   q13, q5\n"
832         "vpadal.s16   q14, q6\n"
833         "vpadal.s16   q15, q7\n"
834 
835         "b " GEMMLOWP_LABEL_LOOP "b\n"
836 
837         GEMMLOWP_LABEL_AFTER_LOOP
838         ":\n"
839 
840         // Multiply first half.
841         "vmull.s8    q4,  d0,  d4\n"
842         "vmull.s8    q5,  d2,  d4\n"
843         "vmull.s8    q6,  d0,  d6\n"
844         "vmull.s8    q7,  d2,  d6\n"
845 
846         // Multiply-accumulate second-half, again into the same
847         // 16bit local accumulator registers. This is where we
848         // take advantage of having int8 instead of uint8 and therefore
849         // being able to accumulate two products into int16.
850         "vmlal.s8    q4,  d1,  d5\n"
851         "vmlal.s8    q5,  d3,  d5\n"
852         "vmlal.s8    q6,  d1,  d7\n"
853         "vmlal.s8    q7,  d3,  d7\n"
854 
855         // Add pairwise, accumulate into 32-bit accumulators.
856         "vpadal.s16   q12, q4\n"
857         "vpadal.s16   q13, q5\n"
858         "vpadal.s16   q14, q6\n"
859         "vpadal.s16   q15, q7\n"
860         "cmp %[start_depth], #0\n"
861 
862         // Reduce 32bit accumulators horizontally.
863         "vpadd.s32 d0, d16, d17\n"
864         "vpadd.s32 d1, d18, d19\n"
865         "vpadd.s32 d2, d20, d21\n"
866         "vpadd.s32 d3, d22, d23\n"
867         "vpadd.s32 d4, d24, d25\n"
868         "vpadd.s32 d5, d26, d27\n"
869         "vpadd.s32 d6, d28, d29\n"
870         "vpadd.s32 d7, d30, d31\n"
871 
872         "bne " GEMMLOWP_LABEL_ACCUMULATE_EXISTING_DST_VALUES
873         "f\n"
874 
875         // Reduce 32bit accumulators horizontally, second pass
876         // (each pass adds pairwise. we need to add 4-wise).
877         "vpadd.s32 d8, d0, d2\n"
878         "vpadd.s32 d9, d4, d6\n"
879         "vpadd.s32 d10, d1, d3\n"
880         "vpadd.s32 d11, d5, d7\n"
881 
882         "b " GEMMLOWP_LABEL_STORE "f\n"
883 
884         GEMMLOWP_LABEL_ACCUMULATE_EXISTING_DST_VALUES
885         ":\n"
886 
887         // Reduce 32bit accumulators horizontally, second pass
888         // (each pass adds pairwise. we need to add 4-wise),
889         // and load destination values from memory.
890         "mov r0, %[dst_ptr]\n"
891         "vld1.32 {d16, d17}, [r0], %[dst_col_stride]\n"
892         "vpadd.s32 d8, d0, d2\n"
893         "vpadd.s32 d9, d4, d6\n"
894         "vld1.32 {d18, d19}, [r0]\n"
895         "vpadd.s32 d10, d1, d3\n"
896         "vpadd.s32 d11, d5, d7\n"
897 
898         // Add horizontally-reduced accumulators into
899         // the values loaded from memory
900         "vadd.s32 q4, q8, q4\n"
901         "vadd.s32 q5, q9, q5\n"
902 
903         GEMMLOWP_LABEL_STORE
904         ":\n"
905         // Store back into memory
906         "mov r0, %[dst_ptr]\n"
907         "vst1.32 {d8, d9}, [r0], %[dst_col_stride]\n"
908         "vst1.32 {d10, d11}, [r0]\n"
909         :  // outputs
910         [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
911         [dst_ptr] "+r"(dst_ptr), [run_depth] "+r"(run_depth)
912         :  // inputs
913         [start_depth] "r"(start_depth),
914         [dst_col_stride] "r"(dst_col_stride)
915         :  // clobbers
916         "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
917         "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17",
918         "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
919         "d28", "d29", "d30", "d31");
920 #undef GEMMLOWP_LABEL_LOOP
921 #undef GEMMLOWP_LABEL_AFTER_LOOP
922 #undef GEMMLOWP_LABEL_ACCUMULATE_EXISTING_DST_VALUES
923 #undef GEMMLOWP_LABEL_STORE
924   }
925 };
926 
927 // Same as NEON_32bit_GEMM_Int8Operands_LhsNonzero, but uses a side format that
928 // requires that user inputs were originally int8. This avoids the uint8->int8
929 // conversion in the pack step.
930 struct NEON_32bit_GEMM_Int8Operands_LhsNonzero_Int8Inputs
931     : NEON_32bit_GEMM_Int8Operands_LhsNonzero {
932   typedef KernelFormat<
933       KernelSideFormatInt8Inputs<CellFormat<4, 16, CellOrder::WidthMajor>, 1>,
934       KernelSideFormatInt8Inputs<CellFormat<2, 16, CellOrder::WidthMajor>, 1> >
935       Format;
936 };
937 
938 #endif  // GEMMLOWP_NEON_32
939 
940 // The kernels here are specifically arm 64bit assembly, not arm 32bit.
941 #ifdef GEMMLOWP_NEON_64
942 
// AArch64 NEON kernel on signed 8-bit operands. Each call produces one 4x4
// block of int32 results, consuming the depth dimension 16 levels at a time.
// Pairs of int8*int8 products are first summed in int16 lanes and only then
// widened into the int32 accumulators (see the overview inside the loop).
struct NEON_64bit_GEMM_Int8Operands_LhsNonzero : KernelBase {
  // Lhs and Rhs sides each consist of a single width-major CellFormat<4, 16>
  // cell, matching the 4x16 blocks held in v4--v7 and v0--v3 below.
  typedef KernelFormat<
      KernelSideFormatInt8<CellFormat<4, 16, CellOrder::WidthMajor>, 1>,
      KernelSideFormatInt8<CellFormat<4, 16, CellOrder::WidthMajor>, 1> >
      Format;
  // Human-readable description of this kernel.
  const char* Name() const override {
    return "NEON, 4x4, depth 16, accumulating two within signed int16";
  }

  // Computes the 4x4 block and writes it to the destination.
  //
  //   dst_ptr        - destination block; accessed as 4 runs of 4 contiguous
  //                    int32, consecutive runs being dst_col_stride apart.
  //   dst_row_stride - unused; each run is loaded/stored as one contiguous
  //                    16-byte vector.
  //   dst_col_stride - distance between runs, in int32 elements (the asm
  //                    scales it by 4 to obtain a byte offset).
  //   lhs_ptr        - Lhs data, read sequentially, 16 bytes per register.
  //   rhs_ptr        - Rhs data, read sequentially, 16 bytes per register.
  //                    Although both pointers are typed uint8_t, the bytes are
  //                    consumed by signed multiplies (smull/smlal2).
  //   start_depth    - if zero, the results overwrite the destination;
  //                    otherwise they are added to the values already there.
  //   run_depth      - number of depth levels to process. The asm subtracts 16
  //                    per step and only leaves the loop when the counter hits
  //                    exactly zero, so it is expected to be a nonzero
  //                    multiple of 16.
  //
  // TODO(benoitjacob): reorder function arguments so dst comes last
  void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride,
           std::size_t dst_col_stride, const std::uint8_t* lhs_ptr,
           const std::uint8_t* rhs_ptr, std::size_t start_depth,
           std::size_t run_depth) const override {
    (void)dst_row_stride;
// Numeric local labels for the asm below; branches reference them with an
// "f" (forward) or "b" (backward) suffix. They are #undef'd after the asm.
#define GEMMLOWP_LABEL_AFTER_LOOP_LAST16 "1"
#define GEMMLOWP_LABEL_LOOP "2"
#define GEMMLOWP_LABEL_ACCUMULATE_EXISTING_DST_VALUES "3"
#define GEMMLOWP_LABEL_STORE "4"
    asm volatile(
        // Clear accumulators, and, interleaved with it,
        // initial loads of the first loop iteration,
        // taken out of the loop so that in the loop itself we have
        // optimal streaming of data from memory.
        // This reads the first 64 bytes of Rhs into v0--v3 and the first
        // 64 bytes of Lhs into v4--v7, and zeroes v16--v31.
        "ld1 {v0.16b}, [%[rhs_ptr]], #16\n"
        "dup v16.4s, wzr\n"
        "ld1 {v4.16b}, [%[lhs_ptr]], #16\n"
        "dup v17.4s, wzr\n"
        "ld1 {v1.16b}, [%[rhs_ptr]], #16\n"
        "dup v18.4s, wzr\n"
        "ld1 {v5.16b}, [%[lhs_ptr]], #16\n"
        "dup v19.4s, wzr\n"
        "ld1 {v2.16b}, [%[rhs_ptr]], #16\n"
        "dup v20.4s, wzr\n"
        "ld1 {v3.16b}, [%[rhs_ptr]], #16\n"
        "dup v21.4s, wzr\n"
        "ld1 {v6.16b}, [%[lhs_ptr]], #16\n"
        "dup v22.4s, wzr\n"
        "ld1 {v7.16b}, [%[lhs_ptr]], #16\n"
        "dup v23.4s, wzr\n"
        "dup v24.4s, wzr\n"
        "dup v25.4s, wzr\n"
        "dup v26.4s, wzr\n"
        "dup v27.4s, wzr\n"
        "dup v28.4s, wzr\n"
        "dup v29.4s, wzr\n"
        "dup v30.4s, wzr\n"
        "dup v31.4s, wzr\n"

        // Multiply dst_col_stride by 4 == sizeof(int32) to use
        // it as a byte offset below.
        "lsl %[dst_col_stride], %[dst_col_stride], #2\n"

        // Initial arithmetic of the first loop iteration,
        // taken out of the loop so that in the loop itself we have
        // optimal streaming of data from memory.
        // Products of the low 8 bytes of v0--v3 with the low 8 bytes of
        // v4 and v5 go into the temporaries v8--v11 and v12--v15.
        "smull    v8.8h,  v0.8b,  v4.8b\n"
        "smull    v9.8h,  v1.8b,  v4.8b\n"
        "smull    v10.8h,  v2.8b,  v4.8b\n"
        "smull    v11.8h,  v3.8b,  v4.8b\n"
        "smull    v12.8h,  v0.8b,  v5.8b\n"
        "smull    v13.8h,  v1.8b,  v5.8b\n"
        "smull    v14.8h,  v2.8b,  v5.8b\n"
        "smull    v15.8h,  v3.8b,  v5.8b\n"

        // Multiply-accumulate second-half, again into the same
        // 16bit local accumulator registers. This is where we
        // take advantage of having int8 instead of uint8 and therefore
        // being able to accumulate two products into int16.
        "smlal2   v8.8h,  v0.16b,  v4.16b\n"
        "smlal2   v9.8h,  v1.16b,  v4.16b\n"
        "smlal2   v10.8h,  v2.16b,  v4.16b\n"
        "smlal2   v11.8h,  v3.16b,  v4.16b\n"
        "smlal2   v12.8h,  v0.16b,  v5.16b\n"
        "smlal2   v13.8h,  v1.16b,  v5.16b\n"
        "smlal2   v14.8h,  v2.16b,  v5.16b\n"
        "smlal2   v15.8h,  v3.16b,  v5.16b\n"

        // Account for the 16 levels of depth loaded above; the resulting
        // flags drive the branch right below.
        "subs %[run_depth], %[run_depth], #16\n"

        // If the loop depth is only 16, then we can skip the general loop
        // and go straight to the final part of the code.
        "beq " GEMMLOWP_LABEL_AFTER_LOOP_LAST16 "f\n"

        // General loop.
        GEMMLOWP_LABEL_LOOP
        ":\n"

        // Overview of register layout:
        //
        // A 4x16 block of Rhs is stored in 8 bit in v0--v3.
        // A 4x16 block of Lhs is stored in 8 bit in v4--v7.
        //
        // A 4x4 block of accumulators is stored in v16-v31 (as 4x32 bit
        // components which need to be horizontally-added at the end)
        //
        // The Lhs vectors are multiplied by the Rhs vectors with a widening
        // multiply over the 8 first levels of depth, producing int16x8
        // vectors of products for each position in the accumulator matrix.
        // Here comes the special trick: since the operands are signed int8,
        // their range being [ -2^7 , 2^7 ), their products are in range
        // [ -2^14 , 2^14 - 1 ), meaning that we can add two such values
        // without any risk of overflowing int16.
        // NOTE(review): (-128) * (-128) == 2^14, and the sum of two such
        // products would not fit in int16, so this argument appears to rely
        // on at least one operand never being -128 -- presumably what
        // "LhsNonzero" in the kernel name refers to; confirm against the
        // packing code.
        // We thus proceed with the 8 next levels of depth, multiplying
        // again Lhs by Rhs, accumulating into this existing int16x8 vector.
        //
        // Only then, having processed 16 levels of depth, do we need to
        // horizontally add these int16x8 accumulators into the final
        // int32x4 accumulators.
        //
        // As we do not have enough registers to store all 16 int16x8
        // temporary-16bit-accumulators, we have them cycle through v8--v15.
        //
        //
        // Register layout (ignoring the v8--v15 temporary 16bit accumulators):
        //
        //                               +--------+--------+--------+--------+
        //                               |v0.b[0] |v1.b[0] |v2.b[0] |v3.b[0] |
        //                          Rhs  +--------+--------+--------+--------+
        //                               |  ...   |  ...   |  ...   |  ...   |
        //                               +--------+--------+--------+--------+
        //                               |v0.b[15]|v1.b[15]|v2.b[15]|v3.b[15]|
        //                               +--------+--------+--------+--------+
        //
        //                               |        |        |        |        |
        //
        //    Lhs                        |        |        |        |        |
        //
        //  +-------+-----+--------+ - - +--------+--------+--------+--------+
        //  |v4.b[0]| ... |v4.b[15]|     | v16.4s | v17.4s | v18.4s | v19.4s |
        //  |v5.b[0]| ... |v5.b[15]|     | v20.4s | v21.4s | v22.4s | v23.4s |
        //  |v6.b[0]| ... |v6.b[15]|     | v24.4s | v25.4s | v26.4s | v27.4s |
        //  |v7.b[0]| ... |v7.b[15]|     | v28.4s | v29.4s | v30.4s | v31.4s |
        //  +-------+--------------+ - - +--------+--------+--------+--------+
        //
        //                                                Accumulator
        //

        // Some multiplications and 16-bit accumulation were already done above,
        // so we start right away in the middle.
        // First half of the iteration: fold the pending v4/v5 products
        // (v8--v15) into v16--v23, and reuse v8--v15 for the v6/v7 products.
        // v4 and v5 are free at this point, so the next Lhs data is loaded
        // into them right away.
        "sadalp  v16.4s, v8.8h\n"
        "ld1 {v4.16b}, [%[lhs_ptr]], #16\n"
        "smull    v8.8h,  v0.8b,  v6.8b\n"
        "sadalp  v17.4s, v9.8h\n"
        "ld1 {v5.16b}, [%[lhs_ptr]], #16\n"
        "smull    v9.8h,  v1.8b,  v6.8b\n"
        "sadalp  v18.4s, v10.8h\n"
        "smull    v10.8h,  v2.8b,  v6.8b\n"
        "sadalp  v19.4s, v11.8h\n"
        "smull    v11.8h,  v3.8b,  v6.8b\n"
        "sadalp  v20.4s, v12.8h\n"
        "smull    v12.8h,  v0.8b,  v7.8b\n"
        "sadalp  v21.4s, v13.8h\n"
        "smull    v13.8h,  v1.8b,  v7.8b\n"
        "sadalp  v22.4s, v14.8h\n"
        "smull    v14.8h,  v2.8b,  v7.8b\n"
        "sadalp  v23.4s, v15.8h\n"
        "smull    v15.8h,  v3.8b,  v7.8b\n"

        // Multiply-accumulate second-half, again into the same
        // 16bit local accumulator registers. This is where we
        // take advantage of having int8 instead of uint8 and therefore
        // being able to accumulate two products into int16.
        "smlal2   v8.8h,  v0.16b,  v6.16b\n"
        "smlal2   v9.8h,  v1.16b,  v6.16b\n"
        "smlal2   v10.8h,  v2.16b,  v6.16b\n"
        "smlal2   v11.8h,  v3.16b,  v6.16b\n"

        // v6 has just been used for the last time in this iteration:
        // reload it with the next Lhs data.
        "ld1 {v6.16b}, [%[lhs_ptr]], #16\n"

        // Each Rhs register is reloaded immediately after its last use
        // against v7.
        "smlal2   v12.8h,  v0.16b,  v7.16b\n"
        "ld1 {v0.16b}, [%[rhs_ptr]], #16\n"
        "smlal2   v13.8h,  v1.16b,  v7.16b\n"
        "ld1 {v1.16b}, [%[rhs_ptr]], #16\n"
        "smlal2   v14.8h,  v2.16b,  v7.16b\n"
        "ld1 {v2.16b}, [%[rhs_ptr]], #16\n"
        "smlal2   v15.8h,  v3.16b,  v7.16b\n"
        "ld1 {v3.16b}, [%[rhs_ptr]], #16\n"

        // Second half of the iteration: fold the v6/v7 products into
        // v24--v31 and start on the freshly loaded v0--v3 / v4 / v5, i.e. on
        // the next 16 levels of depth. v7 is reloaded along the way.
        "sadalp  v24.4s, v8.8h\n"
        "smull    v8.8h,  v0.8b,  v4.8b\n"
        "sadalp  v25.4s, v9.8h\n"
        "ld1 {v7.16b}, [%[lhs_ptr]], #16\n"
        "smull    v9.8h,  v1.8b,  v4.8b\n"
        "sadalp  v26.4s, v10.8h\n"
        "smull    v10.8h,  v2.8b,  v4.8b\n"
        "sadalp  v27.4s, v11.8h\n"
        "smull    v11.8h,  v3.8b,  v4.8b\n"
        "sadalp  v28.4s, v12.8h\n"
        "smull    v12.8h,  v0.8b,  v5.8b\n"
        "sadalp  v29.4s, v13.8h\n"
        "smull    v13.8h,  v1.8b,  v5.8b\n"
        "sadalp  v30.4s, v14.8h\n"
        "smull    v14.8h,  v2.8b,  v5.8b\n"
        "sadalp  v31.4s, v15.8h\n"
        "smull    v15.8h,  v3.8b,  v5.8b\n"

        // Multiply-accumulate second-half, again into the same
        // 16bit local accumulator registers. This is where we
        // take advantage of having int8 instead of uint8 and therefore
        // being able to accumulate two products into int16.
        "smlal2   v8.8h,  v0.16b,  v4.16b\n"
        "smlal2   v9.8h,  v1.16b,  v4.16b\n"
        "smlal2   v10.8h,  v2.16b,  v4.16b\n"
        "smlal2   v11.8h,  v3.16b,  v4.16b\n"

        // Loop. Decrement loop index (depth) by 16, since we just handled
        // 16 levels of depth.  Do this subs a bit before the end of the loop
        // for better dispatch on A57.
        "subs %[run_depth], %[run_depth], #16\n"

        // The NEON instructions below do not touch the condition flags, so
        // the result of the subs above is still what the bne tests.
        "smlal2   v12.8h,  v0.16b,  v5.16b\n"
        "smlal2   v13.8h,  v1.16b,  v5.16b\n"
        "smlal2   v14.8h,  v2.16b,  v5.16b\n"
        "smlal2   v15.8h,  v3.16b,  v5.16b\n"

        "bne " GEMMLOWP_LABEL_LOOP "b\n"

        // Final code for the last 16 levels of depth.
        // There is nothing to load anymore, only some arithmetic to finish.
        GEMMLOWP_LABEL_AFTER_LOOP_LAST16
        ":\n"

        // Some multiplications and 16-bit accumulation were already done above,
        // so we start right away in the middle.
        "sadalp  v16.4s, v8.8h\n"
        "smull    v8.8h,  v0.8b,  v6.8b\n"
        "sadalp  v17.4s, v9.8h\n"
        "smull    v9.8h,  v1.8b,  v6.8b\n"
        "sadalp  v18.4s, v10.8h\n"
        "smull    v10.8h,  v2.8b,  v6.8b\n"
        "sadalp  v19.4s, v11.8h\n"
        "smull    v11.8h,  v3.8b,  v6.8b\n"
        "sadalp  v20.4s, v12.8h\n"
        "smull    v12.8h,  v0.8b,  v7.8b\n"
        "sadalp  v21.4s, v13.8h\n"
        "smull    v13.8h,  v1.8b,  v7.8b\n"
        "sadalp  v22.4s, v14.8h\n"
        "smull    v14.8h,  v2.8b,  v7.8b\n"
        "sadalp  v23.4s, v15.8h\n"
        "smull    v15.8h,  v3.8b,  v7.8b\n"

        // Multiply-accumulate second-half, again into the same
        // 16bit local accumulator registers. This is where we
        // take advantage of having int8 instead of uint8 and therefore
        // being able to accumulate two products into int16.
        "smlal2   v8.8h,  v0.16b,  v6.16b\n"
        "smlal2   v9.8h,  v1.16b,  v6.16b\n"
        "smlal2   v10.8h,  v2.16b,  v6.16b\n"
        "smlal2   v11.8h,  v3.16b,  v6.16b\n"
        "smlal2   v12.8h,  v0.16b,  v7.16b\n"
        "smlal2   v13.8h,  v1.16b,  v7.16b\n"
        "smlal2   v14.8h,  v2.16b,  v7.16b\n"
        "smlal2   v15.8h,  v3.16b,  v7.16b\n"

        // Fold the last set of 16-bit temporaries into v24--v31.
        "sadalp  v24.4s, v8.8h\n"
        "sadalp  v25.4s, v9.8h\n"
        "sadalp  v26.4s, v10.8h\n"
        "sadalp  v27.4s, v11.8h\n"
        "sadalp  v28.4s, v12.8h\n"
        "sadalp  v29.4s, v13.8h\n"
        "sadalp  v30.4s, v14.8h\n"
        "sadalp  v31.4s, v15.8h\n"

        // Reduce 32bit accumulators horizontally.
        // The operand registers v0--v7 are no longer needed and are reused
        // as scratch. Each addp pairs up two accumulators that share an Rhs
        // register, e.g. v0 = { pairwise sums of v16, pairwise sums of v20 }.
        "addp v0.4s, v16.4s, v20.4s\n"
        "addp v2.4s, v17.4s, v21.4s\n"
        "addp v4.4s, v18.4s, v22.4s\n"
        "addp v6.4s, v19.4s, v23.4s\n"
        "addp v1.4s, v24.4s, v28.4s\n"
        "addp v3.4s, v25.4s, v29.4s\n"
        "addp v5.4s, v26.4s, v30.4s\n"
        "addp v7.4s, v27.4s, v31.4s\n"

        // A nonzero start_depth means the destination already holds partial
        // results that must be added to rather than overwritten.
        "cmp %[start_depth], #0\n"
        "bne " GEMMLOWP_LABEL_ACCUMULATE_EXISTING_DST_VALUES
        "f\n"

        // Reduce 32bit accumulators horizontally, second pass
        // (each pass adds pairwise. we need to add 4-wise).
        // After this pass v12 holds the full sums of v16, v20, v24 and v28
        // (and likewise v13, v14, v15 for the other three Rhs registers),
        // which is exactly what gets stored by the store code below.
        "addp v12.4s, v0.4s, v1.4s\n"
        "addp v13.4s, v2.4s, v3.4s\n"
        "addp v14.4s, v4.4s, v5.4s\n"
        "addp v15.4s, v6.4s, v7.4s\n"

        "b " GEMMLOWP_LABEL_STORE "f\n"

        GEMMLOWP_LABEL_ACCUMULATE_EXISTING_DST_VALUES
        ":\n"

        // Reduce 32bit accumulators horizontally, second pass
        // (each pass adds pairwise. we need to add 4-wise),
        // and load destination values from memory.
        // Here the reduced sums go to v8--v11 while the existing destination
        // values are loaded into v12--v15; x0 walks the destination in steps
        // of dst_col_stride bytes.
        "mov x0, %[dst_ptr]\n"
        "ld1 {v12.16b}, [x0], %[dst_col_stride]\n"
        "addp v8.4s, v0.4s, v1.4s\n"
        "ld1 {v13.16b}, [x0], %[dst_col_stride]\n"
        "addp v9.4s, v2.4s, v3.4s\n"
        "ld1 {v14.16b}, [x0], %[dst_col_stride]\n"
        "addp v10.4s, v4.4s, v5.4s\n"
        "ld1 {v15.16b}, [x0]\n"
        "addp v11.4s, v6.4s, v7.4s\n"

        // Add horizontally-reduced accumulators into
        // the values loaded from memory
        "add v12.4s, v12.4s, v8.4s\n"
        "add v13.4s, v13.4s, v9.4s\n"
        "add v14.4s, v14.4s, v10.4s\n"
        "add v15.4s, v15.4s, v11.4s\n"

        GEMMLOWP_LABEL_STORE
        ":\n"
        // Store back into memory
        // Both paths arrive here with the final values in v12--v15.
        "mov x0, %[dst_ptr]\n"
        "st1 {v12.16b}, [x0], %[dst_col_stride]\n"
        "st1 {v13.16b}, [x0], %[dst_col_stride]\n"
        "st1 {v14.16b}, [x0], %[dst_col_stride]\n"
        "st1 {v15.16b}, [x0]\n"
        :  // outputs
        // lhs_ptr and rhs_ptr are advanced by the post-incrementing loads,
        // run_depth is counted down, and dst_col_stride is rescaled in
        // place, hence the read-write constraints. dst_ptr is only ever
        // copied into x0 by the asm.
        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
        [dst_ptr] "+r"(dst_ptr), [run_depth] "+r"(run_depth),
        [dst_col_stride] "+r"(dst_col_stride)
        :  // inputs
        [start_depth] "r"(start_depth)
        :  // clobbers
        // Condition flags (subs/cmp), memory (destination stores), the x0
        // scratch pointer and every NEON register v0--v31.
        "cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
        "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
        "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
        "v28", "v29", "v30", "v31");
#undef GEMMLOWP_LABEL_LOOP
#undef GEMMLOWP_LABEL_AFTER_LOOP_LAST16
#undef GEMMLOWP_LABEL_ACCUMULATE_EXISTING_DST_VALUES
#undef GEMMLOWP_LABEL_STORE
  }
};
1278 
// Same as NEON_64bit_GEMM_Int8Operands_LhsNonzero, but uses a side format that
// requires that user inputs were originally int8. This avoids the uint8->int8
// conversion in the pack step.
1282 struct NEON_64bit_GEMM_Int8Operands_LhsNonzero_Int8Inputs
1283     : NEON_64bit_GEMM_Int8Operands_LhsNonzero {
1284   typedef KernelFormat<
1285       KernelSideFormatInt8Inputs<CellFormat<4, 16, CellOrder::WidthMajor>, 1>,
1286       KernelSideFormatInt8Inputs<CellFormat<4, 16, CellOrder::WidthMajor>, 1> >
1287       Format;
1288 };
1289 
1290 // Our main GEMM kernel.
1291 struct NEON_64_Kernel12x8Depth2 : KernelBase {
1292   typedef KernelFormat<KernelSideFormat<CellFormat<4, 2>, 3>,
1293                        KernelSideFormat<CellFormat<4, 2>, 2> >
1294       Format;
1295 
NameNEON_64_Kernel12x8Depth21296   const char* Name() const override { return "NEON, 12x8, depth 2"; }
1297 
1298   // TODO(benoitjacob): reorder function arguments so dst comes last
RunNEON_64_Kernel12x8Depth21299   void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride,
1300            std::size_t dst_col_stride, const std::uint8_t* lhs_ptr,
1301            const std::uint8_t* rhs_ptr, std::size_t start_depth,
1302            std::size_t run_depth) const override {
1303     (void)dst_row_stride;
1304     ScopedProfilingLabel label("optimized kernel (NEON 12x8)");
1305 // See comments above for why we need local numerical labels in our asm.
1306 #define GEMMLOWP_LABEL_CLEAR_ACCUMULATORS "1"
1307 #define GEMMLOWP_LABEL_BEFORE_LOOP "2"
1308 #define GEMMLOWP_LABEL_LOOP "3"
1309 #define GEMMLOWP_LABEL_AFTER_LOOP "4"
1310 
1311     assert(dst_row_stride == 1);
1312     asm volatile(
1313         // Load 1 Rhs cell of size 2x8
1314         "ld1 {v5.8b}, [%[rhs_ptr]], #8\n"
1315         "ld1 {v6.8b}, [%[rhs_ptr]], #8\n"
1316 
1317         // Load 3 Lhs cells of size 4x2 each
1318         "ld1 {v2.8b}, [%[lhs_ptr]], #8\n"
1319         "ld1 {v3.8b}, [%[lhs_ptr]], #8\n"
1320         "ld1 {v4.8b}, [%[lhs_ptr]], #8\n"
1321 
1322         // Multiply dst_col_stride by 4 == sizeof(int32) to use
1323         // it as a byte offset below.
1324         "lsl %[dst_col_stride], %[dst_col_stride], #2\n"
1325 
1326         "cmp %[start_depth], #0\n"
1327         "beq " GEMMLOWP_LABEL_CLEAR_ACCUMULATORS
1328         "f\n"
1329 
1330         // Load accumulators
1331         "mov x1, %[dst_ptr]\n"
1332         "mov x0, x1\n"
1333         "ld1 {v8.16b}, [x0], #16\n"
1334         "subs %[run_depth], %[run_depth], #2\n"
1335         "ld1 {v16.16b}, [x0], #16\n"
1336         "add x1, x1, %[dst_col_stride]\n"
1337         "ld1 {v24.16b}, [x0]\n"
1338         "mov x0, x1\n"
1339         "ld1 {v9.16b}, [x0], #16\n"
1340         "add x1, x1, %[dst_col_stride]\n"
1341         "ld1 {v17.16b}, [x0], #16\n"
1342         "ld1 {v25.16b}, [x0]\n"
1343         "mov x0, x1\n"
1344         "ld1 {v10.16b}, [x0], #16\n"
1345         "add x1, x1, %[dst_col_stride]\n"
1346         "ld1 {v18.16b}, [x0], #16\n"
1347         "ld1 {v26.16b}, [x0]\n"
1348         "mov x0, x1\n"
1349         "ld1 {v11.16b}, [x0], #16\n"
1350         "add x1, x1, %[dst_col_stride]\n"
1351         "ld1 {v19.16b}, [x0], #16\n"
1352         "ld1 {v27.16b}, [x0]\n"
1353         "mov x0, x1\n"
1354         "ld1 {v12.16b}, [x0], #16\n"
1355         "add x1, x1, %[dst_col_stride]\n"
1356         "ld1 {v20.16b}, [x0], #16\n"
1357         "ld1 {v28.16b}, [x0]\n"
1358         "mov x0, x1\n"
1359         "ld1 {v13.16b}, [x0], #16\n"
1360         "add x1, x1, %[dst_col_stride]\n"
1361         "ld1 {v21.16b}, [x0], #16\n"
1362         "ld1 {v29.16b}, [x0]\n"
1363         "mov x0, x1\n"
1364         "ld1 {v14.16b}, [x0], #16\n"
1365         "add x1, x1, %[dst_col_stride]\n"
1366         "ld1 {v22.16b}, [x0], #16\n"
1367         "ld1 {v30.16b}, [x0]\n"
1368         "mov x0, x1\n"
1369         "ld1 {v15.16b}, [x0], #16\n"
1370         "ld1 {v23.16b}, [x0], #16\n"
1371         "ld1 {v31.16b}, [x0]\n"
1372 
1373         "b " GEMMLOWP_LABEL_BEFORE_LOOP "f\n"
1374 
1375         GEMMLOWP_LABEL_CLEAR_ACCUMULATORS
1376         ":\n"
1377 
1378         // Clear accumulator registers (see layout below)
1379         "dup v8.4s, wzr\n"
1380         "subs %[run_depth], %[run_depth], #2\n"
1381         "dup v9.4s, wzr\n"
1382         "dup v10.4s, wzr\n"
1383         "dup v11.4s, wzr\n"
1384         "dup v12.4s, wzr\n"
1385         "dup v13.4s, wzr\n"
1386         "dup v14.4s, wzr\n"
1387         "dup v15.4s, wzr\n"
1388         "dup v16.4s, wzr\n"
1389         "dup v17.4s, wzr\n"
1390         "dup v18.4s, wzr\n"
1391         "dup v19.4s, wzr\n"
1392         "dup v20.4s, wzr\n"
1393         "dup v21.4s, wzr\n"
1394         "dup v22.4s, wzr\n"
1395         "dup v23.4s, wzr\n"
1396         "dup v24.4s, wzr\n"
1397         "dup v25.4s, wzr\n"
1398         "dup v26.4s, wzr\n"
1399         "dup v27.4s, wzr\n"
1400         "dup v28.4s, wzr\n"
1401         "dup v29.4s, wzr\n"
1402         "dup v30.4s, wzr\n"
1403         "dup v31.4s, wzr\n"
1404 
1405         GEMMLOWP_LABEL_BEFORE_LOOP
1406         ":\n"
1407 
1408         "beq " GEMMLOWP_LABEL_AFTER_LOOP "f\n"
1409 
1410         GEMMLOWP_LABEL_LOOP
1411         ":\n"
1412 
1413         // Overview of register layout:
1414         //
1415         // A 2x8 block of 2 2x4 cells of Rhs is stored in 16bit in v0--v1.
1416         // A 12x2 block of 3 4x2 cells Lhs is stored in 16bit in v2--v4.
1417         // A 12x8 block of accumulators is stored in 32bit in v8--v31.
1418         //
1419         //                         +--------+--------+-----+--------+--------+
1420         //                         |v0.h[0] |v0.h[1] | ... |v1.h[2] |v1.h[3] |
1421         //                    Rhs  +--------+--------+-----+--------+--------+
1422         //                         |v0.h[4] |v0.h[5] | ... |v1.h[6] |v1.h[7] |
1423         //                         +--------+--------+-----+--------+--------+
1424         //
1425         //                         |        |        |     |        |        |
1426         //
1427         //    Lhs                  |        |        |     |        |        |
1428         //
1429         //  +-------+-------+ - -  +--------+--------+-----+--------+--------+
1430         //  |v2.h[0]|v2.h[4]|      |v8.s[0] |v9.s[0] | ... |v14.s[0]|v15.s[0]|
1431         //  |v2.h[1]|v2.h[5]|      |v8.s[1] |v9.s[1] | ... |v14.s[1]|v15.s[1]|
1432         //  |v2.h[2]|v2.h[6]|      |v8.s[2] |v9.s[2] | ... |v14.s[2]|v15.s[2]|
1433         //  |v2.h[3]|v2.h[7]|      |v8.s[3] |v9.s[3] | ... |v14.s[3]|v15.s[3]|
1434         //  +-------+-------+ - -  +--------+--------+-----+--------+--------+
1435         //  |v3.h[0]|v3.h[4]|      |v16.s[0]|v17.s[0]| ... |v22.s[0]|v23.s[0]|
1436         //  |v3.h[1]|v3.h[5]|      |v16.s[1]|v17.s[1]| ... |v22.s[1]|v23.s[1]|
1437         //  |v3.h[2]|v3.h[6]|      |v16.s[2]|v17.s[2]| ... |v22.s[2]|v23.s[2]|
1438         //  |v3.h[3]|v3.h[7]|      |v16.s[3]|v17.s[3]| ... |v22.s[3]|v23.s[3]|
1439         //  +-------+-------+ - -  +--------+--------+-----+--------+--------+
1440         //  |v4.h[0]|v4.h[4]|      |v24.s[0]|v25.s[0]| ... |v30.s[0]|v31.s[0]|
1441         //  |v4.h[1]|v4.h[5]|      |v24.s[1]|v25.s[1]| ... |v30.s[1]|v31.s[1]|
1442         //  |v4.h[2]|v4.h[6]|      |v24.s[2]|v25.s[2]| ... |v30.s[2]|v31.s[2]|
1443         //  |v4.h[3]|v4.h[7]|      |v24.s[3]|v25.s[3]| ... |v30.s[3]|v31.s[3]|
1444         //  +-------+-------+ - -  +--------+--------+-----+--------+--------+
1445         //
1446         //                            Accumulator
1447 
1448         // Expand Lhs/Rhs cells to 16 bit.
1449         "uxtl v0.8h, v5.8b\n"
1450         "ld1 {v5.8b}, [%[rhs_ptr]], #8\n"
1451         "uxtl v1.8h, v6.8b\n"
1452         "ld1 {v6.8b}, [%[rhs_ptr]], #8\n"
1453         "uxtl v2.8h, v2.8b\n"
1454         "uxtl v3.8h, v3.8b\n"
1455         "uxtl v4.8h, v4.8b\n"
1456 
1457         // Multiply-accumulate, top third
1458         "umlal v8.4s, v2.4h, v0.h[0]\n"
1459         "umlal v9.4s, v2.4h, v0.h[1]\n"
1460         "umlal v10.4s, v2.4h, v0.h[2]\n"
1461         "umlal v11.4s, v2.4h, v0.h[3]\n"
1462         "umlal v12.4s, v2.4h, v1.h[0]\n"
1463         "umlal v13.4s, v2.4h, v1.h[1]\n"
1464         "umlal v14.4s, v2.4h, v1.h[2]\n"
1465         "umlal v15.4s, v2.4h, v1.h[3]\n"
1466         "umlal2 v8.4s, v2.8h, v0.h[4]\n"
1467         "umlal2 v9.4s, v2.8h, v0.h[5]\n"
1468         "umlal2 v10.4s, v2.8h, v0.h[6]\n"
1469         "umlal2 v11.4s, v2.8h, v0.h[7]\n"
1470         "umlal2 v12.4s, v2.8h, v1.h[4]\n"
1471         "umlal2 v13.4s, v2.8h, v1.h[5]\n"
1472         "umlal2 v14.4s, v2.8h, v1.h[6]\n"
1473         "umlal2 v15.4s, v2.8h, v1.h[7]\n"
1474         "ld1 {v2.8b}, [%[lhs_ptr]], #8\n"
1475 
1476         // Multiply-accumulate, middle third
1477         "umlal v16.4s, v3.4h, v0.h[0]\n"
1478         "umlal v17.4s, v3.4h, v0.h[1]\n"
1479         "umlal v18.4s, v3.4h, v0.h[2]\n"
1480         "umlal v19.4s, v3.4h, v0.h[3]\n"
1481         "umlal v20.4s, v3.4h, v1.h[0]\n"
1482         "umlal v21.4s, v3.4h, v1.h[1]\n"
1483         "umlal v22.4s, v3.4h, v1.h[2]\n"
1484         "umlal v23.4s, v3.4h, v1.h[3]\n"
1485         "umlal2 v16.4s, v3.8h, v0.h[4]\n"
1486         "umlal2 v17.4s, v3.8h, v0.h[5]\n"
1487         "umlal2 v18.4s, v3.8h, v0.h[6]\n"
1488         "umlal2 v19.4s, v3.8h, v0.h[7]\n"
1489         "umlal2 v20.4s, v3.8h, v1.h[4]\n"
1490         "umlal2 v21.4s, v3.8h, v1.h[5]\n"
1491         "umlal2 v22.4s, v3.8h, v1.h[6]\n"
1492         "umlal2 v23.4s, v3.8h, v1.h[7]\n"
1493         "ld1 {v3.8b}, [%[lhs_ptr]], #8\n"
1494 
1495         "subs %[run_depth], %[run_depth], #2\n"
1496 
1497         // Multiply-accumulate, bottom third
1498         "umlal v24.4s, v4.4h, v0.h[0]\n"
1499         "umlal v25.4s, v4.4h, v0.h[1]\n"
1500         "umlal v26.4s, v4.4h, v0.h[2]\n"
1501         "umlal v27.4s, v4.4h, v0.h[3]\n"
1502         "umlal v28.4s, v4.4h, v1.h[0]\n"
1503         "umlal v29.4s, v4.4h, v1.h[1]\n"
1504         "umlal v30.4s, v4.4h, v1.h[2]\n"
1505         "umlal v31.4s, v4.4h, v1.h[3]\n"
1506         "umlal2 v24.4s, v4.8h, v0.h[4]\n"
1507         "umlal2 v25.4s, v4.8h, v0.h[5]\n"
1508         "umlal2 v26.4s, v4.8h, v0.h[6]\n"
1509         "umlal2 v27.4s, v4.8h, v0.h[7]\n"
1510         "umlal2 v28.4s, v4.8h, v1.h[4]\n"
1511         "umlal2 v29.4s, v4.8h, v1.h[5]\n"
1512         "umlal2 v30.4s, v4.8h, v1.h[6]\n"
1513         "umlal2 v31.4s, v4.8h, v1.h[7]\n"
1514         "ld1 {v4.8b}, [%[lhs_ptr]], #8\n"
1515 
1516         "bne " GEMMLOWP_LABEL_LOOP "b\n"
1517 
1518         GEMMLOWP_LABEL_AFTER_LOOP
1519         ":\n"
1520 
1521         // Expand Lhs/Rhs cells to 16 bit.
1522         "uxtl v0.8h, v5.8b\n"
1523         "uxtl v1.8h, v6.8b\n"
1524         "uxtl v2.8h, v2.8b\n"
1525         "uxtl v3.8h, v3.8b\n"
1526         "uxtl v4.8h, v4.8b\n"
1527 
1528         // Multiply-accumulate, level of depth 0
1529         "umlal v8.4s, v2.4h, v0.h[0]\n"
1530         "umlal v9.4s, v2.4h, v0.h[1]\n"
1531         "umlal v10.4s, v2.4h, v0.h[2]\n"
1532         "umlal v11.4s, v2.4h, v0.h[3]\n"
1533         "umlal v12.4s, v2.4h, v1.h[0]\n"
1534         "umlal v13.4s, v2.4h, v1.h[1]\n"
1535         "umlal v14.4s, v2.4h, v1.h[2]\n"
1536         "umlal v15.4s, v2.4h, v1.h[3]\n"
1537         "umlal v16.4s, v3.4h, v0.h[0]\n"
1538         "umlal v17.4s, v3.4h, v0.h[1]\n"
1539         "umlal v18.4s, v3.4h, v0.h[2]\n"
1540         "umlal v19.4s, v3.4h, v0.h[3]\n"
1541         "umlal v20.4s, v3.4h, v1.h[0]\n"
1542         "umlal v21.4s, v3.4h, v1.h[1]\n"
1543         "umlal v22.4s, v3.4h, v1.h[2]\n"
1544         "umlal v23.4s, v3.4h, v1.h[3]\n"
1545         "umlal v24.4s, v4.4h, v0.h[0]\n"
1546         "umlal v25.4s, v4.4h, v0.h[1]\n"
1547         "umlal v26.4s, v4.4h, v0.h[2]\n"
1548         "umlal v27.4s, v4.4h, v0.h[3]\n"
1549         "umlal v28.4s, v4.4h, v1.h[0]\n"
1550         "umlal v29.4s, v4.4h, v1.h[1]\n"
1551         "umlal v30.4s, v4.4h, v1.h[2]\n"
1552         "umlal v31.4s, v4.4h, v1.h[3]\n"
1553 
1554         // Multiply-accumulate, level of depth 1
1555         "umlal2 v8.4s, v2.8h, v0.h[4]\n"
1556         "umlal2 v9.4s, v2.8h, v0.h[5]\n"
1557         "umlal2 v10.4s, v2.8h, v0.h[6]\n"
1558         "umlal2 v11.4s, v2.8h, v0.h[7]\n"
1559         "umlal2 v12.4s, v2.8h, v1.h[4]\n"
1560         "umlal2 v13.4s, v2.8h, v1.h[5]\n"
1561         "umlal2 v14.4s, v2.8h, v1.h[6]\n"
1562         "umlal2 v15.4s, v2.8h, v1.h[7]\n"
1563         "umlal2 v16.4s, v3.8h, v0.h[4]\n"
1564         "umlal2 v17.4s, v3.8h, v0.h[5]\n"
1565         "umlal2 v18.4s, v3.8h, v0.h[6]\n"
1566         "umlal2 v19.4s, v3.8h, v0.h[7]\n"
1567         "umlal2 v20.4s, v3.8h, v1.h[4]\n"
1568         "umlal2 v21.4s, v3.8h, v1.h[5]\n"
1569         "umlal2 v22.4s, v3.8h, v1.h[6]\n"
1570         "umlal2 v23.4s, v3.8h, v1.h[7]\n"
1571         "umlal2 v24.4s, v4.8h, v0.h[4]\n"
1572         "umlal2 v25.4s, v4.8h, v0.h[5]\n"
1573         "umlal2 v26.4s, v4.8h, v0.h[6]\n"
1574         "umlal2 v27.4s, v4.8h, v0.h[7]\n"
1575         "umlal2 v28.4s, v4.8h, v1.h[4]\n"
1576         "umlal2 v29.4s, v4.8h, v1.h[5]\n"
1577         "umlal2 v30.4s, v4.8h, v1.h[6]\n"
1578         "umlal2 v31.4s, v4.8h, v1.h[7]\n"
1579 
1580         // Store accumulators
1581         "mov x1, %[dst_ptr]\n"
1582         "mov x0, x1\n"
1583         "st1 {v8.16b}, [x0], #16\n"
1584         "subs %[run_depth], %[run_depth], #2\n"
1585         "st1 {v16.16b}, [x0], #16\n"
1586         "add x1, x1, %[dst_col_stride]\n"
1587         "st1 {v24.16b}, [x0]\n"
1588         "mov x0, x1\n"
1589         "st1 {v9.16b}, [x0], #16\n"
1590         "add x1, x1, %[dst_col_stride]\n"
1591         "st1 {v17.16b}, [x0], #16\n"
1592         "st1 {v25.16b}, [x0]\n"
1593         "mov x0, x1\n"
1594         "st1 {v10.16b}, [x0], #16\n"
1595         "add x1, x1, %[dst_col_stride]\n"
1596         "st1 {v18.16b}, [x0], #16\n"
1597         "st1 {v26.16b}, [x0]\n"
1598         "mov x0, x1\n"
1599         "st1 {v11.16b}, [x0], #16\n"
1600         "add x1, x1, %[dst_col_stride]\n"
1601         "st1 {v19.16b}, [x0], #16\n"
1602         "st1 {v27.16b}, [x0]\n"
1603         "mov x0, x1\n"
1604         "st1 {v12.16b}, [x0], #16\n"
1605         "add x1, x1, %[dst_col_stride]\n"
1606         "st1 {v20.16b}, [x0], #16\n"
1607         "st1 {v28.16b}, [x0]\n"
1608         "mov x0, x1\n"
1609         "st1 {v13.16b}, [x0], #16\n"
1610         "add x1, x1, %[dst_col_stride]\n"
1611         "st1 {v21.16b}, [x0], #16\n"
1612         "st1 {v29.16b}, [x0]\n"
1613         "mov x0, x1\n"
1614         "st1 {v14.16b}, [x0], #16\n"
1615         "add x1, x1, %[dst_col_stride]\n"
1616         "st1 {v22.16b}, [x0], #16\n"
1617         "st1 {v30.16b}, [x0]\n"
1618         "mov x0, x1\n"
1619         "st1 {v15.16b}, [x0], #16\n"
1620         "st1 {v23.16b}, [x0], #16\n"
1621         "st1 {v31.16b}, [x0]\n"
1622 #undef GEMMLOWP_LABEL_CLEAR_ACCUMULATORS
1623 #undef GEMMLOWP_LABEL_BEFORE_LOOP
1624 #undef GEMMLOWP_LABEL_LOOP
1625 #undef GEMMLOWP_LABEL_AFTER_LOOP
1626         :  // outputs
1627         [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
1628         [dst_ptr] "+r"(dst_ptr),
1629         [run_depth] "+r"(run_depth)
1630         :  // inputs
1631         [start_depth] "r"(start_depth),
1632         [dst_col_stride] "r"(dst_col_stride)
1633         :  // clobbers
1634         "cc", "memory", "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
1635         "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16",
1636         "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
1637         "v27", "v28", "v29", "v30", "v31");
1638   }
1639 };
1640 
1641 #ifdef GEMMLOWP_DOTPROD_KERNEL
1642 #ifndef __ARM_FEATURE_DOTPROD
1643 #error This kernel requires ARM dot-product instructions. Enable them by \
1644   adding '+dotprod' to a compiler flag, e.g. -march=armv8.2-a+dotprod . \
1645   Note that Clang up to version 7 fails to define the corresponding \
1646   preprocessor token __ARM_FEATURE_DOTPROD, so you will still have to define \
1647   it manually.
1648 #endif
1649 // Kernels utilizing the Armv8.2 Dot Product extension.
1650 //
1651 // The dot product instructions work by taking 4 consecutive 8-bit depth
1652 // values from each operand, multiplying the 4 pairs together and
1653 // accumulating all the results into the corresponding 32-bit accumulator
1654 // lane.  As such, the operation is identical to a 32-bit instruction (like
1655 // FMLA used in SGEMM), except that 4 depth values are processed at a time
1656 // instead of 1.
1657 
1658 // Thus, this first kernel is a carbon copy of
1659 // "NEON_64bit_GEMM_Float32_WithScalar_A57" (which should provide good
1660 // performance for most processors) below with the opcode (fmla -> udot) and
1661 // types (float32 -> uint8/uint32) changed.
1662 //
1663 // A signed version of this kernel could be produced by replacing "udot"
1664 // with "sdot" - performance should be identical to this udot kernel.
1665 struct NEON_64_Kernel12x8Depth4_dotprod : KernelBase {
1666   typedef KernelFormat<KernelSideFormat<CellFormat<4, 4, CellOrder::WidthMajor>, 3>,
1667                        KernelSideFormat<CellFormat<4, 4, CellOrder::WidthMajor>, 2> >
1668       Format;
1669 
NameNEON_64_Kernel12x8Depth4_dotprod1670   const char* Name() const override { return "NEON, 12x8, depth 4, dotprod"; }
1671 
RunNEON_64_Kernel12x8Depth4_dotprod1672   void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride, std::size_t dst_col_stride,
1673            const std::uint8_t* lhs_ptr, const std::uint8_t* rhs_ptr, std::size_t start_depth,
1674            std::size_t depth) const override {
1675     (void)dst_row_stride;
1676     ScopedProfilingLabel label("optimized kernel (NEON 12x8, depth 4, dotprod)");
1677 // See comments above for why we need local numerical labels in our asm.
1678 #define GEMMLOWP_LABEL_CLEAR_ACCUMULATORS "1"
1679 #define GEMMLOWP_LABEL_BEFORE_LOOP "2"
1680 #define GEMMLOWP_LABEL_LOOP "3"
1681 #define GEMMLOWP_LABEL_AFTER_LOOP "4"
1682 
1683     assert(dst_row_stride == 1);
1684     asm volatile(
1685         // Multiply dst_col_stride by 4 == sizeof(int32) to use
1686         // it as a byte offset below.
1687         "lsl %[dst_col_stride], %[dst_col_stride], #2\n"
1688 
1689         "cmp %[start_depth], #0\n"
1690         "beq " GEMMLOWP_LABEL_CLEAR_ACCUMULATORS "f\n"
1691 
1692         // Load accumulators
1693         "mov x1, %[dst_ptr]\n"
1694         "mov x0, x1\n"
1695         "ld1 {v8.16b}, [x0], #16\n"
1696         "ld1 {v16.16b}, [x0], #16\n"
1697         "add x1, x1, %[dst_col_stride]\n"
1698         "ld1 {v24.16b}, [x0]\n"
1699         "mov x0, x1\n"
1700         "ld1 {v9.16b}, [x0], #16\n"
1701         "add x1, x1, %[dst_col_stride]\n"
1702         "ld1 {v17.16b}, [x0], #16\n"
1703         "ld1 {v25.16b}, [x0]\n"
1704         "mov x0, x1\n"
1705         "ld1 {v10.16b}, [x0], #16\n"
1706         "add x1, x1, %[dst_col_stride]\n"
1707         "ld1 {v18.16b}, [x0], #16\n"
1708         "ld1 {v26.16b}, [x0]\n"
1709         "mov x0, x1\n"
1710         "ld1 {v11.16b}, [x0], #16\n"
1711         "add x1, x1, %[dst_col_stride]\n"
1712         "ld1 {v19.16b}, [x0], #16\n"
1713         "ld1 {v27.16b}, [x0]\n"
1714         "mov x0, x1\n"
1715         "ld1 {v12.16b}, [x0], #16\n"
1716         "add x1, x1, %[dst_col_stride]\n"
1717         "ld1 {v20.16b}, [x0], #16\n"
1718         "ld1 {v28.16b}, [x0]\n"
1719         "mov x0, x1\n"
1720         "ld1 {v13.16b}, [x0], #16\n"
1721         "add x1, x1, %[dst_col_stride]\n"
1722         "ld1 {v21.16b}, [x0], #16\n"
1723         "ld1 {v29.16b}, [x0]\n"
1724         "mov x0, x1\n"
1725         "ld1 {v14.16b}, [x0], #16\n"
1726         "add x1, x1, %[dst_col_stride]\n"
1727         "ld1 {v22.16b}, [x0], #16\n"
1728         "ld1 {v30.16b}, [x0]\n"
1729         "mov x0, x1\n"
1730         "ld1 {v15.16b}, [x0], #16\n"
1731         "ld1 {v23.16b}, [x0], #16\n"
1732         "ld1 {v31.16b}, [x0]\n"
1733 
1734         "b " GEMMLOWP_LABEL_BEFORE_LOOP "f\n"
1735 
1736         GEMMLOWP_LABEL_CLEAR_ACCUMULATORS ":\n"
1737 
1738         // Clear accumulator registers (see layout below)
1739         "dup v8.4s, wzr\n"
1740         "dup v9.4s, wzr\n"
1741         "dup v10.4s, wzr\n"
1742         "dup v11.4s, wzr\n"
1743         "dup v12.4s, wzr\n"
1744         "dup v13.4s, wzr\n"
1745         "dup v14.4s, wzr\n"
1746         "dup v15.4s, wzr\n"
1747         "dup v16.4s, wzr\n"
1748         "dup v17.4s, wzr\n"
1749         "dup v18.4s, wzr\n"
1750         "dup v19.4s, wzr\n"
1751         "dup v20.4s, wzr\n"
1752         "dup v21.4s, wzr\n"
1753         "dup v22.4s, wzr\n"
1754         "dup v23.4s, wzr\n"
1755         "dup v24.4s, wzr\n"
1756         "dup v25.4s, wzr\n"
1757         "dup v26.4s, wzr\n"
1758         "dup v27.4s, wzr\n"
1759         "dup v28.4s, wzr\n"
1760         "dup v29.4s, wzr\n"
1761         "dup v30.4s, wzr\n"
1762         "dup v31.4s, wzr\n"
1763 
1764         GEMMLOWP_LABEL_BEFORE_LOOP ":\n"
1765 
1766         "subs %w[depth], %w[depth], #4\n"
1767 
1768         // The start of the loop assumes first Rhs cell is already loaded, so
1769         // do it here for first iteration.
1770         "ld1 {v0.16b}, [%[rhs_ptr]], #16\n"
1771 
1772         // And the same for the first Lhs cell.
1773         "ld1 {v2.16b}, [%[lhs_ptr]], #16\n"
1774 
1775         "beq " GEMMLOWP_LABEL_AFTER_LOOP "f\n"
1776 
1777         GEMMLOWP_LABEL_LOOP ":\n"
1778 
1779         // Start the MACs at the head of the loop - 1st cell from each side
1780         // already loaded.
1781         ".word 0x6f80e048  // udot v8.4s, v2.16b, v0.4b[0]\n"
1782         ".word 0x6fa0e049  // udot v9.4s, v2.16b, v0.4b[1]\n"
1783         "ld1 {v1.16b}, [%[rhs_ptr]], #16\n"  // Load second Rhs cell.
1784         ".word 0x6f80e84a  // udot v10.4s, v2.16b, v0.4b[2]\n"
1785         ".word 0x6fa0e84b  // udot v11.4s, v2.16b, v0.4b[3]\n"
1786         "ld1 {v3.16b}, [%[lhs_ptr]], #16\n"  // Load second Lhs cell.
1787         ".word 0x6f81e04c  // udot v12.4s, v2.16b, v1.4b[0]\n"
1788         ".word 0x6fa1e04d  // udot v13.4s, v2.16b, v1.4b[1]\n"
1789         "ld1 {v4.16b}, [%[lhs_ptr]], #16\n"  // Load third Lhs cell.
1790         ".word 0x6f81e84e  // udot v14.4s, v2.16b, v1.4b[2]\n"
1791         ".word 0x6fa1e84f  // udot v15.4s, v2.16b, v1.4b[3]\n"
1792         "ld1 {v2.16b}, [%[lhs_ptr]], #16\n"  // Done with first Lhs cell - load
1793         // for the next iteration early.
1794         ".word 0x6f80e070  // udot v16.4s, v3.16b, v0.4b[0]\n"
1795         ".word 0x6fa0e071  // udot v17.4s, v3.16b, v0.4b[1]\n"
1796         ".word 0x6f80e872  // udot v18.4s, v3.16b, v0.4b[2]\n"
1797         ".word 0x6fa0e873  // udot v19.4s, v3.16b, v0.4b[3]\n"
1798         ".word 0x6f81e074  // udot v20.4s, v3.16b, v1.4b[0]\n"
1799         ".word 0x6fa1e075  // udot v21.4s, v3.16b, v1.4b[1]\n"
1800         ".word 0x6f81e876  // udot v22.4s, v3.16b, v1.4b[2]\n"
1801         ".word 0x6fa1e877  // udot v23.4s, v3.16b, v1.4b[3]\n"
1802         ".word 0x6f80e098  // udot v24.4s, v4.16b, v0.4b[0]\n"
1803         ".word 0x6fa0e099  // udot v25.4s, v4.16b, v0.4b[1]\n"
1804         ".word 0x6f80e89a  // udot v26.4s, v4.16b, v0.4b[2]\n"
1805         ".word 0x6fa0e89b  // udot v27.4s, v4.16b, v0.4b[3]\n"
1806         "ld1 {v0.16b}, [%[rhs_ptr]], #16\n"  // Done with the first Rhs cell -
1807         // load for the next iteration early.
1808         ".word 0x6f81e09c  // udot v28.4s, v4.16b, v1.4b[0]\n"
1809         ".word 0x6fa1e09d  // udot v29.4s, v4.16b, v1.4b[1]\n"
1810 
1811         // Loop.  Decrement loop index (depth) by 4 as udot processes 4
1812         // depth values.
1813         "subs %w[depth], %w[depth], #4\n"
1814         ".word 0x6f81e89e  // udot v30.4s, v4.16b, v1.4b[2]\n"
1815         ".word 0x6fa1e89f  // udot v31.4s, v4.16b, v1.4b[3]\n"
1816 
1817         "bne " GEMMLOWP_LABEL_LOOP "b\n"
1818 
1819         GEMMLOWP_LABEL_AFTER_LOOP ":\n"
1820 
1821         // Final iteration. v0 and v2 were already loaded, don't load
1822         // them again, don't read past the end of buffers.
1823         ".word 0x6f80e048  // udot v8.4s, v2.16b, v0.4b[0]\n"
1824         ".word 0x6fa0e049  // udot v9.4s, v2.16b, v0.4b[1]\n"
1825         "ld1 {v1.16b}, [%[rhs_ptr]], #16\n"  // Load second Rhs cell.
1826         ".word 0x6f80e84a  // udot v10.4s, v2.16b, v0.4b[2]\n"
1827         ".word 0x6fa0e84b  // udot v11.4s, v2.16b, v0.4b[3]\n"
1828         "ld1 {v3.16b}, [%[lhs_ptr]], #16\n"  // Load second Lhs cell.
1829         ".word 0x6f81e04c  // udot v12.4s, v2.16b, v1.4b[0]\n"
1830         ".word 0x6fa1e04d  // udot v13.4s, v2.16b, v1.4b[1]\n"
1831         "ld1 {v4.16b}, [%[lhs_ptr]], #16\n"  // Load third Lhs cell.
1832         ".word 0x6f81e84e  // udot v14.4s, v2.16b, v1.4b[2]\n"
1833         ".word 0x6fa1e84f  // udot v15.4s, v2.16b, v1.4b[3]\n"
1834         ".word 0x6f80e070  // udot v16.4s, v3.16b, v0.4b[0]\n"
1835         ".word 0x6fa0e071  // udot v17.4s, v3.16b, v0.4b[1]\n"
1836         ".word 0x6f80e872  // udot v18.4s, v3.16b, v0.4b[2]\n"
1837         ".word 0x6fa0e873  // udot v19.4s, v3.16b, v0.4b[3]\n"
1838         ".word 0x6f81e074  // udot v20.4s, v3.16b, v1.4b[0]\n"
1839         ".word 0x6fa1e075  // udot v21.4s, v3.16b, v1.4b[1]\n"
1840         ".word 0x6f81e876  // udot v22.4s, v3.16b, v1.4b[2]\n"
1841         ".word 0x6fa1e877  // udot v23.4s, v3.16b, v1.4b[3]\n"
1842         ".word 0x6f80e098  // udot v24.4s, v4.16b, v0.4b[0]\n"
1843         ".word 0x6fa0e099  // udot v25.4s, v4.16b, v0.4b[1]\n"
1844         ".word 0x6f80e89a  // udot v26.4s, v4.16b, v0.4b[2]\n"
1845         ".word 0x6fa0e89b  // udot v27.4s, v4.16b, v0.4b[3]\n"
1846         ".word 0x6f81e09c  // udot v28.4s, v4.16b, v1.4b[0]\n"
1847         ".word 0x6fa1e09d  // udot v29.4s, v4.16b, v1.4b[1]\n"
1848 
1849         // Loop.  Decrement loop index (depth) by 4 as udot processes 4
1850         // depth values.
1851         "subs %w[depth], %w[depth], #4\n"
1852         ".word 0x6f81e89e  // udot v30.4s, v4.16b, v1.4b[2]\n"
1853         ".word 0x6fa1e89f  // udot v31.4s, v4.16b, v1.4b[3]\n"
1854 
1855         // Store accumulators
1856         "mov x1, %[dst_ptr]\n"
1857         "mov x0, x1\n"
1858         "st1 {v8.16b}, [x0], #16\n"
1859         "st1 {v16.16b}, [x0], #16\n"
1860         "add x1, x1, %[dst_col_stride]\n"
1861         "st1 {v24.16b}, [x0]\n"
1862         "mov x0, x1\n"
1863         "st1 {v9.16b}, [x0], #16\n"
1864         "add x1, x1, %[dst_col_stride]\n"
1865         "st1 {v17.16b}, [x0], #16\n"
1866         "st1 {v25.16b}, [x0]\n"
1867         "mov x0, x1\n"
1868         "st1 {v10.16b}, [x0], #16\n"
1869         "add x1, x1, %[dst_col_stride]\n"
1870         "st1 {v18.16b}, [x0], #16\n"
1871         "st1 {v26.16b}, [x0]\n"
1872         "mov x0, x1\n"
1873         "st1 {v11.16b}, [x0], #16\n"
1874         "add x1, x1, %[dst_col_stride]\n"
1875         "st1 {v19.16b}, [x0], #16\n"
1876         "st1 {v27.16b}, [x0]\n"
1877         "mov x0, x1\n"
1878         "st1 {v12.16b}, [x0], #16\n"
1879         "add x1, x1, %[dst_col_stride]\n"
1880         "st1 {v20.16b}, [x0], #16\n"
1881         "st1 {v28.16b}, [x0]\n"
1882         "mov x0, x1\n"
1883         "st1 {v13.16b}, [x0], #16\n"
1884         "add x1, x1, %[dst_col_stride]\n"
1885         "st1 {v21.16b}, [x0], #16\n"
1886         "st1 {v29.16b}, [x0]\n"
1887         "mov x0, x1\n"
1888         "st1 {v14.16b}, [x0], #16\n"
1889         "add x1, x1, %[dst_col_stride]\n"
1890         "st1 {v22.16b}, [x0], #16\n"
1891         "st1 {v30.16b}, [x0]\n"
1892         "mov x0, x1\n"
1893         "st1 {v15.16b}, [x0], #16\n"
1894         "st1 {v23.16b}, [x0], #16\n"
1895         "st1 {v31.16b}, [x0]\n"
1896         :  // outputs
1897         [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
1898         [depth] "+r"(depth)
1899         :  // inputs
1900         [dst_ptr] "r"(dst_ptr), [dst_col_stride] "r"(dst_col_stride), [start_depth] "r"(start_depth)
1901         :  // clobbers
1902         "cc", "memory", "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
1903         "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
1904         "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
1905   }
1906 };
1907 #endif  // GEMMLOWP_DOTPROD_KERNEL
1908 
1909 #endif  // GEMMLOWP_NEON_64
1910 
1911 }  // namespace gemmlowp
1912 
1913 #endif  // GEMMLOWP_INTERNAL_KERNEL_NEON_H_
1914