Lines matching refs: int32_t
28 void zip_1x8_aligned(const std::uint8_t* source, std::int32_t count, in zip_1x8_aligned()
29 std::int32_t stride, std::uint8_t* destination, in zip_1x8_aligned()
30 std::int32_t multiplicative_offset, in zip_1x8_aligned()
31 std::int32_t additive_offset) { in zip_1x8_aligned()
62 void zip_1x8_1_aligned(const std::uint8_t* source, std::int32_t count, in zip_1x8_1_aligned()
63 std::int32_t stride, std::uint8_t* destination, in zip_1x8_1_aligned()
64 std::int32_t multiplicative_offset, in zip_1x8_1_aligned()
65 std::int32_t additive_offset) { in zip_1x8_1_aligned()
103 void zip_1x8_2_aligned(const std::uint8_t* source, std::int32_t count, in zip_1x8_2_aligned()
104 std::int32_t stride, std::uint8_t* destination, in zip_1x8_2_aligned()
105 std::int32_t multiplicative_offset, in zip_1x8_2_aligned()
106 std::int32_t additive_offset) { in zip_1x8_2_aligned()
144 void zip_1x8_3_aligned(const std::uint8_t* source, std::int32_t count, in zip_1x8_3_aligned()
145 std::int32_t stride, std::uint8_t* destination, in zip_1x8_3_aligned()
146 std::int32_t multiplicative_offset, in zip_1x8_3_aligned()
147 std::int32_t additive_offset) { in zip_1x8_3_aligned()
186 void zip_1x8_4_aligned(const std::uint8_t* source, std::int32_t count, in zip_1x8_4_aligned()
187 std::int32_t stride, std::uint8_t* destination, in zip_1x8_4_aligned()
188 std::int32_t multiplicative_offset, in zip_1x8_4_aligned()
189 std::int32_t additive_offset) { in zip_1x8_4_aligned()
227 void zip_1x8_5_aligned(const std::uint8_t* source, std::int32_t count, in zip_1x8_5_aligned()
228 std::int32_t stride, std::uint8_t* destination, in zip_1x8_5_aligned()
229 std::int32_t multiplicative_offset, in zip_1x8_5_aligned()
230 std::int32_t additive_offset) { in zip_1x8_5_aligned()
269 void zip_1x8_6_aligned(const std::uint8_t* source, std::int32_t count, in zip_1x8_6_aligned()
270 std::int32_t stride, std::uint8_t* destination, in zip_1x8_6_aligned()
271 std::int32_t multiplicative_offset, in zip_1x8_6_aligned()
272 std::int32_t additive_offset) { in zip_1x8_6_aligned()
311 void zip_1x8_7_aligned(const std::uint8_t* source, std::int32_t count, in zip_1x8_7_aligned()
312 std::int32_t stride, std::uint8_t* destination, in zip_1x8_7_aligned()
313 std::int32_t multiplicative_offset, in zip_1x8_7_aligned()
314 std::int32_t additive_offset) { in zip_1x8_7_aligned()
354 void zip_2x8_aligned(const std::uint8_t* source, std::int32_t count, in zip_2x8_aligned()
355 std::int32_t stride, std::uint8_t* destination, in zip_2x8_aligned()
356 std::int32_t multiplicative_offset, in zip_2x8_aligned()
357 std::int32_t additive_offset) { in zip_2x8_aligned()
395 void zip_2x8_1_aligned(const std::uint8_t* source, std::int32_t count, in zip_2x8_1_aligned()
396 std::int32_t stride, std::uint8_t* destination, in zip_2x8_1_aligned()
397 std::int32_t multiplicative_offset, in zip_2x8_1_aligned()
398 std::int32_t additive_offset) { in zip_2x8_1_aligned()
446 void zip_2x8_2_aligned(const std::uint8_t* source, std::int32_t count, in zip_2x8_2_aligned()
447 std::int32_t stride, std::uint8_t* destination, in zip_2x8_2_aligned()
448 std::int32_t multiplicative_offset, in zip_2x8_2_aligned()
449 std::int32_t additive_offset) { in zip_2x8_2_aligned()
497 void zip_2x8_3_aligned(const std::uint8_t* source, std::int32_t count, in zip_2x8_3_aligned()
498 std::int32_t stride, std::uint8_t* destination, in zip_2x8_3_aligned()
499 std::int32_t multiplicative_offset, in zip_2x8_3_aligned()
500 std::int32_t additive_offset) { in zip_2x8_3_aligned()
550 void zip_2x8_4_aligned(const std::uint8_t* source, std::int32_t count, in zip_2x8_4_aligned()
551 std::int32_t stride, std::uint8_t* destination, in zip_2x8_4_aligned()
552 std::int32_t multiplicative_offset, in zip_2x8_4_aligned()
553 std::int32_t additive_offset) { in zip_2x8_4_aligned()
601 void zip_2x8_5_aligned(const std::uint8_t* source, std::int32_t count, in zip_2x8_5_aligned()
602 std::int32_t stride, std::uint8_t* destination, in zip_2x8_5_aligned()
603 std::int32_t multiplicative_offset, in zip_2x8_5_aligned()
604 std::int32_t additive_offset) { in zip_2x8_5_aligned()
654 void zip_2x8_6_aligned(const std::uint8_t* source, std::int32_t count, in zip_2x8_6_aligned()
655 std::int32_t stride, std::uint8_t* destination, in zip_2x8_6_aligned()
656 std::int32_t multiplicative_offset, in zip_2x8_6_aligned()
657 std::int32_t additive_offset) { in zip_2x8_6_aligned()
707 void zip_2x8_7_aligned(const std::uint8_t* source, std::int32_t count, in zip_2x8_7_aligned()
708 std::int32_t stride, std::uint8_t* destination, in zip_2x8_7_aligned()
709 std::int32_t multiplicative_offset, in zip_2x8_7_aligned()
710 std::int32_t additive_offset) { in zip_2x8_7_aligned()
762 void zip_3x8_aligned(const std::uint8_t* source, std::int32_t count, in zip_3x8_aligned()
763 std::int32_t stride, std::uint8_t* destination, in zip_3x8_aligned()
764 std::int32_t multiplicative_offset, in zip_3x8_aligned()
765 std::int32_t additive_offset) { in zip_3x8_aligned()
811 void zip_3x8_1_aligned(const std::uint8_t* source, std::int32_t count, in zip_3x8_1_aligned()
812 std::int32_t stride, std::uint8_t* destination, in zip_3x8_1_aligned()
813 std::int32_t multiplicative_offset, in zip_3x8_1_aligned()
814 std::int32_t additive_offset) { in zip_3x8_1_aligned()
873 void zip_3x8_2_aligned(const std::uint8_t* source, std::int32_t count, in zip_3x8_2_aligned()
874 std::int32_t stride, std::uint8_t* destination, in zip_3x8_2_aligned()
875 std::int32_t multiplicative_offset, in zip_3x8_2_aligned()
876 std::int32_t additive_offset) { in zip_3x8_2_aligned()
935 void zip_3x8_3_aligned(const std::uint8_t* source, std::int32_t count, in zip_3x8_3_aligned()
936 std::int32_t stride, std::uint8_t* destination, in zip_3x8_3_aligned()
937 std::int32_t multiplicative_offset, in zip_3x8_3_aligned()
938 std::int32_t additive_offset) { in zip_3x8_3_aligned()
1000 void zip_3x8_4_aligned(const std::uint8_t* source, std::int32_t count, in zip_3x8_4_aligned()
1001 std::int32_t stride, std::uint8_t* destination, in zip_3x8_4_aligned()
1002 std::int32_t multiplicative_offset, in zip_3x8_4_aligned()
1003 std::int32_t additive_offset) { in zip_3x8_4_aligned()
1062 void zip_3x8_5_aligned(const std::uint8_t* source, std::int32_t count, in zip_3x8_5_aligned()
1063 std::int32_t stride, std::uint8_t* destination, in zip_3x8_5_aligned()
1064 std::int32_t multiplicative_offset, in zip_3x8_5_aligned()
1065 std::int32_t additive_offset) { in zip_3x8_5_aligned()
1127 void zip_3x8_6_aligned(const std::uint8_t* source, std::int32_t count, in zip_3x8_6_aligned()
1128 std::int32_t stride, std::uint8_t* destination, in zip_3x8_6_aligned()
1129 std::int32_t multiplicative_offset, in zip_3x8_6_aligned()
1130 std::int32_t additive_offset) { in zip_3x8_6_aligned()
1192 void zip_3x8_7_aligned(const std::uint8_t* source, std::int32_t count, in zip_3x8_7_aligned()
1193 std::int32_t stride, std::uint8_t* destination, in zip_3x8_7_aligned()
1194 std::int32_t multiplicative_offset, in zip_3x8_7_aligned()
1195 std::int32_t additive_offset) { in zip_3x8_7_aligned()
1260 void zip_1x8(const std::uint8_t* source, std::int32_t count, in zip_1x8()
1261 std::int32_t stride, std::uint8_t* destination, in zip_1x8()
1262 std::int32_t multiplicative_offset, std::int32_t additive_offset) { in zip_1x8()
1293 void zip_1x8_1(const std::uint8_t* source, std::int32_t count, in zip_1x8_1()
1294 std::int32_t stride, std::uint8_t* destination, in zip_1x8_1()
1295 std::int32_t multiplicative_offset, in zip_1x8_1()
1296 std::int32_t additive_offset) { in zip_1x8_1()
1334 void zip_1x8_2(const std::uint8_t* source, std::int32_t count, in zip_1x8_2()
1335 std::int32_t stride, std::uint8_t* destination, in zip_1x8_2()
1336 std::int32_t multiplicative_offset, in zip_1x8_2()
1337 std::int32_t additive_offset) { in zip_1x8_2()
1375 void zip_1x8_3(const std::uint8_t* source, std::int32_t count, in zip_1x8_3()
1376 std::int32_t stride, std::uint8_t* destination, in zip_1x8_3()
1377 std::int32_t multiplicative_offset, in zip_1x8_3()
1378 std::int32_t additive_offset) { in zip_1x8_3()
1417 void zip_1x8_4(const std::uint8_t* source, std::int32_t count, in zip_1x8_4()
1418 std::int32_t stride, std::uint8_t* destination, in zip_1x8_4()
1419 std::int32_t multiplicative_offset, in zip_1x8_4()
1420 std::int32_t additive_offset) { in zip_1x8_4()
1458 void zip_1x8_5(const std::uint8_t* source, std::int32_t count, in zip_1x8_5()
1459 std::int32_t stride, std::uint8_t* destination, in zip_1x8_5()
1460 std::int32_t multiplicative_offset, in zip_1x8_5()
1461 std::int32_t additive_offset) { in zip_1x8_5()
1500 void zip_1x8_6(const std::uint8_t* source, std::int32_t count, in zip_1x8_6()
1501 std::int32_t stride, std::uint8_t* destination, in zip_1x8_6()
1502 std::int32_t multiplicative_offset, in zip_1x8_6()
1503 std::int32_t additive_offset) { in zip_1x8_6()
1542 void zip_1x8_7(const std::uint8_t* source, std::int32_t count, in zip_1x8_7()
1543 std::int32_t stride, std::uint8_t* destination, in zip_1x8_7()
1544 std::int32_t multiplicative_offset, in zip_1x8_7()
1545 std::int32_t additive_offset) { in zip_1x8_7()
1585 void zip_2x8(const std::uint8_t* source, std::int32_t count, in zip_2x8()
1586 std::int32_t stride, std::uint8_t* destination, in zip_2x8()
1587 std::int32_t multiplicative_offset, std::int32_t additive_offset) { in zip_2x8()
1625 void zip_2x8_1(const std::uint8_t* source, std::int32_t count, in zip_2x8_1()
1626 std::int32_t stride, std::uint8_t* destination, in zip_2x8_1()
1627 std::int32_t multiplicative_offset, in zip_2x8_1()
1628 std::int32_t additive_offset) { in zip_2x8_1()
1676 void zip_2x8_2(const std::uint8_t* source, std::int32_t count, in zip_2x8_2()
1677 std::int32_t stride, std::uint8_t* destination, in zip_2x8_2()
1678 std::int32_t multiplicative_offset, in zip_2x8_2()
1679 std::int32_t additive_offset) { in zip_2x8_2()
1727 void zip_2x8_3(const std::uint8_t* source, std::int32_t count, in zip_2x8_3()
1728 std::int32_t stride, std::uint8_t* destination, in zip_2x8_3()
1729 std::int32_t multiplicative_offset, in zip_2x8_3()
1730 std::int32_t additive_offset) { in zip_2x8_3()
1780 void zip_2x8_4(const std::uint8_t* source, std::int32_t count, in zip_2x8_4()
1781 std::int32_t stride, std::uint8_t* destination, in zip_2x8_4()
1782 std::int32_t multiplicative_offset, in zip_2x8_4()
1783 std::int32_t additive_offset) { in zip_2x8_4()
1831 void zip_2x8_5(const std::uint8_t* source, std::int32_t count, in zip_2x8_5()
1832 std::int32_t stride, std::uint8_t* destination, in zip_2x8_5()
1833 std::int32_t multiplicative_offset, in zip_2x8_5()
1834 std::int32_t additive_offset) { in zip_2x8_5()
1884 void zip_2x8_6(const std::uint8_t* source, std::int32_t count, in zip_2x8_6()
1885 std::int32_t stride, std::uint8_t* destination, in zip_2x8_6()
1886 std::int32_t multiplicative_offset, in zip_2x8_6()
1887 std::int32_t additive_offset) { in zip_2x8_6()
1937 void zip_2x8_7(const std::uint8_t* source, std::int32_t count, in zip_2x8_7()
1938 std::int32_t stride, std::uint8_t* destination, in zip_2x8_7()
1939 std::int32_t multiplicative_offset, in zip_2x8_7()
1940 std::int32_t additive_offset) { in zip_2x8_7()
1992 void zip_3x8(const std::uint8_t* source, std::int32_t count, in zip_3x8()
1993 std::int32_t stride, std::uint8_t* destination, in zip_3x8()
1994 std::int32_t multiplicative_offset, std::int32_t additive_offset) { in zip_3x8()
2040 void zip_3x8_1(const std::uint8_t* source, std::int32_t count, in zip_3x8_1()
2041 std::int32_t stride, std::uint8_t* destination, in zip_3x8_1()
2042 std::int32_t multiplicative_offset, in zip_3x8_1()
2043 std::int32_t additive_offset) { in zip_3x8_1()
2102 void zip_3x8_2(const std::uint8_t* source, std::int32_t count, in zip_3x8_2()
2103 std::int32_t stride, std::uint8_t* destination, in zip_3x8_2()
2104 std::int32_t multiplicative_offset, in zip_3x8_2()
2105 std::int32_t additive_offset) { in zip_3x8_2()
2164 void zip_3x8_3(const std::uint8_t* source, std::int32_t count, in zip_3x8_3()
2165 std::int32_t stride, std::uint8_t* destination, in zip_3x8_3()
2166 std::int32_t multiplicative_offset, in zip_3x8_3()
2167 std::int32_t additive_offset) { in zip_3x8_3()
2229 void zip_3x8_4(const std::uint8_t* source, std::int32_t count, in zip_3x8_4()
2230 std::int32_t stride, std::uint8_t* destination, in zip_3x8_4()
2231 std::int32_t multiplicative_offset, in zip_3x8_4()
2232 std::int32_t additive_offset) { in zip_3x8_4()
2291 void zip_3x8_5(const std::uint8_t* source, std::int32_t count, in zip_3x8_5()
2292 std::int32_t stride, std::uint8_t* destination, in zip_3x8_5()
2293 std::int32_t multiplicative_offset, in zip_3x8_5()
2294 std::int32_t additive_offset) { in zip_3x8_5()
2356 void zip_3x8_6(const std::uint8_t* source, std::int32_t count, in zip_3x8_6()
2357 std::int32_t stride, std::uint8_t* destination, in zip_3x8_6()
2358 std::int32_t multiplicative_offset, in zip_3x8_6()
2359 std::int32_t additive_offset) { in zip_3x8_6()
2421 void zip_3x8_7(const std::uint8_t* source, std::int32_t count, in zip_3x8_7()
2422 std::int32_t stride, std::uint8_t* destination, in zip_3x8_7()
2423 std::int32_t multiplicative_offset, in zip_3x8_7()
2424 std::int32_t additive_offset) { in zip_3x8_7()
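
Taken together, the zip_1x8*/zip_2x8*/zip_3x8* signatures above all share the same parameter list (source, count, stride, destination, multiplicative_offset, additive_offset), which suggests a packing step that copies rows of uint8 data and carries a per-row correction term built from the two offsets. The following is only a minimal scalar sketch of that idea for a single row, with hypothetical names; it ignores the stride/multi-row handling and is not the library's actual NEON implementation.

  #include <cstdint>
  #include <cstring>

  // Hypothetical scalar "zip" for one row: copy `count` bytes and append a
  // per-row sum scaled by multiplicative_offset plus additive_offset
  // (assumption about how the offsets are used, based on the signatures).
  void zip_1xN_reference(const std::uint8_t* source, std::int32_t count,
                         std::uint8_t* destination,
                         std::int32_t multiplicative_offset,
                         std::int32_t additive_offset) {
    std::int32_t sum = 0;
    for (std::int32_t i = 0; i < count; ++i) {
      destination[i] = source[i];
      sum += source[i];
    }
    // Store the correction term as int32 right after the packed bytes.
    const std::int32_t correction = sum * multiplicative_offset + additive_offset;
    std::memcpy(destination + count, &correction, sizeof(correction));
  }
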
2491 std::int32_t count, std::int32_t* result, in mul_1x8_1x8_int32_rhsadd()
2492 std::int32_t result_stride) { in mul_1x8_1x8_int32_rhsadd()
2536 std::int32_t count, std::int32_t* result, in mul_1x8_2x8_int32_rhsadd()
2537 std::int32_t result_stride) { in mul_1x8_2x8_int32_rhsadd()
2586 std::int32_t count, std::int32_t* result, in mul_1x8_3x8_int32_rhsadd()
2587 std::int32_t result_stride) { in mul_1x8_3x8_int32_rhsadd()
2646 std::int32_t count, std::int32_t* result, in mul_2x8_1x8_int32_rhsadd()
2647 std::int32_t result_stride) { in mul_2x8_1x8_int32_rhsadd()
2699 std::int32_t count, std::int32_t* result, in mul_2x8_2x8_int32_rhsadd()
2700 std::int32_t result_stride) { in mul_2x8_2x8_int32_rhsadd()
2761 std::int32_t count, std::int32_t* result, in mul_2x8_3x8_int32_rhsadd()
2762 std::int32_t result_stride) { in mul_2x8_3x8_int32_rhsadd()
2840 std::int32_t count, std::int32_t* result, in mul_3x8_1x8_int32_rhsadd()
2841 std::int32_t result_stride) { in mul_3x8_1x8_int32_rhsadd()
2900 std::int32_t count, std::int32_t* result, in mul_3x8_2x8_int32_rhsadd()
2901 std::int32_t result_stride) { in mul_3x8_2x8_int32_rhsadd()
2973 std::int32_t count, std::int32_t* result, in mul_3x8_3x8_int32_rhsadd()
2974 std::int32_t result_stride) { in mul_3x8_3x8_int32_rhsadd()
3071 std::int32_t count, in mul_1x8_1x8_int32_lhsadd_rhsadd()
3072 std::int32_t* result, in mul_1x8_1x8_int32_lhsadd_rhsadd()
3073 std::int32_t result_stride) { in mul_1x8_1x8_int32_lhsadd_rhsadd()
3122 std::int32_t count, in mul_1x8_2x8_int32_lhsadd_rhsadd()
3123 std::int32_t* result, in mul_1x8_2x8_int32_lhsadd_rhsadd()
3124 std::int32_t result_stride) { in mul_1x8_2x8_int32_lhsadd_rhsadd()
3178 std::int32_t count, in mul_1x8_3x8_int32_lhsadd_rhsadd()
3179 std::int32_t* result, in mul_1x8_3x8_int32_lhsadd_rhsadd()
3180 std::int32_t result_stride) { in mul_1x8_3x8_int32_lhsadd_rhsadd()
3244 std::int32_t count, in mul_2x8_1x8_int32_lhsadd_rhsadd()
3245 std::int32_t* result, in mul_2x8_1x8_int32_lhsadd_rhsadd()
3246 std::int32_t result_stride) { in mul_2x8_1x8_int32_lhsadd_rhsadd()
3305 std::int32_t count, in mul_2x8_2x8_int32_lhsadd_rhsadd()
3306 std::int32_t* result, in mul_2x8_2x8_int32_lhsadd_rhsadd()
3307 std::int32_t result_stride) { in mul_2x8_2x8_int32_lhsadd_rhsadd()
3375 std::int32_t count, in mul_2x8_3x8_int32_lhsadd_rhsadd()
3376 std::int32_t* result, in mul_2x8_3x8_int32_lhsadd_rhsadd()
3377 std::int32_t result_stride) { in mul_2x8_3x8_int32_lhsadd_rhsadd()
3463 std::int32_t count, in mul_3x8_1x8_int32_lhsadd_rhsadd()
3464 std::int32_t* result, in mul_3x8_1x8_int32_lhsadd_rhsadd()
3465 std::int32_t result_stride) { in mul_3x8_1x8_int32_lhsadd_rhsadd()
3533 std::int32_t count, in mul_3x8_2x8_int32_lhsadd_rhsadd()
3534 std::int32_t* result, in mul_3x8_2x8_int32_lhsadd_rhsadd()
3535 std::int32_t result_stride) { in mul_3x8_2x8_int32_lhsadd_rhsadd()
3617 std::int32_t count, in mul_3x8_3x8_int32_lhsadd_rhsadd()
3618 std::int32_t* result, in mul_3x8_3x8_int32_lhsadd_rhsadd()
3619 std::int32_t result_stride) { in mul_3x8_3x8_int32_lhsadd_rhsadd()
3725 std::int32_t count, float* result, in mul_1x8_1x8_float_lhsadd_rhsadd()
3726 std::int32_t result_stride, in mul_1x8_1x8_float_lhsadd_rhsadd()
3782 std::int32_t count, float* result, in mul_1x8_2x8_float_lhsadd_rhsadd()
3783 std::int32_t result_stride, in mul_1x8_2x8_float_lhsadd_rhsadd()
3844 std::int32_t count, float* result, in mul_1x8_3x8_float_lhsadd_rhsadd()
3845 std::int32_t result_stride, in mul_1x8_3x8_float_lhsadd_rhsadd()
3916 std::int32_t count, float* result, in mul_2x8_1x8_float_lhsadd_rhsadd()
3917 std::int32_t result_stride, in mul_2x8_1x8_float_lhsadd_rhsadd()
3985 std::int32_t count, float* result, in mul_2x8_2x8_float_lhsadd_rhsadd()
3986 std::int32_t result_stride, in mul_2x8_2x8_float_lhsadd_rhsadd()
4063 std::int32_t count, float* result, in mul_2x8_3x8_float_lhsadd_rhsadd()
4064 std::int32_t result_stride, in mul_2x8_3x8_float_lhsadd_rhsadd()
4159 std::int32_t count, float* result, in mul_3x8_1x8_float_lhsadd_rhsadd()
4160 std::int32_t result_stride, in mul_3x8_1x8_float_lhsadd_rhsadd()
4239 std::int32_t count, float* result, in mul_3x8_2x8_float_lhsadd_rhsadd()
4240 std::int32_t result_stride, in mul_3x8_2x8_float_lhsadd_rhsadd()
4333 std::int32_t count, float* result, in mul_3x8_3x8_float_lhsadd_rhsadd()
4334 std::int32_t result_stride, in mul_3x8_3x8_float_lhsadd_rhsadd()
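
The mul_* entries above vary only in block shape (1x8/2x8/3x8), result type (int32 or float), and whether lhs/rhs correction terms are added (rhsadd, lhsadd_rhsadd). A plausible scalar reading of the int32 rhsadd variant is sketched below; all names and the treatment of result_stride as an element stride are assumptions for illustration, not the kernel's real layout.

  #include <cstdint>

  // Hypothetical scalar view of a mul_MxK_NxK_int32_rhsadd-style kernel:
  // dot products over `count` depth elements, plus a per-column rhs
  // correction term produced by the zip step (assumed).
  void mul_block_int32_rhsadd_reference(const std::uint8_t* lhs,
                                        const std::uint8_t* rhs,
                                        std::int32_t rows, std::int32_t cols,
                                        std::int32_t count,
                                        const std::int32_t* rhs_corrections,
                                        std::int32_t* result,
                                        std::int32_t result_stride) {
    for (std::int32_t r = 0; r < rows; ++r) {
      for (std::int32_t c = 0; c < cols; ++c) {
        std::int32_t acc = 0;
        for (std::int32_t d = 0; d < count; ++d) {
          acc += static_cast<std::int32_t>(lhs[r * count + d]) *
                 static_cast<std::int32_t>(rhs[c * count + d]);
        }
        result[r * result_stride + c] = acc + rhs_corrections[c];
      }
    }
  }
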
4449 void qnt_1x8_aligned(const std::int32_t* source, std::int32_t count, in qnt_1x8_aligned()
4450 std::int32_t stride, const std::int32_t* offsets, in qnt_1x8_aligned()
4451 std::uint8_t* destination, std::int32_t destination_stride, in qnt_1x8_aligned()
4452 std::int32_t multiplicative_offset, in qnt_1x8_aligned()
4453 std::int32_t rounding_offset, std::int32_t shift) { in qnt_1x8_aligned()
4489 void qnt_1x8_1_aligned(const std::int32_t* source, std::int32_t count, in qnt_1x8_1_aligned()
4490 std::int32_t stride, const std::int32_t* offsets, in qnt_1x8_1_aligned()
4492 std::int32_t destination_stride, in qnt_1x8_1_aligned()
4493 std::int32_t multiplicative_offset, in qnt_1x8_1_aligned()
4494 std::int32_t rounding_offset, std::int32_t shift) { in qnt_1x8_1_aligned()
4541 void qnt_1x8_2_aligned(const std::int32_t* source, std::int32_t count, in qnt_1x8_2_aligned()
4542 std::int32_t stride, const std::int32_t* offsets, in qnt_1x8_2_aligned()
4544 std::int32_t destination_stride, in qnt_1x8_2_aligned()
4545 std::int32_t multiplicative_offset, in qnt_1x8_2_aligned()
4546 std::int32_t rounding_offset, std::int32_t shift) { in qnt_1x8_2_aligned()
4593 void qnt_1x8_3_aligned(const std::int32_t* source, std::int32_t count, in qnt_1x8_3_aligned()
4594 std::int32_t stride, const std::int32_t* offsets, in qnt_1x8_3_aligned()
4596 std::int32_t destination_stride, in qnt_1x8_3_aligned()
4597 std::int32_t multiplicative_offset, in qnt_1x8_3_aligned()
4598 std::int32_t rounding_offset, std::int32_t shift) { in qnt_1x8_3_aligned()
4647 void qnt_1x8_4_aligned(const std::int32_t* source, std::int32_t count, in qnt_1x8_4_aligned()
4648 std::int32_t stride, const std::int32_t* offsets, in qnt_1x8_4_aligned()
4650 std::int32_t destination_stride, in qnt_1x8_4_aligned()
4651 std::int32_t multiplicative_offset, in qnt_1x8_4_aligned()
4652 std::int32_t rounding_offset, std::int32_t shift) { in qnt_1x8_4_aligned()
4699 void qnt_1x8_5_aligned(const std::int32_t* source, std::int32_t count, in qnt_1x8_5_aligned()
4700 std::int32_t stride, const std::int32_t* offsets, in qnt_1x8_5_aligned()
4702 std::int32_t destination_stride, in qnt_1x8_5_aligned()
4703 std::int32_t multiplicative_offset, in qnt_1x8_5_aligned()
4704 std::int32_t rounding_offset, std::int32_t shift) { in qnt_1x8_5_aligned()
4758 void qnt_1x8_6_aligned(const std::int32_t* source, std::int32_t count, in qnt_1x8_6_aligned()
4759 std::int32_t stride, const std::int32_t* offsets, in qnt_1x8_6_aligned()
4761 std::int32_t destination_stride, in qnt_1x8_6_aligned()
4762 std::int32_t multiplicative_offset, in qnt_1x8_6_aligned()
4763 std::int32_t rounding_offset, std::int32_t shift) { in qnt_1x8_6_aligned()
4816 void qnt_1x8_7_aligned(const std::int32_t* source, std::int32_t count, in qnt_1x8_7_aligned()
4817 std::int32_t stride, const std::int32_t* offsets, in qnt_1x8_7_aligned()
4819 std::int32_t destination_stride, in qnt_1x8_7_aligned()
4820 std::int32_t multiplicative_offset, in qnt_1x8_7_aligned()
4821 std::int32_t rounding_offset, std::int32_t shift) { in qnt_1x8_7_aligned()
4876 void qnt_2x8_aligned(const std::int32_t* source, std::int32_t count, in qnt_2x8_aligned()
4877 std::int32_t stride, const std::int32_t* offsets, in qnt_2x8_aligned()
4878 std::uint8_t* destination, std::int32_t destination_stride, in qnt_2x8_aligned()
4879 std::int32_t multiplicative_offset, in qnt_2x8_aligned()
4880 std::int32_t rounding_offset, std::int32_t shift) { in qnt_2x8_aligned()
4934 void qnt_2x8_1_aligned(const std::int32_t* source, std::int32_t count, in qnt_2x8_1_aligned()
4935 std::int32_t stride, const std::int32_t* offsets, in qnt_2x8_1_aligned()
4937 std::int32_t destination_stride, in qnt_2x8_1_aligned()
4938 std::int32_t multiplicative_offset, in qnt_2x8_1_aligned()
4939 std::int32_t rounding_offset, std::int32_t shift) { in qnt_2x8_1_aligned()
5012 void qnt_2x8_2_aligned(const std::int32_t* source, std::int32_t count, in qnt_2x8_2_aligned()
5013 std::int32_t stride, const std::int32_t* offsets, in qnt_2x8_2_aligned()
5015 std::int32_t destination_stride, in qnt_2x8_2_aligned()
5016 std::int32_t multiplicative_offset, in qnt_2x8_2_aligned()
5017 std::int32_t rounding_offset, std::int32_t shift) { in qnt_2x8_2_aligned()
5090 void qnt_2x8_3_aligned(const std::int32_t* source, std::int32_t count, in qnt_2x8_3_aligned()
5091 std::int32_t stride, const std::int32_t* offsets, in qnt_2x8_3_aligned()
5093 std::int32_t destination_stride, in qnt_2x8_3_aligned()
5094 std::int32_t multiplicative_offset, in qnt_2x8_3_aligned()
5095 std::int32_t rounding_offset, std::int32_t shift) { in qnt_2x8_3_aligned()
5172 void qnt_2x8_4_aligned(const std::int32_t* source, std::int32_t count, in qnt_2x8_4_aligned()
5173 std::int32_t stride, const std::int32_t* offsets, in qnt_2x8_4_aligned()
5175 std::int32_t destination_stride, in qnt_2x8_4_aligned()
5176 std::int32_t multiplicative_offset, in qnt_2x8_4_aligned()
5177 std::int32_t rounding_offset, std::int32_t shift) { in qnt_2x8_4_aligned()
5250 void qnt_2x8_5_aligned(const std::int32_t* source, std::int32_t count, in qnt_2x8_5_aligned()
5251 std::int32_t stride, const std::int32_t* offsets, in qnt_2x8_5_aligned()
5253 std::int32_t destination_stride, in qnt_2x8_5_aligned()
5254 std::int32_t multiplicative_offset, in qnt_2x8_5_aligned()
5255 std::int32_t rounding_offset, std::int32_t shift) { in qnt_2x8_5_aligned()
5342 void qnt_2x8_6_aligned(const std::int32_t* source, std::int32_t count, in qnt_2x8_6_aligned()
5343 std::int32_t stride, const std::int32_t* offsets, in qnt_2x8_6_aligned()
5345 std::int32_t destination_stride, in qnt_2x8_6_aligned()
5346 std::int32_t multiplicative_offset, in qnt_2x8_6_aligned()
5347 std::int32_t rounding_offset, std::int32_t shift) { in qnt_2x8_6_aligned()
5432 void qnt_2x8_7_aligned(const std::int32_t* source, std::int32_t count, in qnt_2x8_7_aligned()
5433 std::int32_t stride, const std::int32_t* offsets, in qnt_2x8_7_aligned()
5435 std::int32_t destination_stride, in qnt_2x8_7_aligned()
5436 std::int32_t multiplicative_offset, in qnt_2x8_7_aligned()
5437 std::int32_t rounding_offset, std::int32_t shift) { in qnt_2x8_7_aligned()
5526 void qnt_3x8_aligned(const std::int32_t* source, std::int32_t count, in qnt_3x8_aligned()
5527 std::int32_t stride, const std::int32_t* offsets, in qnt_3x8_aligned()
5528 std::uint8_t* destination, std::int32_t destination_stride, in qnt_3x8_aligned()
5529 std::int32_t multiplicative_offset, in qnt_3x8_aligned()
5530 std::int32_t rounding_offset, std::int32_t shift) { in qnt_3x8_aligned()
5602 void qnt_3x8_1_aligned(const std::int32_t* source, std::int32_t count, in qnt_3x8_1_aligned()
5603 std::int32_t stride, const std::int32_t* offsets, in qnt_3x8_1_aligned()
5605 std::int32_t destination_stride, in qnt_3x8_1_aligned()
5606 std::int32_t multiplicative_offset, in qnt_3x8_1_aligned()
5607 std::int32_t rounding_offset, std::int32_t shift) { in qnt_3x8_1_aligned()
5706 void qnt_3x8_2_aligned(const std::int32_t* source, std::int32_t count, in qnt_3x8_2_aligned()
5707 std::int32_t stride, const std::int32_t* offsets, in qnt_3x8_2_aligned()
5709 std::int32_t destination_stride, in qnt_3x8_2_aligned()
5710 std::int32_t multiplicative_offset, in qnt_3x8_2_aligned()
5711 std::int32_t rounding_offset, std::int32_t shift) { in qnt_3x8_2_aligned()
5810 void qnt_3x8_3_aligned(const std::int32_t* source, std::int32_t count, in qnt_3x8_3_aligned()
5811 std::int32_t stride, const std::int32_t* offsets, in qnt_3x8_3_aligned()
5813 std::int32_t destination_stride, in qnt_3x8_3_aligned()
5814 std::int32_t multiplicative_offset, in qnt_3x8_3_aligned()
5815 std::int32_t rounding_offset, std::int32_t shift) { in qnt_3x8_3_aligned()
5920 void qnt_3x8_4_aligned(const std::int32_t* source, std::int32_t count, in qnt_3x8_4_aligned()
5921 std::int32_t stride, const std::int32_t* offsets, in qnt_3x8_4_aligned()
5923 std::int32_t destination_stride, in qnt_3x8_4_aligned()
5924 std::int32_t multiplicative_offset, in qnt_3x8_4_aligned()
5925 std::int32_t rounding_offset, std::int32_t shift) { in qnt_3x8_4_aligned()
6024 void qnt_3x8_5_aligned(const std::int32_t* source, std::int32_t count, in qnt_3x8_5_aligned()
6025 std::int32_t stride, const std::int32_t* offsets, in qnt_3x8_5_aligned()
6027 std::int32_t destination_stride, in qnt_3x8_5_aligned()
6028 std::int32_t multiplicative_offset, in qnt_3x8_5_aligned()
6029 std::int32_t rounding_offset, std::int32_t shift) { in qnt_3x8_5_aligned()
6149 void qnt_3x8_6_aligned(const std::int32_t* source, std::int32_t count, in qnt_3x8_6_aligned()
6150 std::int32_t stride, const std::int32_t* offsets, in qnt_3x8_6_aligned()
6152 std::int32_t destination_stride, in qnt_3x8_6_aligned()
6153 std::int32_t multiplicative_offset, in qnt_3x8_6_aligned()
6154 std::int32_t rounding_offset, std::int32_t shift) { in qnt_3x8_6_aligned()
6271 void qnt_3x8_7_aligned(const std::int32_t* source, std::int32_t count, in qnt_3x8_7_aligned()
6272 std::int32_t stride, const std::int32_t* offsets, in qnt_3x8_7_aligned()
6274 std::int32_t destination_stride, in qnt_3x8_7_aligned()
6275 std::int32_t multiplicative_offset, in qnt_3x8_7_aligned()
6276 std::int32_t rounding_offset, std::int32_t shift) { in qnt_3x8_7_aligned()
6399 void qnt_1x8(const std::int32_t* source, std::int32_t count, in qnt_1x8()
6400 std::int32_t stride, const std::int32_t* offsets, in qnt_1x8()
6401 std::uint8_t* destination, std::int32_t destination_stride, in qnt_1x8()
6402 std::int32_t multiplicative_offset, std::int32_t rounding_offset, in qnt_1x8()
6403 std::int32_t shift) { in qnt_1x8()
6439 void qnt_1x8_1(const std::int32_t* source, std::int32_t count, in qnt_1x8_1()
6440 std::int32_t stride, const std::int32_t* offsets, in qnt_1x8_1()
6441 std::uint8_t* destination, std::int32_t destination_stride, in qnt_1x8_1()
6442 std::int32_t multiplicative_offset, std::int32_t rounding_offset, in qnt_1x8_1()
6443 std::int32_t shift) { in qnt_1x8_1()
6490 void qnt_1x8_2(const std::int32_t* source, std::int32_t count, in qnt_1x8_2()
6491 std::int32_t stride, const std::int32_t* offsets, in qnt_1x8_2()
6492 std::uint8_t* destination, std::int32_t destination_stride, in qnt_1x8_2()
6493 std::int32_t multiplicative_offset, std::int32_t rounding_offset, in qnt_1x8_2()
6494 std::int32_t shift) { in qnt_1x8_2()
6541 void qnt_1x8_3(const std::int32_t* source, std::int32_t count, in qnt_1x8_3()
6542 std::int32_t stride, const std::int32_t* offsets, in qnt_1x8_3()
6543 std::uint8_t* destination, std::int32_t destination_stride, in qnt_1x8_3()
6544 std::int32_t multiplicative_offset, std::int32_t rounding_offset, in qnt_1x8_3()
6545 std::int32_t shift) { in qnt_1x8_3()
6594 void qnt_1x8_4(const std::int32_t* source, std::int32_t count, in qnt_1x8_4()
6595 std::int32_t stride, const std::int32_t* offsets, in qnt_1x8_4()
6596 std::uint8_t* destination, std::int32_t destination_stride, in qnt_1x8_4()
6597 std::int32_t multiplicative_offset, std::int32_t rounding_offset, in qnt_1x8_4()
6598 std::int32_t shift) { in qnt_1x8_4()
6645 void qnt_1x8_5(const std::int32_t* source, std::int32_t count, in qnt_1x8_5()
6646 std::int32_t stride, const std::int32_t* offsets, in qnt_1x8_5()
6647 std::uint8_t* destination, std::int32_t destination_stride, in qnt_1x8_5()
6648 std::int32_t multiplicative_offset, std::int32_t rounding_offset, in qnt_1x8_5()
6649 std::int32_t shift) { in qnt_1x8_5()
6703 void qnt_1x8_6(const std::int32_t* source, std::int32_t count, in qnt_1x8_6()
6704 std::int32_t stride, const std::int32_t* offsets, in qnt_1x8_6()
6705 std::uint8_t* destination, std::int32_t destination_stride, in qnt_1x8_6()
6706 std::int32_t multiplicative_offset, std::int32_t rounding_offset, in qnt_1x8_6()
6707 std::int32_t shift) { in qnt_1x8_6()
6760 void qnt_1x8_7(const std::int32_t* source, std::int32_t count, in qnt_1x8_7()
6761 std::int32_t stride, const std::int32_t* offsets, in qnt_1x8_7()
6762 std::uint8_t* destination, std::int32_t destination_stride, in qnt_1x8_7()
6763 std::int32_t multiplicative_offset, std::int32_t rounding_offset, in qnt_1x8_7()
6764 std::int32_t shift) { in qnt_1x8_7()
6819 void qnt_2x8(const std::int32_t* source, std::int32_t count, in qnt_2x8()
6820 std::int32_t stride, const std::int32_t* offsets, in qnt_2x8()
6821 std::uint8_t* destination, std::int32_t destination_stride, in qnt_2x8()
6822 std::int32_t multiplicative_offset, std::int32_t rounding_offset, in qnt_2x8()
6823 std::int32_t shift) { in qnt_2x8()
6877 void qnt_2x8_1(const std::int32_t* source, std::int32_t count, in qnt_2x8_1()
6878 std::int32_t stride, const std::int32_t* offsets, in qnt_2x8_1()
6879 std::uint8_t* destination, std::int32_t destination_stride, in qnt_2x8_1()
6880 std::int32_t multiplicative_offset, std::int32_t rounding_offset, in qnt_2x8_1()
6881 std::int32_t shift) { in qnt_2x8_1()
6954 void qnt_2x8_2(const std::int32_t* source, std::int32_t count, in qnt_2x8_2()
6955 std::int32_t stride, const std::int32_t* offsets, in qnt_2x8_2()
6956 std::uint8_t* destination, std::int32_t destination_stride, in qnt_2x8_2()
6957 std::int32_t multiplicative_offset, std::int32_t rounding_offset, in qnt_2x8_2()
6958 std::int32_t shift) { in qnt_2x8_2()
7031 void qnt_2x8_3(const std::int32_t* source, std::int32_t count, in qnt_2x8_3()
7032 std::int32_t stride, const std::int32_t* offsets, in qnt_2x8_3()
7033 std::uint8_t* destination, std::int32_t destination_stride, in qnt_2x8_3()
7034 std::int32_t multiplicative_offset, std::int32_t rounding_offset, in qnt_2x8_3()
7035 std::int32_t shift) { in qnt_2x8_3()
7112 void qnt_2x8_4(const std::int32_t* source, std::int32_t count, in qnt_2x8_4()
7113 std::int32_t stride, const std::int32_t* offsets, in qnt_2x8_4()
7114 std::uint8_t* destination, std::int32_t destination_stride, in qnt_2x8_4()
7115 std::int32_t multiplicative_offset, std::int32_t rounding_offset, in qnt_2x8_4()
7116 std::int32_t shift) { in qnt_2x8_4()
7189 void qnt_2x8_5(const std::int32_t* source, std::int32_t count, in qnt_2x8_5()
7190 std::int32_t stride, const std::int32_t* offsets, in qnt_2x8_5()
7191 std::uint8_t* destination, std::int32_t destination_stride, in qnt_2x8_5()
7192 std::int32_t multiplicative_offset, std::int32_t rounding_offset, in qnt_2x8_5()
7193 std::int32_t shift) { in qnt_2x8_5()
7280 void qnt_2x8_6(const std::int32_t* source, std::int32_t count, in qnt_2x8_6()
7281 std::int32_t stride, const std::int32_t* offsets, in qnt_2x8_6()
7282 std::uint8_t* destination, std::int32_t destination_stride, in qnt_2x8_6()
7283 std::int32_t multiplicative_offset, std::int32_t rounding_offset, in qnt_2x8_6()
7284 std::int32_t shift) { in qnt_2x8_6()
7369 void qnt_2x8_7(const std::int32_t* source, std::int32_t count, in qnt_2x8_7()
7370 std::int32_t stride, const std::int32_t* offsets, in qnt_2x8_7()
7371 std::uint8_t* destination, std::int32_t destination_stride, in qnt_2x8_7()
7372 std::int32_t multiplicative_offset, std::int32_t rounding_offset, in qnt_2x8_7()
7373 std::int32_t shift) { in qnt_2x8_7()
7462 void qnt_3x8(const std::int32_t* source, std::int32_t count, in qnt_3x8()
7463 std::int32_t stride, const std::int32_t* offsets, in qnt_3x8()
7464 std::uint8_t* destination, std::int32_t destination_stride, in qnt_3x8()
7465 std::int32_t multiplicative_offset, std::int32_t rounding_offset, in qnt_3x8()
7466 std::int32_t shift) { in qnt_3x8()
7538 void qnt_3x8_1(const std::int32_t* source, std::int32_t count, in qnt_3x8_1()
7539 std::int32_t stride, const std::int32_t* offsets, in qnt_3x8_1()
7540 std::uint8_t* destination, std::int32_t destination_stride, in qnt_3x8_1()
7541 std::int32_t multiplicative_offset, std::int32_t rounding_offset, in qnt_3x8_1()
7542 std::int32_t shift) { in qnt_3x8_1()
7641 void qnt_3x8_2(const std::int32_t* source, std::int32_t count, in qnt_3x8_2()
7642 std::int32_t stride, const std::int32_t* offsets, in qnt_3x8_2()
7643 std::uint8_t* destination, std::int32_t destination_stride, in qnt_3x8_2()
7644 std::int32_t multiplicative_offset, std::int32_t rounding_offset, in qnt_3x8_2()
7645 std::int32_t shift) { in qnt_3x8_2()
7744 void qnt_3x8_3(const std::int32_t* source, std::int32_t count, in qnt_3x8_3()
7745 std::int32_t stride, const std::int32_t* offsets, in qnt_3x8_3()
7746 std::uint8_t* destination, std::int32_t destination_stride, in qnt_3x8_3()
7747 std::int32_t multiplicative_offset, std::int32_t rounding_offset, in qnt_3x8_3()
7748 std::int32_t shift) { in qnt_3x8_3()
7853 void qnt_3x8_4(const std::int32_t* source, std::int32_t count, in qnt_3x8_4()
7854 std::int32_t stride, const std::int32_t* offsets, in qnt_3x8_4()
7855 std::uint8_t* destination, std::int32_t destination_stride, in qnt_3x8_4()
7856 std::int32_t multiplicative_offset, std::int32_t rounding_offset, in qnt_3x8_4()
7857 std::int32_t shift) { in qnt_3x8_4()
7956 void qnt_3x8_5(const std::int32_t* source, std::int32_t count, in qnt_3x8_5()
7957 std::int32_t stride, const std::int32_t* offsets, in qnt_3x8_5()
7958 std::uint8_t* destination, std::int32_t destination_stride, in qnt_3x8_5()
7959 std::int32_t multiplicative_offset, std::int32_t rounding_offset, in qnt_3x8_5()
7960 std::int32_t shift) { in qnt_3x8_5()
8080 void qnt_3x8_6(const std::int32_t* source, std::int32_t count, in qnt_3x8_6()
8081 std::int32_t stride, const std::int32_t* offsets, in qnt_3x8_6()
8082 std::uint8_t* destination, std::int32_t destination_stride, in qnt_3x8_6()
8083 std::int32_t multiplicative_offset, std::int32_t rounding_offset, in qnt_3x8_6()
8084 std::int32_t shift) { in qnt_3x8_6()
8201 void qnt_3x8_7(const std::int32_t* source, std::int32_t count, in qnt_3x8_7()
8202 std::int32_t stride, const std::int32_t* offsets, in qnt_3x8_7()
8203 std::uint8_t* destination, std::int32_t destination_stride, in qnt_3x8_7()
8204 std::int32_t multiplicative_offset, std::int32_t rounding_offset, in qnt_3x8_7()
8205 std::int32_t shift) { in qnt_3x8_7()
8328 void multi_qnt_1x8_aligned(const std::int32_t* source, std::int32_t count, in multi_qnt_1x8_aligned()
8329 std::int32_t stride, const std::int32_t* offsets, in multi_qnt_1x8_aligned()
8331 std::int32_t destination_stride, in multi_qnt_1x8_aligned()
8332 std::int32_t multiplicative_offset, in multi_qnt_1x8_aligned()
8333 std::int32_t rounding_offset, std::int32_t shift) { in multi_qnt_1x8_aligned()
8378 void multi_qnt_2x8_aligned(const std::int32_t* source, std::int32_t count, in multi_qnt_2x8_aligned()
8379 std::int32_t stride, const std::int32_t* offsets, in multi_qnt_2x8_aligned()
8381 std::int32_t destination_stride, in multi_qnt_2x8_aligned()
8382 std::int32_t multiplicative_offset, in multi_qnt_2x8_aligned()
8383 std::int32_t rounding_offset, std::int32_t shift) { in multi_qnt_2x8_aligned()
8428 void multi_qnt_3x8_aligned(const std::int32_t* source, std::int32_t count, in multi_qnt_3x8_aligned()
8429 std::int32_t stride, const std::int32_t* offsets, in multi_qnt_3x8_aligned()
8431 std::int32_t destination_stride, in multi_qnt_3x8_aligned()
8432 std::int32_t multiplicative_offset, in multi_qnt_3x8_aligned()
8433 std::int32_t rounding_offset, std::int32_t shift) { in multi_qnt_3x8_aligned()
8478 void multi_qnt_1x8(const std::int32_t* source, std::int32_t count, in multi_qnt_1x8()
8479 std::int32_t stride, const std::int32_t* offsets, in multi_qnt_1x8()
8480 std::uint8_t* destination, std::int32_t destination_stride, in multi_qnt_1x8()
8481 std::int32_t multiplicative_offset, in multi_qnt_1x8()
8482 std::int32_t rounding_offset, std::int32_t shift) { in multi_qnt_1x8()
8519 void multi_qnt_2x8(const std::int32_t* source, std::int32_t count, in multi_qnt_2x8()
8520 std::int32_t stride, const std::int32_t* offsets, in multi_qnt_2x8()
8521 std::uint8_t* destination, std::int32_t destination_stride, in multi_qnt_2x8()
8522 std::int32_t multiplicative_offset, in multi_qnt_2x8()
8523 std::int32_t rounding_offset, std::int32_t shift) { in multi_qnt_2x8()
8560 void multi_qnt_3x8(const std::int32_t* source, std::int32_t count, in multi_qnt_3x8()
8561 std::int32_t stride, const std::int32_t* offsets, in multi_qnt_3x8()
8562 std::uint8_t* destination, std::int32_t destination_stride, in multi_qnt_3x8()
8563 std::int32_t multiplicative_offset, in multi_qnt_3x8()
8564 std::int32_t rounding_offset, std::int32_t shift) { in multi_qnt_3x8()
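
The qnt_* and multi_qnt_* parameter lists above (int32 source, per-row offsets, uint8 destination, multiplicative_offset, rounding_offset, shift) point at a requantization step from 32-bit accumulators back to uint8. A minimal scalar sketch of that step follows; the exact offset, rounding, and clamping order inside the NEON kernels is an assumption here, and the names are hypothetical.

  #include <algorithm>
  #include <cstdint>

  // Hypothetical scalar requantization for one row: add the row offset,
  // scale, add the rounding term, shift right, and clamp to [0, 255].
  void qnt_row_reference(const std::int32_t* source, std::int32_t count,
                         std::int32_t row_offset, std::uint8_t* destination,
                         std::int32_t multiplicative_offset,
                         std::int32_t rounding_offset, std::int32_t shift) {
    for (std::int32_t i = 0; i < count; ++i) {
      const std::int32_t value =
          ((source[i] + row_offset) * multiplicative_offset + rounding_offset) >>
          shift;
      destination[i] = static_cast<std::uint8_t>(
          std::min<std::int32_t>(255, std::max<std::int32_t>(0, value)));
    }
  }
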
8602 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_0_0_0_aligned()
8603 std::int32_t n, std::int32_t k, in gemm_q8_0_0_0_aligned()
8604 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_0_0_0_aligned()
8605 std::int32_t result_offset, in gemm_q8_0_0_0_aligned()
8606 std::int32_t multiplicative_offset, in gemm_q8_0_0_0_aligned()
8607 std::int32_t shift, std::uint8_t* result, in gemm_q8_0_0_0_aligned()
8608 std::int32_t result_stride) { in gemm_q8_0_0_0_aligned()
8609 const std::int32_t row_chunks = m / 3; in gemm_q8_0_0_0_aligned()
8610 const std::int32_t col_chunks = n / 3; in gemm_q8_0_0_0_aligned()
8611 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_0_0_aligned()
8612 const std::int32_t chunk_size = k * 3; in gemm_q8_0_0_0_aligned()
8613 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_0_0_aligned()
8614 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_0_0_aligned()
8618 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_0_0_aligned()
8619 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_0_0_aligned()
8622 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_0_0_aligned()
8624 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_0_0_aligned()
8625 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_0_0_aligned()
8626 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_0_0_aligned()
8629 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_0_0_aligned()
8630 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_0_0_aligned()
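
The setup constants visible in gemm_q8_0_0_0_aligned (lines 8609-8630) round the depth k up to a multiple of 8, precompute the rounding term used before the final right shift, and pad the int32 result row stride to an 8-byte multiple. A tiny self-contained illustration of that arithmetic is shown below; the concrete values (k = 13, shift = 8, n = 5) are arbitrary and only demonstrate the formulas listed above.

  #include <cstdint>
  #include <cstdio>

  int main() {
    const std::int32_t k = 13, shift = 8, n = 5;
    const std::int32_t padded_k = ((k + 7) / 8) * 8;              // 16
    const std::int32_t rounding_offset = 1 << (shift - 1);        // 128
    const std::int32_t result_row_bytes = ((n * 4 + 7) / 8) * 8;  // 24
    std::printf("%d %d %d\n", padded_k, rounding_offset, result_row_bytes);
    return 0;
  }
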
8657 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_0_0_1_aligned()
8658 std::int32_t n, std::int32_t k, in gemm_q8_0_0_1_aligned()
8659 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_0_0_1_aligned()
8660 std::int32_t result_offset, in gemm_q8_0_0_1_aligned()
8661 std::int32_t multiplicative_offset, in gemm_q8_0_0_1_aligned()
8662 std::int32_t shift, std::uint8_t* result, in gemm_q8_0_0_1_aligned()
8663 std::int32_t result_stride) { in gemm_q8_0_0_1_aligned()
8664 const std::int32_t row_chunks = m / 3; in gemm_q8_0_0_1_aligned()
8665 const std::int32_t col_chunks = n / 3; in gemm_q8_0_0_1_aligned()
8666 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_0_1_aligned()
8667 const std::int32_t chunk_size = k * 3; in gemm_q8_0_0_1_aligned()
8668 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_0_1_aligned()
8669 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_0_1_aligned()
8673 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_0_1_aligned()
8674 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_0_1_aligned()
8677 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_0_1_aligned()
8679 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_0_1_aligned()
8680 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_0_1_aligned()
8681 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_0_1_aligned()
8684 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_0_1_aligned()
8685 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_0_1_aligned()
8712 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_0_0_2_aligned()
8713 std::int32_t n, std::int32_t k, in gemm_q8_0_0_2_aligned()
8714 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_0_0_2_aligned()
8715 std::int32_t result_offset, in gemm_q8_0_0_2_aligned()
8716 std::int32_t multiplicative_offset, in gemm_q8_0_0_2_aligned()
8717 std::int32_t shift, std::uint8_t* result, in gemm_q8_0_0_2_aligned()
8718 std::int32_t result_stride) { in gemm_q8_0_0_2_aligned()
8719 const std::int32_t row_chunks = m / 3; in gemm_q8_0_0_2_aligned()
8720 const std::int32_t col_chunks = n / 3; in gemm_q8_0_0_2_aligned()
8721 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_0_2_aligned()
8722 const std::int32_t chunk_size = k * 3; in gemm_q8_0_0_2_aligned()
8723 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_0_2_aligned()
8724 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_0_2_aligned()
8728 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_0_2_aligned()
8729 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_0_2_aligned()
8732 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_0_2_aligned()
8734 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_0_2_aligned()
8735 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_0_2_aligned()
8736 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_0_2_aligned()
8739 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_0_2_aligned()
8740 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_0_2_aligned()
8767 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_0_0_3_aligned()
8768 std::int32_t n, std::int32_t k, in gemm_q8_0_0_3_aligned()
8769 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_0_0_3_aligned()
8770 std::int32_t result_offset, in gemm_q8_0_0_3_aligned()
8771 std::int32_t multiplicative_offset, in gemm_q8_0_0_3_aligned()
8772 std::int32_t shift, std::uint8_t* result, in gemm_q8_0_0_3_aligned()
8773 std::int32_t result_stride) { in gemm_q8_0_0_3_aligned()
8774 const std::int32_t row_chunks = m / 3; in gemm_q8_0_0_3_aligned()
8775 const std::int32_t col_chunks = n / 3; in gemm_q8_0_0_3_aligned()
8776 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_0_3_aligned()
8777 const std::int32_t chunk_size = k * 3; in gemm_q8_0_0_3_aligned()
8778 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_0_3_aligned()
8779 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_0_3_aligned()
8783 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_0_3_aligned()
8784 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_0_3_aligned()
8787 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_0_3_aligned()
8789 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_0_3_aligned()
8790 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_0_3_aligned()
8791 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_0_3_aligned()
8794 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_0_3_aligned()
8795 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_0_3_aligned()
8822 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_0_0_4_aligned()
8823 std::int32_t n, std::int32_t k, in gemm_q8_0_0_4_aligned()
8824 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_0_0_4_aligned()
8825 std::int32_t result_offset, in gemm_q8_0_0_4_aligned()
8826 std::int32_t multiplicative_offset, in gemm_q8_0_0_4_aligned()
8827 std::int32_t shift, std::uint8_t* result, in gemm_q8_0_0_4_aligned()
8828 std::int32_t result_stride) { in gemm_q8_0_0_4_aligned()
8829 const std::int32_t row_chunks = m / 3; in gemm_q8_0_0_4_aligned()
8830 const std::int32_t col_chunks = n / 3; in gemm_q8_0_0_4_aligned()
8831 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_0_4_aligned()
8832 const std::int32_t chunk_size = k * 3; in gemm_q8_0_0_4_aligned()
8833 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_0_4_aligned()
8834 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_0_4_aligned()
8838 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_0_4_aligned()
8839 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_0_4_aligned()
8842 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_0_4_aligned()
8844 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_0_4_aligned()
8845 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_0_4_aligned()
8846 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_0_4_aligned()
8849 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_0_4_aligned()
8850 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_0_4_aligned()
8877 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_0_0_5_aligned()
8878 std::int32_t n, std::int32_t k, in gemm_q8_0_0_5_aligned()
8879 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_0_0_5_aligned()
8880 std::int32_t result_offset, in gemm_q8_0_0_5_aligned()
8881 std::int32_t multiplicative_offset, in gemm_q8_0_0_5_aligned()
8882 std::int32_t shift, std::uint8_t* result, in gemm_q8_0_0_5_aligned()
8883 std::int32_t result_stride) { in gemm_q8_0_0_5_aligned()
8884 const std::int32_t row_chunks = m / 3; in gemm_q8_0_0_5_aligned()
8885 const std::int32_t col_chunks = n / 3; in gemm_q8_0_0_5_aligned()
8886 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_0_5_aligned()
8887 const std::int32_t chunk_size = k * 3; in gemm_q8_0_0_5_aligned()
8888 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_0_5_aligned()
8889 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_0_5_aligned()
8893 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_0_5_aligned()
8894 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_0_5_aligned()
8897 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_0_5_aligned()
8899 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_0_5_aligned()
8900 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_0_5_aligned()
8901 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_0_5_aligned()
8904 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_0_5_aligned()
8905 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_0_5_aligned()
8932 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_0_0_6_aligned()
8933 std::int32_t n, std::int32_t k, in gemm_q8_0_0_6_aligned()
8934 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_0_0_6_aligned()
8935 std::int32_t result_offset, in gemm_q8_0_0_6_aligned()
8936 std::int32_t multiplicative_offset, in gemm_q8_0_0_6_aligned()
8937 std::int32_t shift, std::uint8_t* result, in gemm_q8_0_0_6_aligned()
8938 std::int32_t result_stride) { in gemm_q8_0_0_6_aligned()
8939 const std::int32_t row_chunks = m / 3; in gemm_q8_0_0_6_aligned()
8940 const std::int32_t col_chunks = n / 3; in gemm_q8_0_0_6_aligned()
8941 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_0_6_aligned()
8942 const std::int32_t chunk_size = k * 3; in gemm_q8_0_0_6_aligned()
8943 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_0_6_aligned()
8944 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_0_6_aligned()
8948 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_0_6_aligned()
8949 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_0_6_aligned()
8952 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_0_6_aligned()
8954 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_0_6_aligned()
8955 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_0_6_aligned()
8956 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_0_6_aligned()
8959 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_0_6_aligned()
8960 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_0_6_aligned()
8987 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_0_0_7_aligned()
8988 std::int32_t n, std::int32_t k, in gemm_q8_0_0_7_aligned()
8989 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_0_0_7_aligned()
8990 std::int32_t result_offset, in gemm_q8_0_0_7_aligned()
8991 std::int32_t multiplicative_offset, in gemm_q8_0_0_7_aligned()
8992 std::int32_t shift, std::uint8_t* result, in gemm_q8_0_0_7_aligned()
8993 std::int32_t result_stride) { in gemm_q8_0_0_7_aligned()
8994 const std::int32_t row_chunks = m / 3; in gemm_q8_0_0_7_aligned()
8995 const std::int32_t col_chunks = n / 3; in gemm_q8_0_0_7_aligned()
8996 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_0_7_aligned()
8997 const std::int32_t chunk_size = k * 3; in gemm_q8_0_0_7_aligned()
8998 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_0_7_aligned()
8999 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_0_7_aligned()
9003 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_0_7_aligned()
9004 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_0_7_aligned()
9007 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_0_7_aligned()
9009 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_0_7_aligned()
9010 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_0_7_aligned()
9011 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_0_7_aligned()
9014 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_0_7_aligned()
9015 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_0_7_aligned()
9042 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_0_1_0_aligned()
9043 std::int32_t n, std::int32_t k, in gemm_q8_0_1_0_aligned()
9044 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_0_1_0_aligned()
9045 std::int32_t result_offset, in gemm_q8_0_1_0_aligned()
9046 std::int32_t multiplicative_offset, in gemm_q8_0_1_0_aligned()
9047 std::int32_t shift, std::uint8_t* result, in gemm_q8_0_1_0_aligned()
9048 std::int32_t result_stride) { in gemm_q8_0_1_0_aligned()
9049 const std::int32_t row_chunks = m / 3; in gemm_q8_0_1_0_aligned()
9050 const std::int32_t col_chunks = n / 3; in gemm_q8_0_1_0_aligned()
9051 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_1_0_aligned()
9052 const std::int32_t chunk_size = k * 3; in gemm_q8_0_1_0_aligned()
9053 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_1_0_aligned()
9054 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_1_0_aligned()
9058 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_1_0_aligned()
9059 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_1_0_aligned()
9062 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_1_0_aligned()
9064 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_1_0_aligned()
9065 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_1_0_aligned()
9066 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_1_0_aligned()
9069 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_1_0_aligned()
9070 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_1_0_aligned()
9100 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_0_1_1_aligned()
9101 std::int32_t n, std::int32_t k, in gemm_q8_0_1_1_aligned()
9102 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_0_1_1_aligned()
9103 std::int32_t result_offset, in gemm_q8_0_1_1_aligned()
9104 std::int32_t multiplicative_offset, in gemm_q8_0_1_1_aligned()
9105 std::int32_t shift, std::uint8_t* result, in gemm_q8_0_1_1_aligned()
9106 std::int32_t result_stride) { in gemm_q8_0_1_1_aligned()
9107 const std::int32_t row_chunks = m / 3; in gemm_q8_0_1_1_aligned()
9108 const std::int32_t col_chunks = n / 3; in gemm_q8_0_1_1_aligned()
9109 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_1_1_aligned()
9110 const std::int32_t chunk_size = k * 3; in gemm_q8_0_1_1_aligned()
9111 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_1_1_aligned()
9112 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_1_1_aligned()
9116 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_1_1_aligned()
9117 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_1_1_aligned()
9120 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_1_1_aligned()
9122 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_1_1_aligned()
9123 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_1_1_aligned()
9124 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_1_1_aligned()
9127 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_1_1_aligned()
9128 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_1_1_aligned()
9158 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_0_1_2_aligned()
9159 std::int32_t n, std::int32_t k, in gemm_q8_0_1_2_aligned()
9160 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_0_1_2_aligned()
9161 std::int32_t result_offset, in gemm_q8_0_1_2_aligned()
9162 std::int32_t multiplicative_offset, in gemm_q8_0_1_2_aligned()
9163 std::int32_t shift, std::uint8_t* result, in gemm_q8_0_1_2_aligned()
9164 std::int32_t result_stride) { in gemm_q8_0_1_2_aligned()
9165 const std::int32_t row_chunks = m / 3; in gemm_q8_0_1_2_aligned()
9166 const std::int32_t col_chunks = n / 3; in gemm_q8_0_1_2_aligned()
9167 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_1_2_aligned()
9168 const std::int32_t chunk_size = k * 3; in gemm_q8_0_1_2_aligned()
9169 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_1_2_aligned()
9170 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_1_2_aligned()
9174 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_1_2_aligned()
9175 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_1_2_aligned()
9178 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_1_2_aligned()
9180 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_1_2_aligned()
9181 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_1_2_aligned()
9182 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_1_2_aligned()
9185 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_1_2_aligned()
9186 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_1_2_aligned()
9216 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_0_1_3_aligned()
9217 std::int32_t n, std::int32_t k, in gemm_q8_0_1_3_aligned()
9218 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_0_1_3_aligned()
9219 std::int32_t result_offset, in gemm_q8_0_1_3_aligned()
9220 std::int32_t multiplicative_offset, in gemm_q8_0_1_3_aligned()
9221 std::int32_t shift, std::uint8_t* result, in gemm_q8_0_1_3_aligned()
9222 std::int32_t result_stride) { in gemm_q8_0_1_3_aligned()
9223 const std::int32_t row_chunks = m / 3; in gemm_q8_0_1_3_aligned()
9224 const std::int32_t col_chunks = n / 3; in gemm_q8_0_1_3_aligned()
9225 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_1_3_aligned()
9226 const std::int32_t chunk_size = k * 3; in gemm_q8_0_1_3_aligned()
9227 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_1_3_aligned()
9228 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_1_3_aligned()
9232 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_1_3_aligned()
9233 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_1_3_aligned()
9236 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_1_3_aligned()
9238 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_1_3_aligned()
9239 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_1_3_aligned()
9240 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_1_3_aligned()
9243 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_1_3_aligned()
9244 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_1_3_aligned()
9274 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_0_1_4_aligned()
9275 std::int32_t n, std::int32_t k, in gemm_q8_0_1_4_aligned()
9276 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_0_1_4_aligned()
9277 std::int32_t result_offset, in gemm_q8_0_1_4_aligned()
9278 std::int32_t multiplicative_offset, in gemm_q8_0_1_4_aligned()
9279 std::int32_t shift, std::uint8_t* result, in gemm_q8_0_1_4_aligned()
9280 std::int32_t result_stride) { in gemm_q8_0_1_4_aligned()
9281 const std::int32_t row_chunks = m / 3; in gemm_q8_0_1_4_aligned()
9282 const std::int32_t col_chunks = n / 3; in gemm_q8_0_1_4_aligned()
9283 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_1_4_aligned()
9284 const std::int32_t chunk_size = k * 3; in gemm_q8_0_1_4_aligned()
9285 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_1_4_aligned()
9286 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_1_4_aligned()
9290 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_1_4_aligned()
9291 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_1_4_aligned()
9294 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_1_4_aligned()
9296 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_1_4_aligned()
9297 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_1_4_aligned()
9298 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_1_4_aligned()
9301 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_1_4_aligned()
9302 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_1_4_aligned()
9332 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_0_1_5_aligned()
9333 std::int32_t n, std::int32_t k, in gemm_q8_0_1_5_aligned()
9334 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_0_1_5_aligned()
9335 std::int32_t result_offset, in gemm_q8_0_1_5_aligned()
9336 std::int32_t multiplicative_offset, in gemm_q8_0_1_5_aligned()
9337 std::int32_t shift, std::uint8_t* result, in gemm_q8_0_1_5_aligned()
9338 std::int32_t result_stride) { in gemm_q8_0_1_5_aligned()
9339 const std::int32_t row_chunks = m / 3; in gemm_q8_0_1_5_aligned()
9340 const std::int32_t col_chunks = n / 3; in gemm_q8_0_1_5_aligned()
9341 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_1_5_aligned()
9342 const std::int32_t chunk_size = k * 3; in gemm_q8_0_1_5_aligned()
9343 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_1_5_aligned()
9344 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_1_5_aligned()
9348 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_1_5_aligned()
9349 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_1_5_aligned()
9352 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_1_5_aligned()
9354 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_1_5_aligned()
9355 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_1_5_aligned()
9356 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_1_5_aligned()
9359 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_1_5_aligned()
9360 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_1_5_aligned()
9390 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_0_1_6_aligned()
9391 std::int32_t n, std::int32_t k, in gemm_q8_0_1_6_aligned()
9392 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_0_1_6_aligned()
9393 std::int32_t result_offset, in gemm_q8_0_1_6_aligned()
9394 std::int32_t multiplicative_offset, in gemm_q8_0_1_6_aligned()
9395 std::int32_t shift, std::uint8_t* result, in gemm_q8_0_1_6_aligned()
9396 std::int32_t result_stride) { in gemm_q8_0_1_6_aligned()
9397 const std::int32_t row_chunks = m / 3; in gemm_q8_0_1_6_aligned()
9398 const std::int32_t col_chunks = n / 3; in gemm_q8_0_1_6_aligned()
9399 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_1_6_aligned()
9400 const std::int32_t chunk_size = k * 3; in gemm_q8_0_1_6_aligned()
9401 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_1_6_aligned()
9402 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_1_6_aligned()
9406 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_1_6_aligned()
9407 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_1_6_aligned()
9410 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_1_6_aligned()
9412 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_1_6_aligned()
9413 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_1_6_aligned()
9414 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_1_6_aligned()
9417 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_1_6_aligned()
9418 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_1_6_aligned()
9448 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_0_1_7_aligned()
9449 std::int32_t n, std::int32_t k, in gemm_q8_0_1_7_aligned()
9450 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_0_1_7_aligned()
9451 std::int32_t result_offset, in gemm_q8_0_1_7_aligned()
9452 std::int32_t multiplicative_offset, in gemm_q8_0_1_7_aligned()
9453 std::int32_t shift, std::uint8_t* result, in gemm_q8_0_1_7_aligned()
9454 std::int32_t result_stride) { in gemm_q8_0_1_7_aligned()
9455 const std::int32_t row_chunks = m / 3; in gemm_q8_0_1_7_aligned()
9456 const std::int32_t col_chunks = n / 3; in gemm_q8_0_1_7_aligned()
9457 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_1_7_aligned()
9458 const std::int32_t chunk_size = k * 3; in gemm_q8_0_1_7_aligned()
9459 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_1_7_aligned()
9460 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_1_7_aligned()
9464 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_1_7_aligned()
9465 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_1_7_aligned()
9468 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_1_7_aligned()
9470 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_1_7_aligned()
9471 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_1_7_aligned()
9472 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_1_7_aligned()
9475 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_1_7_aligned()
9476 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_1_7_aligned()
9506 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_0_2_0_aligned()
9507 std::int32_t n, std::int32_t k, in gemm_q8_0_2_0_aligned()
9508 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_0_2_0_aligned()
9509 std::int32_t result_offset, in gemm_q8_0_2_0_aligned()
9510 std::int32_t multiplicative_offset, in gemm_q8_0_2_0_aligned()
9511 std::int32_t shift, std::uint8_t* result, in gemm_q8_0_2_0_aligned()
9512 std::int32_t result_stride) { in gemm_q8_0_2_0_aligned()
9513 const std::int32_t row_chunks = m / 3; in gemm_q8_0_2_0_aligned()
9514 const std::int32_t col_chunks = n / 3; in gemm_q8_0_2_0_aligned()
9515 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_2_0_aligned()
9516 const std::int32_t chunk_size = k * 3; in gemm_q8_0_2_0_aligned()
9517 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_2_0_aligned()
9518 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_2_0_aligned()
9522 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_2_0_aligned()
9523 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_2_0_aligned()
9526 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_2_0_aligned()
9528 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_2_0_aligned()
9529 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_2_0_aligned()
9530 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_2_0_aligned()
9533 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_2_0_aligned()
9534 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_2_0_aligned()
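Every block also computes mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8, i.e. one row of std::int32_t partial results (4 bytes each) rounded up to an 8-byte multiple. The short sketch below walks an int32 scratch buffer with that byte stride; it is an illustration only, and the names walk_scratch, scratch and rows are hypothetical, not taken from the indexed file.

#include <cstdint>

// Hedged sketch: stepping through padded int32 scratch rows using the byte
// stride computed in the listings above.
void walk_scratch(std::int32_t* scratch, std::int32_t rows, std::int32_t n) {
  const std::int32_t stride_bytes = ((n * 4 + 7) / 8) * 8;  // as in the listings
  std::int32_t* row_ptr = scratch;
  for (std::int32_t r = 0; r < rows; ++r) {
    for (std::int32_t c = 0; c < n; ++c) {
      row_ptr[c] = 0;  // e.g. clear one row of the temporary result
    }
    // The stride is expressed in bytes, matching the *_stride_bytes naming,
    // so the pointer is advanced through a byte-typed view.
    row_ptr = reinterpret_cast<std::int32_t*>(
        reinterpret_cast<std::uint8_t*>(row_ptr) + stride_bytes);
  }
}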
9564 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_0_2_1_aligned()
9565 std::int32_t n, std::int32_t k, in gemm_q8_0_2_1_aligned()
9566 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_0_2_1_aligned()
9567 std::int32_t result_offset, in gemm_q8_0_2_1_aligned()
9568 std::int32_t multiplicative_offset, in gemm_q8_0_2_1_aligned()
9569 std::int32_t shift, std::uint8_t* result, in gemm_q8_0_2_1_aligned()
9570 std::int32_t result_stride) { in gemm_q8_0_2_1_aligned()
9571 const std::int32_t row_chunks = m / 3; in gemm_q8_0_2_1_aligned()
9572 const std::int32_t col_chunks = n / 3; in gemm_q8_0_2_1_aligned()
9573 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_2_1_aligned()
9574 const std::int32_t chunk_size = k * 3; in gemm_q8_0_2_1_aligned()
9575 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_2_1_aligned()
9576 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_2_1_aligned()
9580 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_2_1_aligned()
9581 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_2_1_aligned()
9584 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_2_1_aligned()
9586 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_2_1_aligned()
9587 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_2_1_aligned()
9588 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_2_1_aligned()
9591 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_2_1_aligned()
9592 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_2_1_aligned()
9622 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_0_2_2_aligned()
9623 std::int32_t n, std::int32_t k, in gemm_q8_0_2_2_aligned()
9624 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_0_2_2_aligned()
9625 std::int32_t result_offset, in gemm_q8_0_2_2_aligned()
9626 std::int32_t multiplicative_offset, in gemm_q8_0_2_2_aligned()
9627 std::int32_t shift, std::uint8_t* result, in gemm_q8_0_2_2_aligned()
9628 std::int32_t result_stride) { in gemm_q8_0_2_2_aligned()
9629 const std::int32_t row_chunks = m / 3; in gemm_q8_0_2_2_aligned()
9630 const std::int32_t col_chunks = n / 3; in gemm_q8_0_2_2_aligned()
9631 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_2_2_aligned()
9632 const std::int32_t chunk_size = k * 3; in gemm_q8_0_2_2_aligned()
9633 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_2_2_aligned()
9634 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_2_2_aligned()
9638 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_2_2_aligned()
9639 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_2_2_aligned()
9642 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_2_2_aligned()
9644 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_2_2_aligned()
9645 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_2_2_aligned()
9646 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_2_2_aligned()
9649 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_2_2_aligned()
9650 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_2_2_aligned()
9680 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_0_2_3_aligned()
9681 std::int32_t n, std::int32_t k, in gemm_q8_0_2_3_aligned()
9682 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_0_2_3_aligned()
9683 std::int32_t result_offset, in gemm_q8_0_2_3_aligned()
9684 std::int32_t multiplicative_offset, in gemm_q8_0_2_3_aligned()
9685 std::int32_t shift, std::uint8_t* result, in gemm_q8_0_2_3_aligned()
9686 std::int32_t result_stride) { in gemm_q8_0_2_3_aligned()
9687 const std::int32_t row_chunks = m / 3; in gemm_q8_0_2_3_aligned()
9688 const std::int32_t col_chunks = n / 3; in gemm_q8_0_2_3_aligned()
9689 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_2_3_aligned()
9690 const std::int32_t chunk_size = k * 3; in gemm_q8_0_2_3_aligned()
9691 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_2_3_aligned()
9692 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_2_3_aligned()
9696 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_2_3_aligned()
9697 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_2_3_aligned()
9700 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_2_3_aligned()
9702 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_2_3_aligned()
9703 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_2_3_aligned()
9704 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_2_3_aligned()
9707 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_2_3_aligned()
9708 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_2_3_aligned()
9738 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_0_2_4_aligned()
9739 std::int32_t n, std::int32_t k, in gemm_q8_0_2_4_aligned()
9740 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_0_2_4_aligned()
9741 std::int32_t result_offset, in gemm_q8_0_2_4_aligned()
9742 std::int32_t multiplicative_offset, in gemm_q8_0_2_4_aligned()
9743 std::int32_t shift, std::uint8_t* result, in gemm_q8_0_2_4_aligned()
9744 std::int32_t result_stride) { in gemm_q8_0_2_4_aligned()
9745 const std::int32_t row_chunks = m / 3; in gemm_q8_0_2_4_aligned()
9746 const std::int32_t col_chunks = n / 3; in gemm_q8_0_2_4_aligned()
9747 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_2_4_aligned()
9748 const std::int32_t chunk_size = k * 3; in gemm_q8_0_2_4_aligned()
9749 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_2_4_aligned()
9750 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_2_4_aligned()
9754 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_2_4_aligned()
9755 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_2_4_aligned()
9758 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_2_4_aligned()
9760 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_2_4_aligned()
9761 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_2_4_aligned()
9762 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_2_4_aligned()
9765 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_2_4_aligned()
9766 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_2_4_aligned()
9796 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_0_2_5_aligned()
9797 std::int32_t n, std::int32_t k, in gemm_q8_0_2_5_aligned()
9798 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_0_2_5_aligned()
9799 std::int32_t result_offset, in gemm_q8_0_2_5_aligned()
9800 std::int32_t multiplicative_offset, in gemm_q8_0_2_5_aligned()
9801 std::int32_t shift, std::uint8_t* result, in gemm_q8_0_2_5_aligned()
9802 std::int32_t result_stride) { in gemm_q8_0_2_5_aligned()
9803 const std::int32_t row_chunks = m / 3; in gemm_q8_0_2_5_aligned()
9804 const std::int32_t col_chunks = n / 3; in gemm_q8_0_2_5_aligned()
9805 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_2_5_aligned()
9806 const std::int32_t chunk_size = k * 3; in gemm_q8_0_2_5_aligned()
9807 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_2_5_aligned()
9808 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_2_5_aligned()
9812 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_2_5_aligned()
9813 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_2_5_aligned()
9816 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_2_5_aligned()
9818 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_2_5_aligned()
9819 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_2_5_aligned()
9820 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_2_5_aligned()
9823 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_2_5_aligned()
9824 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_2_5_aligned()
9854 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_0_2_6_aligned()
9855 std::int32_t n, std::int32_t k, in gemm_q8_0_2_6_aligned()
9856 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_0_2_6_aligned()
9857 std::int32_t result_offset, in gemm_q8_0_2_6_aligned()
9858 std::int32_t multiplicative_offset, in gemm_q8_0_2_6_aligned()
9859 std::int32_t shift, std::uint8_t* result, in gemm_q8_0_2_6_aligned()
9860 std::int32_t result_stride) { in gemm_q8_0_2_6_aligned()
9861 const std::int32_t row_chunks = m / 3; in gemm_q8_0_2_6_aligned()
9862 const std::int32_t col_chunks = n / 3; in gemm_q8_0_2_6_aligned()
9863 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_2_6_aligned()
9864 const std::int32_t chunk_size = k * 3; in gemm_q8_0_2_6_aligned()
9865 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_2_6_aligned()
9866 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_2_6_aligned()
9870 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_2_6_aligned()
9871 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_2_6_aligned()
9874 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_2_6_aligned()
9876 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_2_6_aligned()
9877 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_2_6_aligned()
9878 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_2_6_aligned()
9881 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_2_6_aligned()
9882 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_2_6_aligned()
9912 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_0_2_7_aligned()
9913 std::int32_t n, std::int32_t k, in gemm_q8_0_2_7_aligned()
9914 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_0_2_7_aligned()
9915 std::int32_t result_offset, in gemm_q8_0_2_7_aligned()
9916 std::int32_t multiplicative_offset, in gemm_q8_0_2_7_aligned()
9917 std::int32_t shift, std::uint8_t* result, in gemm_q8_0_2_7_aligned()
9918 std::int32_t result_stride) { in gemm_q8_0_2_7_aligned()
9919 const std::int32_t row_chunks = m / 3; in gemm_q8_0_2_7_aligned()
9920 const std::int32_t col_chunks = n / 3; in gemm_q8_0_2_7_aligned()
9921 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_2_7_aligned()
9922 const std::int32_t chunk_size = k * 3; in gemm_q8_0_2_7_aligned()
9923 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_2_7_aligned()
9924 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_2_7_aligned()
9928 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_2_7_aligned()
9929 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_2_7_aligned()
9932 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_2_7_aligned()
9934 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_2_7_aligned()
9935 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_2_7_aligned()
9936 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_2_7_aligned()
9939 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_2_7_aligned()
9940 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_2_7_aligned()
9970 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_1_0_0_aligned()
9971 std::int32_t n, std::int32_t k, in gemm_q8_1_0_0_aligned()
9972 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_1_0_0_aligned()
9973 std::int32_t result_offset, in gemm_q8_1_0_0_aligned()
9974 std::int32_t multiplicative_offset, in gemm_q8_1_0_0_aligned()
9975 std::int32_t shift, std::uint8_t* result, in gemm_q8_1_0_0_aligned()
9976 std::int32_t result_stride) { in gemm_q8_1_0_0_aligned()
9977 const std::int32_t row_chunks = m / 3; in gemm_q8_1_0_0_aligned()
9978 const std::int32_t col_chunks = n / 3; in gemm_q8_1_0_0_aligned()
9979 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_0_0_aligned()
9980 const std::int32_t chunk_size = k * 3; in gemm_q8_1_0_0_aligned()
9981 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_0_0_aligned()
9982 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_0_0_aligned()
9986 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_0_0_aligned()
9987 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_0_0_aligned()
9988 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_0_0_aligned()
9989 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_0_0_aligned()
9992 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_0_0_aligned()
9994 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_0_0_aligned()
9995 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_0_0_aligned()
9996 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_0_0_aligned()
9999 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_0_0_aligned()
10000 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_0_0_aligned()
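Starting with gemm_q8_1_0_0_aligned, the listings add a second pointer, zipped_lhs_1_offsets, next to zipped_lhs_3_offsets. A plausible reading (an assumption, not stated in the listing) is that the gemm_q8_1_* variants handle a single leftover LHS row beyond the full 3-row chunks, so the packed LHS buffer carries per-row int32 correction values both after a 3-row chunk (at zipped_lhs + padded_k * 3) and after the 1-row leftover chunk (at zipped_lhs + padded_k * 1), with the (padded_k + 16) per-row size presumably covering those sums plus alignment padding. The sketch below captures that layout under this assumption; ZippedLhsChunkView and its members are hypothetical names.

#include <cstdint>

// Hedged sketch of the packed-LHS chunk layout implied by the two
// reinterpret_casts above: rows of padded_k packed bytes, followed by one
// std::int32_t correction value per row in the chunk.
struct ZippedLhsChunkView {
  std::uint8_t* data;     // start of one packed chunk
  std::int32_t padded_k;  // ((k + 7) / 8) * 8, as in the listings
  std::int32_t rows;      // 3 for full chunks, 1 for the assumed leftover chunk

  // Packed 8-bit data for row r of the chunk.
  std::uint8_t* row(std::int32_t r) { return data + r * padded_k; }

  // Per-row int32 offsets start right after the packed rows, which is what
  // zipped_lhs + padded_k * 3 (or * 1) points at in the listings.
  std::int32_t* offsets() {
    return reinterpret_cast<std::int32_t*>(data + rows * padded_k);
  }
};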
10040 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_1_0_1_aligned()
10041 std::int32_t n, std::int32_t k, in gemm_q8_1_0_1_aligned()
10042 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_1_0_1_aligned()
10043 std::int32_t result_offset, in gemm_q8_1_0_1_aligned()
10044 std::int32_t multiplicative_offset, in gemm_q8_1_0_1_aligned()
10045 std::int32_t shift, std::uint8_t* result, in gemm_q8_1_0_1_aligned()
10046 std::int32_t result_stride) { in gemm_q8_1_0_1_aligned()
10047 const std::int32_t row_chunks = m / 3; in gemm_q8_1_0_1_aligned()
10048 const std::int32_t col_chunks = n / 3; in gemm_q8_1_0_1_aligned()
10049 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_0_1_aligned()
10050 const std::int32_t chunk_size = k * 3; in gemm_q8_1_0_1_aligned()
10051 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_0_1_aligned()
10052 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_0_1_aligned()
10056 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_0_1_aligned()
10057 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_0_1_aligned()
10058 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_0_1_aligned()
10059 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_0_1_aligned()
10062 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_0_1_aligned()
10064 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_0_1_aligned()
10065 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_0_1_aligned()
10066 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_0_1_aligned()
10069 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_0_1_aligned()
10070 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_0_1_aligned()
10110 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_1_0_2_aligned()
10111 std::int32_t n, std::int32_t k, in gemm_q8_1_0_2_aligned()
10112 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_1_0_2_aligned()
10113 std::int32_t result_offset, in gemm_q8_1_0_2_aligned()
10114 std::int32_t multiplicative_offset, in gemm_q8_1_0_2_aligned()
10115 std::int32_t shift, std::uint8_t* result, in gemm_q8_1_0_2_aligned()
10116 std::int32_t result_stride) { in gemm_q8_1_0_2_aligned()
10117 const std::int32_t row_chunks = m / 3; in gemm_q8_1_0_2_aligned()
10118 const std::int32_t col_chunks = n / 3; in gemm_q8_1_0_2_aligned()
10119 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_0_2_aligned()
10120 const std::int32_t chunk_size = k * 3; in gemm_q8_1_0_2_aligned()
10121 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_0_2_aligned()
10122 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_0_2_aligned()
10126 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_0_2_aligned()
10127 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_0_2_aligned()
10128 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_0_2_aligned()
10129 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_0_2_aligned()
10132 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_0_2_aligned()
10134 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_0_2_aligned()
10135 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_0_2_aligned()
10136 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_0_2_aligned()
10139 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_0_2_aligned()
10140 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_0_2_aligned()
10180 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_1_0_3_aligned()
10181 std::int32_t n, std::int32_t k, in gemm_q8_1_0_3_aligned()
10182 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_1_0_3_aligned()
10183 std::int32_t result_offset, in gemm_q8_1_0_3_aligned()
10184 std::int32_t multiplicative_offset, in gemm_q8_1_0_3_aligned()
10185 std::int32_t shift, std::uint8_t* result, in gemm_q8_1_0_3_aligned()
10186 std::int32_t result_stride) { in gemm_q8_1_0_3_aligned()
10187 const std::int32_t row_chunks = m / 3; in gemm_q8_1_0_3_aligned()
10188 const std::int32_t col_chunks = n / 3; in gemm_q8_1_0_3_aligned()
10189 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_0_3_aligned()
10190 const std::int32_t chunk_size = k * 3; in gemm_q8_1_0_3_aligned()
10191 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_0_3_aligned()
10192 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_0_3_aligned()
10196 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_0_3_aligned()
10197 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_0_3_aligned()
10198 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_0_3_aligned()
10199 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_0_3_aligned()
10202 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_0_3_aligned()
10204 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_0_3_aligned()
10205 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_0_3_aligned()
10206 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_0_3_aligned()
10209 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_0_3_aligned()
10210 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_0_3_aligned()
10250 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_1_0_4_aligned()
10251 std::int32_t n, std::int32_t k, in gemm_q8_1_0_4_aligned()
10252 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_1_0_4_aligned()
10253 std::int32_t result_offset, in gemm_q8_1_0_4_aligned()
10254 std::int32_t multiplicative_offset, in gemm_q8_1_0_4_aligned()
10255 std::int32_t shift, std::uint8_t* result, in gemm_q8_1_0_4_aligned()
10256 std::int32_t result_stride) { in gemm_q8_1_0_4_aligned()
10257 const std::int32_t row_chunks = m / 3; in gemm_q8_1_0_4_aligned()
10258 const std::int32_t col_chunks = n / 3; in gemm_q8_1_0_4_aligned()
10259 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_0_4_aligned()
10260 const std::int32_t chunk_size = k * 3; in gemm_q8_1_0_4_aligned()
10261 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_0_4_aligned()
10262 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_0_4_aligned()
10266 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_0_4_aligned()
10267 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_0_4_aligned()
10268 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_0_4_aligned()
10269 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_0_4_aligned()
10272 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_0_4_aligned()
10274 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_0_4_aligned()
10275 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_0_4_aligned()
10276 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_0_4_aligned()
10279 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_0_4_aligned()
10280 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_0_4_aligned()
10320 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_1_0_5_aligned()
10321 std::int32_t n, std::int32_t k, in gemm_q8_1_0_5_aligned()
10322 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_1_0_5_aligned()
10323 std::int32_t result_offset, in gemm_q8_1_0_5_aligned()
10324 std::int32_t multiplicative_offset, in gemm_q8_1_0_5_aligned()
10325 std::int32_t shift, std::uint8_t* result, in gemm_q8_1_0_5_aligned()
10326 std::int32_t result_stride) { in gemm_q8_1_0_5_aligned()
10327 const std::int32_t row_chunks = m / 3; in gemm_q8_1_0_5_aligned()
10328 const std::int32_t col_chunks = n / 3; in gemm_q8_1_0_5_aligned()
10329 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_0_5_aligned()
10330 const std::int32_t chunk_size = k * 3; in gemm_q8_1_0_5_aligned()
10331 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_0_5_aligned()
10332 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_0_5_aligned()
10336 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_0_5_aligned()
10337 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_0_5_aligned()
10338 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_0_5_aligned()
10339 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_0_5_aligned()
10342 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_0_5_aligned()
10344 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_0_5_aligned()
10345 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_0_5_aligned()
10346 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_0_5_aligned()
10349 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_0_5_aligned()
10350 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_0_5_aligned()
10390 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_1_0_6_aligned()
10391 std::int32_t n, std::int32_t k, in gemm_q8_1_0_6_aligned()
10392 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_1_0_6_aligned()
10393 std::int32_t result_offset, in gemm_q8_1_0_6_aligned()
10394 std::int32_t multiplicative_offset, in gemm_q8_1_0_6_aligned()
10395 std::int32_t shift, std::uint8_t* result, in gemm_q8_1_0_6_aligned()
10396 std::int32_t result_stride) { in gemm_q8_1_0_6_aligned()
10397 const std::int32_t row_chunks = m / 3; in gemm_q8_1_0_6_aligned()
10398 const std::int32_t col_chunks = n / 3; in gemm_q8_1_0_6_aligned()
10399 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_0_6_aligned()
10400 const std::int32_t chunk_size = k * 3; in gemm_q8_1_0_6_aligned()
10401 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_0_6_aligned()
10402 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_0_6_aligned()
10406 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_0_6_aligned()
10407 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_0_6_aligned()
10408 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_0_6_aligned()
10409 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_0_6_aligned()
10412 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_0_6_aligned()
10414 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_0_6_aligned()
10415 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_0_6_aligned()
10416 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_0_6_aligned()
10419 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_0_6_aligned()
10420 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_0_6_aligned()
10460 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_1_0_7_aligned()
10461 std::int32_t n, std::int32_t k, in gemm_q8_1_0_7_aligned()
10462 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_1_0_7_aligned()
10463 std::int32_t result_offset, in gemm_q8_1_0_7_aligned()
10464 std::int32_t multiplicative_offset, in gemm_q8_1_0_7_aligned()
10465 std::int32_t shift, std::uint8_t* result, in gemm_q8_1_0_7_aligned()
10466 std::int32_t result_stride) { in gemm_q8_1_0_7_aligned()
10467 const std::int32_t row_chunks = m / 3; in gemm_q8_1_0_7_aligned()
10468 const std::int32_t col_chunks = n / 3; in gemm_q8_1_0_7_aligned()
10469 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_0_7_aligned()
10470 const std::int32_t chunk_size = k * 3; in gemm_q8_1_0_7_aligned()
10471 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_0_7_aligned()
10472 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_0_7_aligned()
10476 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_0_7_aligned()
10477 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_0_7_aligned()
10478 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_0_7_aligned()
10479 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_0_7_aligned()
10482 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_0_7_aligned()
10484 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_0_7_aligned()
10485 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_0_7_aligned()
10486 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_0_7_aligned()
10489 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_0_7_aligned()
10490 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_0_7_aligned()
10530 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_1_1_0_aligned()
10531 std::int32_t n, std::int32_t k, in gemm_q8_1_1_0_aligned()
10532 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_1_1_0_aligned()
10533 std::int32_t result_offset, in gemm_q8_1_1_0_aligned()
10534 std::int32_t multiplicative_offset, in gemm_q8_1_1_0_aligned()
10535 std::int32_t shift, std::uint8_t* result, in gemm_q8_1_1_0_aligned()
10536 std::int32_t result_stride) { in gemm_q8_1_1_0_aligned()
10537 const std::int32_t row_chunks = m / 3; in gemm_q8_1_1_0_aligned()
10538 const std::int32_t col_chunks = n / 3; in gemm_q8_1_1_0_aligned()
10539 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_1_0_aligned()
10540 const std::int32_t chunk_size = k * 3; in gemm_q8_1_1_0_aligned()
10541 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_1_0_aligned()
10542 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_1_0_aligned()
10546 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_1_0_aligned()
10547 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_1_0_aligned()
10548 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_1_0_aligned()
10549 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_1_0_aligned()
10552 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_1_0_aligned()
10554 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_1_0_aligned()
10555 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_1_0_aligned()
10556 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_1_0_aligned()
10559 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_1_0_aligned()
10560 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_1_0_aligned()
10605 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_1_1_1_aligned()
10606 std::int32_t n, std::int32_t k, in gemm_q8_1_1_1_aligned()
10607 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_1_1_1_aligned()
10608 std::int32_t result_offset, in gemm_q8_1_1_1_aligned()
10609 std::int32_t multiplicative_offset, in gemm_q8_1_1_1_aligned()
10610 std::int32_t shift, std::uint8_t* result, in gemm_q8_1_1_1_aligned()
10611 std::int32_t result_stride) { in gemm_q8_1_1_1_aligned()
10612 const std::int32_t row_chunks = m / 3; in gemm_q8_1_1_1_aligned()
10613 const std::int32_t col_chunks = n / 3; in gemm_q8_1_1_1_aligned()
10614 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_1_1_aligned()
10615 const std::int32_t chunk_size = k * 3; in gemm_q8_1_1_1_aligned()
10616 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_1_1_aligned()
10617 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_1_1_aligned()
10621 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_1_1_aligned()
10622 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_1_1_aligned()
10623 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_1_1_aligned()
10624 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_1_1_aligned()
10627 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_1_1_aligned()
10629 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_1_1_aligned()
10630 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_1_1_aligned()
10631 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_1_1_aligned()
10634 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_1_1_aligned()
10635 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_1_1_aligned()
10680 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_1_1_2_aligned()
10681 std::int32_t n, std::int32_t k, in gemm_q8_1_1_2_aligned()
10682 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_1_1_2_aligned()
10683 std::int32_t result_offset, in gemm_q8_1_1_2_aligned()
10684 std::int32_t multiplicative_offset, in gemm_q8_1_1_2_aligned()
10685 std::int32_t shift, std::uint8_t* result, in gemm_q8_1_1_2_aligned()
10686 std::int32_t result_stride) { in gemm_q8_1_1_2_aligned()
10687 const std::int32_t row_chunks = m / 3; in gemm_q8_1_1_2_aligned()
10688 const std::int32_t col_chunks = n / 3; in gemm_q8_1_1_2_aligned()
10689 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_1_2_aligned()
10690 const std::int32_t chunk_size = k * 3; in gemm_q8_1_1_2_aligned()
10691 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_1_2_aligned()
10692 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_1_2_aligned()
10696 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_1_2_aligned()
10697 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_1_2_aligned()
10698 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_1_2_aligned()
10699 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_1_2_aligned()
10702 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_1_2_aligned()
10704 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_1_2_aligned()
10705 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_1_2_aligned()
10706 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_1_2_aligned()
10709 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_1_2_aligned()
10710 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_1_2_aligned()
10755 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_1_1_3_aligned()
10756 std::int32_t n, std::int32_t k, in gemm_q8_1_1_3_aligned()
10757 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_1_1_3_aligned()
10758 std::int32_t result_offset, in gemm_q8_1_1_3_aligned()
10759 std::int32_t multiplicative_offset, in gemm_q8_1_1_3_aligned()
10760 std::int32_t shift, std::uint8_t* result, in gemm_q8_1_1_3_aligned()
10761 std::int32_t result_stride) { in gemm_q8_1_1_3_aligned()
10762 const std::int32_t row_chunks = m / 3; in gemm_q8_1_1_3_aligned()
10763 const std::int32_t col_chunks = n / 3; in gemm_q8_1_1_3_aligned()
10764 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_1_3_aligned()
10765 const std::int32_t chunk_size = k * 3; in gemm_q8_1_1_3_aligned()
10766 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_1_3_aligned()
10767 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_1_3_aligned()
10771 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_1_3_aligned()
10772 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_1_3_aligned()
10773 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_1_3_aligned()
10774 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_1_3_aligned()
10777 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_1_3_aligned()
10779 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_1_3_aligned()
10780 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_1_3_aligned()
10781 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_1_3_aligned()
10784 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_1_3_aligned()
10785 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_1_3_aligned()
10830 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_1_1_4_aligned()
10831 std::int32_t n, std::int32_t k, in gemm_q8_1_1_4_aligned()
10832 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_1_1_4_aligned()
10833 std::int32_t result_offset, in gemm_q8_1_1_4_aligned()
10834 std::int32_t multiplicative_offset, in gemm_q8_1_1_4_aligned()
10835 std::int32_t shift, std::uint8_t* result, in gemm_q8_1_1_4_aligned()
10836 std::int32_t result_stride) { in gemm_q8_1_1_4_aligned()
10837 const std::int32_t row_chunks = m / 3; in gemm_q8_1_1_4_aligned()
10838 const std::int32_t col_chunks = n / 3; in gemm_q8_1_1_4_aligned()
10839 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_1_4_aligned()
10840 const std::int32_t chunk_size = k * 3; in gemm_q8_1_1_4_aligned()
10841 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_1_4_aligned()
10842 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_1_4_aligned()
10846 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_1_4_aligned()
10847 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_1_4_aligned()
10848 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_1_4_aligned()
10849 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_1_4_aligned()
10852 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_1_4_aligned()
10854 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_1_4_aligned()
10855 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_1_4_aligned()
10856 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_1_4_aligned()
10859 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_1_4_aligned()
10860 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_1_4_aligned()
10905 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_1_1_5_aligned()
10906 std::int32_t n, std::int32_t k, in gemm_q8_1_1_5_aligned()
10907 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_1_1_5_aligned()
10908 std::int32_t result_offset, in gemm_q8_1_1_5_aligned()
10909 std::int32_t multiplicative_offset, in gemm_q8_1_1_5_aligned()
10910 std::int32_t shift, std::uint8_t* result, in gemm_q8_1_1_5_aligned()
10911 std::int32_t result_stride) { in gemm_q8_1_1_5_aligned()
10912 const std::int32_t row_chunks = m / 3; in gemm_q8_1_1_5_aligned()
10913 const std::int32_t col_chunks = n / 3; in gemm_q8_1_1_5_aligned()
10914 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_1_5_aligned()
10915 const std::int32_t chunk_size = k * 3; in gemm_q8_1_1_5_aligned()
10916 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_1_5_aligned()
10917 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_1_5_aligned()
10921 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_1_5_aligned()
10922 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_1_5_aligned()
10923 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_1_5_aligned()
10924 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_1_5_aligned()
10927 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_1_5_aligned()
10929 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_1_5_aligned()
10930 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_1_5_aligned()
10931 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_1_5_aligned()
10934 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_1_5_aligned()
10935 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_1_5_aligned()
10980 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_1_1_6_aligned()
10981 std::int32_t n, std::int32_t k, in gemm_q8_1_1_6_aligned()
10982 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_1_1_6_aligned()
10983 std::int32_t result_offset, in gemm_q8_1_1_6_aligned()
10984 std::int32_t multiplicative_offset, in gemm_q8_1_1_6_aligned()
10985 std::int32_t shift, std::uint8_t* result, in gemm_q8_1_1_6_aligned()
10986 std::int32_t result_stride) { in gemm_q8_1_1_6_aligned()
10987 const std::int32_t row_chunks = m / 3; in gemm_q8_1_1_6_aligned()
10988 const std::int32_t col_chunks = n / 3; in gemm_q8_1_1_6_aligned()
10989 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_1_6_aligned()
10990 const std::int32_t chunk_size = k * 3; in gemm_q8_1_1_6_aligned()
10991 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_1_6_aligned()
10992 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_1_6_aligned()
10996 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_1_6_aligned()
10997 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_1_6_aligned()
10998 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_1_6_aligned()
10999 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_1_6_aligned()
11002 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_1_6_aligned()
11004 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_1_6_aligned()
11005 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_1_6_aligned()
11006 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_1_6_aligned()
11009 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_1_6_aligned()
11010 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_1_6_aligned()
11055 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_1_1_7_aligned()
11056 std::int32_t n, std::int32_t k, in gemm_q8_1_1_7_aligned()
11057 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_1_1_7_aligned()
11058 std::int32_t result_offset, in gemm_q8_1_1_7_aligned()
11059 std::int32_t multiplicative_offset, in gemm_q8_1_1_7_aligned()
11060 std::int32_t shift, std::uint8_t* result, in gemm_q8_1_1_7_aligned()
11061 std::int32_t result_stride) { in gemm_q8_1_1_7_aligned()
11062 const std::int32_t row_chunks = m / 3; in gemm_q8_1_1_7_aligned()
11063 const std::int32_t col_chunks = n / 3; in gemm_q8_1_1_7_aligned()
11064 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_1_7_aligned()
11065 const std::int32_t chunk_size = k * 3; in gemm_q8_1_1_7_aligned()
11066 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_1_7_aligned()
11067 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_1_7_aligned()
11071 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_1_7_aligned()
11072 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_1_7_aligned()
11073 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_1_7_aligned()
11074 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_1_7_aligned()
11077 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_1_7_aligned()
11079 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_1_7_aligned()
11080 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_1_7_aligned()
11081 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_1_7_aligned()
11084 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_1_7_aligned()
11085 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_1_7_aligned()
11130 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_1_2_0_aligned()
11131 std::int32_t n, std::int32_t k, in gemm_q8_1_2_0_aligned()
11132 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_1_2_0_aligned()
11133 std::int32_t result_offset, in gemm_q8_1_2_0_aligned()
11134 std::int32_t multiplicative_offset, in gemm_q8_1_2_0_aligned()
11135 std::int32_t shift, std::uint8_t* result, in gemm_q8_1_2_0_aligned()
11136 std::int32_t result_stride) { in gemm_q8_1_2_0_aligned()
11137 const std::int32_t row_chunks = m / 3; in gemm_q8_1_2_0_aligned()
11138 const std::int32_t col_chunks = n / 3; in gemm_q8_1_2_0_aligned()
11139 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_2_0_aligned()
11140 const std::int32_t chunk_size = k * 3; in gemm_q8_1_2_0_aligned()
11141 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_2_0_aligned()
11142 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_2_0_aligned()
11146 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_2_0_aligned()
11147 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_2_0_aligned()
11148 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_2_0_aligned()
11149 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_2_0_aligned()
11152 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_2_0_aligned()
11154 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_2_0_aligned()
11155 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_2_0_aligned()
11156 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_2_0_aligned()
11159 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_2_0_aligned()
11160 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_2_0_aligned()
11205 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_1_2_1_aligned()
11206 std::int32_t n, std::int32_t k, in gemm_q8_1_2_1_aligned()
11207 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_1_2_1_aligned()
11208 std::int32_t result_offset, in gemm_q8_1_2_1_aligned()
11209 std::int32_t multiplicative_offset, in gemm_q8_1_2_1_aligned()
11210 std::int32_t shift, std::uint8_t* result, in gemm_q8_1_2_1_aligned()
11211 std::int32_t result_stride) { in gemm_q8_1_2_1_aligned()
11212 const std::int32_t row_chunks = m / 3; in gemm_q8_1_2_1_aligned()
11213 const std::int32_t col_chunks = n / 3; in gemm_q8_1_2_1_aligned()
11214 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_2_1_aligned()
11215 const std::int32_t chunk_size = k * 3; in gemm_q8_1_2_1_aligned()
11216 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_2_1_aligned()
11217 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_2_1_aligned()
11221 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_2_1_aligned()
11222 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_2_1_aligned()
11223 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_2_1_aligned()
11224 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_2_1_aligned()
11227 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_2_1_aligned()
11229 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_2_1_aligned()
11230 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_2_1_aligned()
11231 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_2_1_aligned()
11234 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_2_1_aligned()
11235 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_2_1_aligned()
11280 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_1_2_2_aligned()
11281 std::int32_t n, std::int32_t k, in gemm_q8_1_2_2_aligned()
11282 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_1_2_2_aligned()
11283 std::int32_t result_offset, in gemm_q8_1_2_2_aligned()
11284 std::int32_t multiplicative_offset, in gemm_q8_1_2_2_aligned()
11285 std::int32_t shift, std::uint8_t* result, in gemm_q8_1_2_2_aligned()
11286 std::int32_t result_stride) { in gemm_q8_1_2_2_aligned()
11287 const std::int32_t row_chunks = m / 3; in gemm_q8_1_2_2_aligned()
11288 const std::int32_t col_chunks = n / 3; in gemm_q8_1_2_2_aligned()
11289 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_2_2_aligned()
11290 const std::int32_t chunk_size = k * 3; in gemm_q8_1_2_2_aligned()
11291 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_2_2_aligned()
11292 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_2_2_aligned()
11296 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_2_2_aligned()
11297 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_2_2_aligned()
11298 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_2_2_aligned()
11299 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_2_2_aligned()
11302 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_2_2_aligned()
11304 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_2_2_aligned()
11305 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_2_2_aligned()
11306 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_2_2_aligned()
11309 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_2_2_aligned()
11310 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_2_2_aligned()
11355 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_1_2_3_aligned()
11356 std::int32_t n, std::int32_t k, in gemm_q8_1_2_3_aligned()
11357 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_1_2_3_aligned()
11358 std::int32_t result_offset, in gemm_q8_1_2_3_aligned()
11359 std::int32_t multiplicative_offset, in gemm_q8_1_2_3_aligned()
11360 std::int32_t shift, std::uint8_t* result, in gemm_q8_1_2_3_aligned()
11361 std::int32_t result_stride) { in gemm_q8_1_2_3_aligned()
11362 const std::int32_t row_chunks = m / 3; in gemm_q8_1_2_3_aligned()
11363 const std::int32_t col_chunks = n / 3; in gemm_q8_1_2_3_aligned()
11364 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_2_3_aligned()
11365 const std::int32_t chunk_size = k * 3; in gemm_q8_1_2_3_aligned()
11366 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_2_3_aligned()
11367 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_2_3_aligned()
11371 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_2_3_aligned()
11372 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_2_3_aligned()
11373 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_2_3_aligned()
11374 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_2_3_aligned()
11377 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_2_3_aligned()
11379 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_2_3_aligned()
11380 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_2_3_aligned()
11381 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_2_3_aligned()
11384 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_2_3_aligned()
11385 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_2_3_aligned()
11430 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_1_2_4_aligned()
11431 std::int32_t n, std::int32_t k, in gemm_q8_1_2_4_aligned()
11432 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_1_2_4_aligned()
11433 std::int32_t result_offset, in gemm_q8_1_2_4_aligned()
11434 std::int32_t multiplicative_offset, in gemm_q8_1_2_4_aligned()
11435 std::int32_t shift, std::uint8_t* result, in gemm_q8_1_2_4_aligned()
11436 std::int32_t result_stride) { in gemm_q8_1_2_4_aligned()
11437 const std::int32_t row_chunks = m / 3; in gemm_q8_1_2_4_aligned()
11438 const std::int32_t col_chunks = n / 3; in gemm_q8_1_2_4_aligned()
11439 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_2_4_aligned()
11440 const std::int32_t chunk_size = k * 3; in gemm_q8_1_2_4_aligned()
11441 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_2_4_aligned()
11442 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_2_4_aligned()
11446 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_2_4_aligned()
11447 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_2_4_aligned()
11448 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_2_4_aligned()
11449 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_2_4_aligned()
11452 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_2_4_aligned()
11454 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_2_4_aligned()
11455 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_2_4_aligned()
11456 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_2_4_aligned()
11459 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_2_4_aligned()
11460 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_2_4_aligned()
11505 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_1_2_5_aligned()
11506 std::int32_t n, std::int32_t k, in gemm_q8_1_2_5_aligned()
11507 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_1_2_5_aligned()
11508 std::int32_t result_offset, in gemm_q8_1_2_5_aligned()
11509 std::int32_t multiplicative_offset, in gemm_q8_1_2_5_aligned()
11510 std::int32_t shift, std::uint8_t* result, in gemm_q8_1_2_5_aligned()
11511 std::int32_t result_stride) { in gemm_q8_1_2_5_aligned()
11512 const std::int32_t row_chunks = m / 3; in gemm_q8_1_2_5_aligned()
11513 const std::int32_t col_chunks = n / 3; in gemm_q8_1_2_5_aligned()
11514 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_2_5_aligned()
11515 const std::int32_t chunk_size = k * 3; in gemm_q8_1_2_5_aligned()
11516 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_2_5_aligned()
11517 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_2_5_aligned()
11521 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_2_5_aligned()
11522 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_2_5_aligned()
11523 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_2_5_aligned()
11524 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_2_5_aligned()
11527 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_2_5_aligned()
11529 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_2_5_aligned()
11530 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_2_5_aligned()
11531 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_2_5_aligned()
11534 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_2_5_aligned()
11535 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_2_5_aligned()
11580 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_1_2_6_aligned()
11581 std::int32_t n, std::int32_t k, in gemm_q8_1_2_6_aligned()
11582 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_1_2_6_aligned()
11583 std::int32_t result_offset, in gemm_q8_1_2_6_aligned()
11584 std::int32_t multiplicative_offset, in gemm_q8_1_2_6_aligned()
11585 std::int32_t shift, std::uint8_t* result, in gemm_q8_1_2_6_aligned()
11586 std::int32_t result_stride) { in gemm_q8_1_2_6_aligned()
11587 const std::int32_t row_chunks = m / 3; in gemm_q8_1_2_6_aligned()
11588 const std::int32_t col_chunks = n / 3; in gemm_q8_1_2_6_aligned()
11589 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_2_6_aligned()
11590 const std::int32_t chunk_size = k * 3; in gemm_q8_1_2_6_aligned()
11591 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_2_6_aligned()
11592 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_2_6_aligned()
11596 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_2_6_aligned()
11597 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_2_6_aligned()
11598 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_2_6_aligned()
11599 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_2_6_aligned()
11602 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_2_6_aligned()
11604 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_2_6_aligned()
11605 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_2_6_aligned()
11606 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_2_6_aligned()
11609 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_2_6_aligned()
11610 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_2_6_aligned()
11655 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_1_2_7_aligned()
11656 std::int32_t n, std::int32_t k, in gemm_q8_1_2_7_aligned()
11657 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_1_2_7_aligned()
11658 std::int32_t result_offset, in gemm_q8_1_2_7_aligned()
11659 std::int32_t multiplicative_offset, in gemm_q8_1_2_7_aligned()
11660 std::int32_t shift, std::uint8_t* result, in gemm_q8_1_2_7_aligned()
11661 std::int32_t result_stride) { in gemm_q8_1_2_7_aligned()
11662 const std::int32_t row_chunks = m / 3; in gemm_q8_1_2_7_aligned()
11663 const std::int32_t col_chunks = n / 3; in gemm_q8_1_2_7_aligned()
11664 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_2_7_aligned()
11665 const std::int32_t chunk_size = k * 3; in gemm_q8_1_2_7_aligned()
11666 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_2_7_aligned()
11667 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_2_7_aligned()
11671 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_2_7_aligned()
11672 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_2_7_aligned()
11673 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_2_7_aligned()
11674 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_2_7_aligned()
11677 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_2_7_aligned()
11679 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_2_7_aligned()
11680 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_2_7_aligned()
11681 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_2_7_aligned()
11684 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_2_7_aligned()
11685 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_2_7_aligned()
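row_chunks = m / 3 and col_chunks = n / 3 reflect that these kernels tile the output in 3x3 blocks: three result rows are produced per pass, so the result pointer advances by result_chunk_stride = result_stride * 3 per row chunk, and the int32 scratch rows are spaced by mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8, i.e. one row of n int32 values rounded up to an 8-byte boundary. A rough, assumption-laden outline of the traversal this implies; the inner multiply kernel does not appear in this listing, so it is only a placeholder comment here.

  #include <cstdint>

  // Hypothetical outline of the chunked traversal; multiply_3x3() stands in
  // for the real inner kernel, which is not part of this listing.
  void gemm_chunked_outline(std::int32_t m, std::int32_t n,
                            std::uint8_t* result, std::int32_t result_stride) {
    const std::int32_t row_chunks = m / 3;
    const std::int32_t col_chunks = n / 3;
    const std::int32_t result_chunk_stride = result_stride * 3;
    std::uint8_t* result_chunk = result;
    for (std::int32_t i = 0; i < row_chunks; ++i) {
      for (std::int32_t j = 0; j < col_chunks; ++j) {
        // multiply_3x3(..., result_chunk + j * 3, ...);  // 3 rows x 3 cols of output
      }
      result_chunk += result_chunk_stride;  // advance three output rows
    }
  }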
11730 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_2_0_0_aligned()
11731 std::int32_t n, std::int32_t k, in gemm_q8_2_0_0_aligned()
11732 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_2_0_0_aligned()
11733 std::int32_t result_offset, in gemm_q8_2_0_0_aligned()
11734 std::int32_t multiplicative_offset, in gemm_q8_2_0_0_aligned()
11735 std::int32_t shift, std::uint8_t* result, in gemm_q8_2_0_0_aligned()
11736 std::int32_t result_stride) { in gemm_q8_2_0_0_aligned()
11737 const std::int32_t row_chunks = m / 3; in gemm_q8_2_0_0_aligned()
11738 const std::int32_t col_chunks = n / 3; in gemm_q8_2_0_0_aligned()
11739 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_0_0_aligned()
11740 const std::int32_t chunk_size = k * 3; in gemm_q8_2_0_0_aligned()
11741 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_0_0_aligned()
11742 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_0_0_aligned()
11746 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_0_0_aligned()
11747 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_0_0_aligned()
11748 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_0_0_aligned()
11749 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_0_0_aligned()
11752 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_0_0_aligned()
11754 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_0_0_aligned()
11755 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_0_0_aligned()
11756 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_0_0_aligned()
11759 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_0_0_aligned()
11760 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_0_0_aligned()
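The zipped_lhs_3_offsets, zipped_lhs_1_offsets and zipped_lhs_2_offsets pointers are all reinterpret_casts into the same zipped_lhs buffer at row boundaries (padded_k * 3, * 1, * 2 bytes in), which suggests the packing routine appends a block of std::int32_t per-row data, presumably the row sums needed for the zero-point correction, immediately after however many uint8 rows a given variant packs; the trailing digit in the pointer name tracks how many leftover lhs rows that variant handles. A small sketch of that assumed layout:

  #include <cstdint>

  // Assumed layout: `rows` packed rows of `padded_k` uint8 values, followed
  // immediately by `rows` int32 per-row offsets (e.g. row sums).
  std::int32_t* row_offsets_in_zipped_lhs(std::uint8_t* zipped_lhs,
                                          std::int32_t padded_k,
                                          std::int32_t rows) {
    // The int32 block starts right after the packed uint8 rows.
    return reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * rows);
  }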
11800 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_2_0_1_aligned()
11801 std::int32_t n, std::int32_t k, in gemm_q8_2_0_1_aligned()
11802 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_2_0_1_aligned()
11803 std::int32_t result_offset, in gemm_q8_2_0_1_aligned()
11804 std::int32_t multiplicative_offset, in gemm_q8_2_0_1_aligned()
11805 std::int32_t shift, std::uint8_t* result, in gemm_q8_2_0_1_aligned()
11806 std::int32_t result_stride) { in gemm_q8_2_0_1_aligned()
11807 const std::int32_t row_chunks = m / 3; in gemm_q8_2_0_1_aligned()
11808 const std::int32_t col_chunks = n / 3; in gemm_q8_2_0_1_aligned()
11809 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_0_1_aligned()
11810 const std::int32_t chunk_size = k * 3; in gemm_q8_2_0_1_aligned()
11811 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_0_1_aligned()
11812 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_0_1_aligned()
11816 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_0_1_aligned()
11817 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_0_1_aligned()
11818 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_0_1_aligned()
11819 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_0_1_aligned()
11822 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_0_1_aligned()
11824 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_0_1_aligned()
11825 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_0_1_aligned()
11826 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_0_1_aligned()
11829 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_0_1_aligned()
11830 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_0_1_aligned()
11870 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_2_0_2_aligned()
11871 std::int32_t n, std::int32_t k, in gemm_q8_2_0_2_aligned()
11872 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_2_0_2_aligned()
11873 std::int32_t result_offset, in gemm_q8_2_0_2_aligned()
11874 std::int32_t multiplicative_offset, in gemm_q8_2_0_2_aligned()
11875 std::int32_t shift, std::uint8_t* result, in gemm_q8_2_0_2_aligned()
11876 std::int32_t result_stride) { in gemm_q8_2_0_2_aligned()
11877 const std::int32_t row_chunks = m / 3; in gemm_q8_2_0_2_aligned()
11878 const std::int32_t col_chunks = n / 3; in gemm_q8_2_0_2_aligned()
11879 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_0_2_aligned()
11880 const std::int32_t chunk_size = k * 3; in gemm_q8_2_0_2_aligned()
11881 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_0_2_aligned()
11882 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_0_2_aligned()
11886 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_0_2_aligned()
11887 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_0_2_aligned()
11888 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_0_2_aligned()
11889 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_0_2_aligned()
11892 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_0_2_aligned()
11894 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_0_2_aligned()
11895 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_0_2_aligned()
11896 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_0_2_aligned()
11899 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_0_2_aligned()
11900 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_0_2_aligned()
11940 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_2_0_3_aligned()
11941 std::int32_t n, std::int32_t k, in gemm_q8_2_0_3_aligned()
11942 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_2_0_3_aligned()
11943 std::int32_t result_offset, in gemm_q8_2_0_3_aligned()
11944 std::int32_t multiplicative_offset, in gemm_q8_2_0_3_aligned()
11945 std::int32_t shift, std::uint8_t* result, in gemm_q8_2_0_3_aligned()
11946 std::int32_t result_stride) { in gemm_q8_2_0_3_aligned()
11947 const std::int32_t row_chunks = m / 3; in gemm_q8_2_0_3_aligned()
11948 const std::int32_t col_chunks = n / 3; in gemm_q8_2_0_3_aligned()
11949 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_0_3_aligned()
11950 const std::int32_t chunk_size = k * 3; in gemm_q8_2_0_3_aligned()
11951 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_0_3_aligned()
11952 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_0_3_aligned()
11956 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_0_3_aligned()
11957 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_0_3_aligned()
11958 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_0_3_aligned()
11959 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_0_3_aligned()
11962 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_0_3_aligned()
11964 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_0_3_aligned()
11965 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_0_3_aligned()
11966 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_0_3_aligned()
11969 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_0_3_aligned()
11970 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_0_3_aligned()
12010 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_2_0_4_aligned()
12011 std::int32_t n, std::int32_t k, in gemm_q8_2_0_4_aligned()
12012 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_2_0_4_aligned()
12013 std::int32_t result_offset, in gemm_q8_2_0_4_aligned()
12014 std::int32_t multiplicative_offset, in gemm_q8_2_0_4_aligned()
12015 std::int32_t shift, std::uint8_t* result, in gemm_q8_2_0_4_aligned()
12016 std::int32_t result_stride) { in gemm_q8_2_0_4_aligned()
12017 const std::int32_t row_chunks = m / 3; in gemm_q8_2_0_4_aligned()
12018 const std::int32_t col_chunks = n / 3; in gemm_q8_2_0_4_aligned()
12019 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_0_4_aligned()
12020 const std::int32_t chunk_size = k * 3; in gemm_q8_2_0_4_aligned()
12021 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_0_4_aligned()
12022 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_0_4_aligned()
12026 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_0_4_aligned()
12027 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_0_4_aligned()
12028 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_0_4_aligned()
12029 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_0_4_aligned()
12032 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_0_4_aligned()
12034 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_0_4_aligned()
12035 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_0_4_aligned()
12036 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_0_4_aligned()
12039 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_0_4_aligned()
12040 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_0_4_aligned()
12080 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_2_0_5_aligned()
12081 std::int32_t n, std::int32_t k, in gemm_q8_2_0_5_aligned()
12082 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_2_0_5_aligned()
12083 std::int32_t result_offset, in gemm_q8_2_0_5_aligned()
12084 std::int32_t multiplicative_offset, in gemm_q8_2_0_5_aligned()
12085 std::int32_t shift, std::uint8_t* result, in gemm_q8_2_0_5_aligned()
12086 std::int32_t result_stride) { in gemm_q8_2_0_5_aligned()
12087 const std::int32_t row_chunks = m / 3; in gemm_q8_2_0_5_aligned()
12088 const std::int32_t col_chunks = n / 3; in gemm_q8_2_0_5_aligned()
12089 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_0_5_aligned()
12090 const std::int32_t chunk_size = k * 3; in gemm_q8_2_0_5_aligned()
12091 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_0_5_aligned()
12092 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_0_5_aligned()
12096 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_0_5_aligned()
12097 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_0_5_aligned()
12098 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_0_5_aligned()
12099 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_0_5_aligned()
12102 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_0_5_aligned()
12104 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_0_5_aligned()
12105 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_0_5_aligned()
12106 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_0_5_aligned()
12109 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_0_5_aligned()
12110 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_0_5_aligned()
12150 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_2_0_6_aligned()
12151 std::int32_t n, std::int32_t k, in gemm_q8_2_0_6_aligned()
12152 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_2_0_6_aligned()
12153 std::int32_t result_offset, in gemm_q8_2_0_6_aligned()
12154 std::int32_t multiplicative_offset, in gemm_q8_2_0_6_aligned()
12155 std::int32_t shift, std::uint8_t* result, in gemm_q8_2_0_6_aligned()
12156 std::int32_t result_stride) { in gemm_q8_2_0_6_aligned()
12157 const std::int32_t row_chunks = m / 3; in gemm_q8_2_0_6_aligned()
12158 const std::int32_t col_chunks = n / 3; in gemm_q8_2_0_6_aligned()
12159 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_0_6_aligned()
12160 const std::int32_t chunk_size = k * 3; in gemm_q8_2_0_6_aligned()
12161 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_0_6_aligned()
12162 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_0_6_aligned()
12166 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_0_6_aligned()
12167 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_0_6_aligned()
12168 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_0_6_aligned()
12169 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_0_6_aligned()
12172 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_0_6_aligned()
12174 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_0_6_aligned()
12175 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_0_6_aligned()
12176 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_0_6_aligned()
12179 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_0_6_aligned()
12180 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_0_6_aligned()
12220 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_2_0_7_aligned()
12221 std::int32_t n, std::int32_t k, in gemm_q8_2_0_7_aligned()
12222 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_2_0_7_aligned()
12223 std::int32_t result_offset, in gemm_q8_2_0_7_aligned()
12224 std::int32_t multiplicative_offset, in gemm_q8_2_0_7_aligned()
12225 std::int32_t shift, std::uint8_t* result, in gemm_q8_2_0_7_aligned()
12226 std::int32_t result_stride) { in gemm_q8_2_0_7_aligned()
12227 const std::int32_t row_chunks = m / 3; in gemm_q8_2_0_7_aligned()
12228 const std::int32_t col_chunks = n / 3; in gemm_q8_2_0_7_aligned()
12229 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_0_7_aligned()
12230 const std::int32_t chunk_size = k * 3; in gemm_q8_2_0_7_aligned()
12231 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_0_7_aligned()
12232 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_0_7_aligned()
12236 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_0_7_aligned()
12237 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_0_7_aligned()
12238 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_0_7_aligned()
12239 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_0_7_aligned()
12242 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_0_7_aligned()
12244 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_0_7_aligned()
12245 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_0_7_aligned()
12246 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_0_7_aligned()
12249 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_0_7_aligned()
12250 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_0_7_aligned()
12290 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_2_1_0_aligned()
12291 std::int32_t n, std::int32_t k, in gemm_q8_2_1_0_aligned()
12292 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_2_1_0_aligned()
12293 std::int32_t result_offset, in gemm_q8_2_1_0_aligned()
12294 std::int32_t multiplicative_offset, in gemm_q8_2_1_0_aligned()
12295 std::int32_t shift, std::uint8_t* result, in gemm_q8_2_1_0_aligned()
12296 std::int32_t result_stride) { in gemm_q8_2_1_0_aligned()
12297 const std::int32_t row_chunks = m / 3; in gemm_q8_2_1_0_aligned()
12298 const std::int32_t col_chunks = n / 3; in gemm_q8_2_1_0_aligned()
12299 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_1_0_aligned()
12300 const std::int32_t chunk_size = k * 3; in gemm_q8_2_1_0_aligned()
12301 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_1_0_aligned()
12302 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_1_0_aligned()
12306 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_1_0_aligned()
12307 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_1_0_aligned()
12308 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_1_0_aligned()
12309 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_1_0_aligned()
12312 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_1_0_aligned()
12314 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_1_0_aligned()
12315 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_1_0_aligned()
12316 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_1_0_aligned()
12319 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_1_0_aligned()
12320 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_1_0_aligned()
12365 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_2_1_1_aligned()
12366 std::int32_t n, std::int32_t k, in gemm_q8_2_1_1_aligned()
12367 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_2_1_1_aligned()
12368 std::int32_t result_offset, in gemm_q8_2_1_1_aligned()
12369 std::int32_t multiplicative_offset, in gemm_q8_2_1_1_aligned()
12370 std::int32_t shift, std::uint8_t* result, in gemm_q8_2_1_1_aligned()
12371 std::int32_t result_stride) { in gemm_q8_2_1_1_aligned()
12372 const std::int32_t row_chunks = m / 3; in gemm_q8_2_1_1_aligned()
12373 const std::int32_t col_chunks = n / 3; in gemm_q8_2_1_1_aligned()
12374 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_1_1_aligned()
12375 const std::int32_t chunk_size = k * 3; in gemm_q8_2_1_1_aligned()
12376 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_1_1_aligned()
12377 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_1_1_aligned()
12381 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_1_1_aligned()
12382 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_1_1_aligned()
12383 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_1_1_aligned()
12384 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_1_1_aligned()
12387 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_1_1_aligned()
12389 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_1_1_aligned()
12390 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_1_1_aligned()
12391 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_1_1_aligned()
12394 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_1_1_aligned()
12395 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_1_1_aligned()
12440 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_2_1_2_aligned()
12441 std::int32_t n, std::int32_t k, in gemm_q8_2_1_2_aligned()
12442 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_2_1_2_aligned()
12443 std::int32_t result_offset, in gemm_q8_2_1_2_aligned()
12444 std::int32_t multiplicative_offset, in gemm_q8_2_1_2_aligned()
12445 std::int32_t shift, std::uint8_t* result, in gemm_q8_2_1_2_aligned()
12446 std::int32_t result_stride) { in gemm_q8_2_1_2_aligned()
12447 const std::int32_t row_chunks = m / 3; in gemm_q8_2_1_2_aligned()
12448 const std::int32_t col_chunks = n / 3; in gemm_q8_2_1_2_aligned()
12449 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_1_2_aligned()
12450 const std::int32_t chunk_size = k * 3; in gemm_q8_2_1_2_aligned()
12451 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_1_2_aligned()
12452 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_1_2_aligned()
12456 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_1_2_aligned()
12457 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_1_2_aligned()
12458 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_1_2_aligned()
12459 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_1_2_aligned()
12462 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_1_2_aligned()
12464 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_1_2_aligned()
12465 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_1_2_aligned()
12466 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_1_2_aligned()
12469 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_1_2_aligned()
12470 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_1_2_aligned()
12515 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_2_1_3_aligned()
12516 std::int32_t n, std::int32_t k, in gemm_q8_2_1_3_aligned()
12517 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_2_1_3_aligned()
12518 std::int32_t result_offset, in gemm_q8_2_1_3_aligned()
12519 std::int32_t multiplicative_offset, in gemm_q8_2_1_3_aligned()
12520 std::int32_t shift, std::uint8_t* result, in gemm_q8_2_1_3_aligned()
12521 std::int32_t result_stride) { in gemm_q8_2_1_3_aligned()
12522 const std::int32_t row_chunks = m / 3; in gemm_q8_2_1_3_aligned()
12523 const std::int32_t col_chunks = n / 3; in gemm_q8_2_1_3_aligned()
12524 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_1_3_aligned()
12525 const std::int32_t chunk_size = k * 3; in gemm_q8_2_1_3_aligned()
12526 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_1_3_aligned()
12527 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_1_3_aligned()
12531 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_1_3_aligned()
12532 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_1_3_aligned()
12533 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_1_3_aligned()
12534 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_1_3_aligned()
12537 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_1_3_aligned()
12539 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_1_3_aligned()
12540 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_1_3_aligned()
12541 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_1_3_aligned()
12544 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_1_3_aligned()
12545 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_1_3_aligned()
12590 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_2_1_4_aligned()
12591 std::int32_t n, std::int32_t k, in gemm_q8_2_1_4_aligned()
12592 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_2_1_4_aligned()
12593 std::int32_t result_offset, in gemm_q8_2_1_4_aligned()
12594 std::int32_t multiplicative_offset, in gemm_q8_2_1_4_aligned()
12595 std::int32_t shift, std::uint8_t* result, in gemm_q8_2_1_4_aligned()
12596 std::int32_t result_stride) { in gemm_q8_2_1_4_aligned()
12597 const std::int32_t row_chunks = m / 3; in gemm_q8_2_1_4_aligned()
12598 const std::int32_t col_chunks = n / 3; in gemm_q8_2_1_4_aligned()
12599 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_1_4_aligned()
12600 const std::int32_t chunk_size = k * 3; in gemm_q8_2_1_4_aligned()
12601 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_1_4_aligned()
12602 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_1_4_aligned()
12606 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_1_4_aligned()
12607 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_1_4_aligned()
12608 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_1_4_aligned()
12609 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_1_4_aligned()
12612 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_1_4_aligned()
12614 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_1_4_aligned()
12615 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_1_4_aligned()
12616 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_1_4_aligned()
12619 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_1_4_aligned()
12620 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_1_4_aligned()
12665 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_2_1_5_aligned()
12666 std::int32_t n, std::int32_t k, in gemm_q8_2_1_5_aligned()
12667 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_2_1_5_aligned()
12668 std::int32_t result_offset, in gemm_q8_2_1_5_aligned()
12669 std::int32_t multiplicative_offset, in gemm_q8_2_1_5_aligned()
12670 std::int32_t shift, std::uint8_t* result, in gemm_q8_2_1_5_aligned()
12671 std::int32_t result_stride) { in gemm_q8_2_1_5_aligned()
12672 const std::int32_t row_chunks = m / 3; in gemm_q8_2_1_5_aligned()
12673 const std::int32_t col_chunks = n / 3; in gemm_q8_2_1_5_aligned()
12674 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_1_5_aligned()
12675 const std::int32_t chunk_size = k * 3; in gemm_q8_2_1_5_aligned()
12676 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_1_5_aligned()
12677 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_1_5_aligned()
12681 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_1_5_aligned()
12682 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_1_5_aligned()
12683 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_1_5_aligned()
12684 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_1_5_aligned()
12687 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_1_5_aligned()
12689 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_1_5_aligned()
12690 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_1_5_aligned()
12691 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_1_5_aligned()
12694 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_1_5_aligned()
12695 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_1_5_aligned()
12740 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_2_1_6_aligned()
12741 std::int32_t n, std::int32_t k, in gemm_q8_2_1_6_aligned()
12742 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_2_1_6_aligned()
12743 std::int32_t result_offset, in gemm_q8_2_1_6_aligned()
12744 std::int32_t multiplicative_offset, in gemm_q8_2_1_6_aligned()
12745 std::int32_t shift, std::uint8_t* result, in gemm_q8_2_1_6_aligned()
12746 std::int32_t result_stride) { in gemm_q8_2_1_6_aligned()
12747 const std::int32_t row_chunks = m / 3; in gemm_q8_2_1_6_aligned()
12748 const std::int32_t col_chunks = n / 3; in gemm_q8_2_1_6_aligned()
12749 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_1_6_aligned()
12750 const std::int32_t chunk_size = k * 3; in gemm_q8_2_1_6_aligned()
12751 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_1_6_aligned()
12752 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_1_6_aligned()
12756 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_1_6_aligned()
12757 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_1_6_aligned()
12758 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_1_6_aligned()
12759 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_1_6_aligned()
12762 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_1_6_aligned()
12764 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_1_6_aligned()
12765 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_1_6_aligned()
12766 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_1_6_aligned()
12769 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_1_6_aligned()
12770 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_1_6_aligned()
12815 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_2_1_7_aligned()
12816 std::int32_t n, std::int32_t k, in gemm_q8_2_1_7_aligned()
12817 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_2_1_7_aligned()
12818 std::int32_t result_offset, in gemm_q8_2_1_7_aligned()
12819 std::int32_t multiplicative_offset, in gemm_q8_2_1_7_aligned()
12820 std::int32_t shift, std::uint8_t* result, in gemm_q8_2_1_7_aligned()
12821 std::int32_t result_stride) { in gemm_q8_2_1_7_aligned()
12822 const std::int32_t row_chunks = m / 3; in gemm_q8_2_1_7_aligned()
12823 const std::int32_t col_chunks = n / 3; in gemm_q8_2_1_7_aligned()
12824 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_1_7_aligned()
12825 const std::int32_t chunk_size = k * 3; in gemm_q8_2_1_7_aligned()
12826 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_1_7_aligned()
12827 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_1_7_aligned()
12831 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_1_7_aligned()
12832 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_1_7_aligned()
12833 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_1_7_aligned()
12834 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_1_7_aligned()
12837 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_1_7_aligned()
12839 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_1_7_aligned()
12840 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_1_7_aligned()
12841 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_1_7_aligned()
12844 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_1_7_aligned()
12845 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_1_7_aligned()
12890 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_2_2_0_aligned()
12891 std::int32_t n, std::int32_t k, in gemm_q8_2_2_0_aligned()
12892 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_2_2_0_aligned()
12893 std::int32_t result_offset, in gemm_q8_2_2_0_aligned()
12894 std::int32_t multiplicative_offset, in gemm_q8_2_2_0_aligned()
12895 std::int32_t shift, std::uint8_t* result, in gemm_q8_2_2_0_aligned()
12896 std::int32_t result_stride) { in gemm_q8_2_2_0_aligned()
12897 const std::int32_t row_chunks = m / 3; in gemm_q8_2_2_0_aligned()
12898 const std::int32_t col_chunks = n / 3; in gemm_q8_2_2_0_aligned()
12899 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_2_0_aligned()
12900 const std::int32_t chunk_size = k * 3; in gemm_q8_2_2_0_aligned()
12901 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_2_0_aligned()
12902 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_2_0_aligned()
12906 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_2_0_aligned()
12907 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_2_0_aligned()
12908 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_2_0_aligned()
12909 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_2_0_aligned()
12912 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_2_0_aligned()
12914 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_2_0_aligned()
12915 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_2_0_aligned()
12916 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_2_0_aligned()
12919 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_2_0_aligned()
12920 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_2_0_aligned()
12965 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_2_2_1_aligned()
12966 std::int32_t n, std::int32_t k, in gemm_q8_2_2_1_aligned()
12967 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_2_2_1_aligned()
12968 std::int32_t result_offset, in gemm_q8_2_2_1_aligned()
12969 std::int32_t multiplicative_offset, in gemm_q8_2_2_1_aligned()
12970 std::int32_t shift, std::uint8_t* result, in gemm_q8_2_2_1_aligned()
12971 std::int32_t result_stride) { in gemm_q8_2_2_1_aligned()
12972 const std::int32_t row_chunks = m / 3; in gemm_q8_2_2_1_aligned()
12973 const std::int32_t col_chunks = n / 3; in gemm_q8_2_2_1_aligned()
12974 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_2_1_aligned()
12975 const std::int32_t chunk_size = k * 3; in gemm_q8_2_2_1_aligned()
12976 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_2_1_aligned()
12977 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_2_1_aligned()
12981 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_2_1_aligned()
12982 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_2_1_aligned()
12983 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_2_1_aligned()
12984 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_2_1_aligned()
12987 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_2_1_aligned()
12989 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_2_1_aligned()
12990 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_2_1_aligned()
12991 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_2_1_aligned()
12994 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_2_1_aligned()
12995 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_2_1_aligned()
13040 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_2_2_2_aligned()
13041 std::int32_t n, std::int32_t k, in gemm_q8_2_2_2_aligned()
13042 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_2_2_2_aligned()
13043 std::int32_t result_offset, in gemm_q8_2_2_2_aligned()
13044 std::int32_t multiplicative_offset, in gemm_q8_2_2_2_aligned()
13045 std::int32_t shift, std::uint8_t* result, in gemm_q8_2_2_2_aligned()
13046 std::int32_t result_stride) { in gemm_q8_2_2_2_aligned()
13047 const std::int32_t row_chunks = m / 3; in gemm_q8_2_2_2_aligned()
13048 const std::int32_t col_chunks = n / 3; in gemm_q8_2_2_2_aligned()
13049 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_2_2_aligned()
13050 const std::int32_t chunk_size = k * 3; in gemm_q8_2_2_2_aligned()
13051 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_2_2_aligned()
13052 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_2_2_aligned()
13056 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_2_2_aligned()
13057 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_2_2_aligned()
13058 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_2_2_aligned()
13059 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_2_2_aligned()
13062 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_2_2_aligned()
13064 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_2_2_aligned()
13065 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_2_2_aligned()
13066 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_2_2_aligned()
13069 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_2_2_aligned()
13070 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_2_2_aligned()
13115 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_2_2_3_aligned()
13116 std::int32_t n, std::int32_t k, in gemm_q8_2_2_3_aligned()
13117 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_2_2_3_aligned()
13118 std::int32_t result_offset, in gemm_q8_2_2_3_aligned()
13119 std::int32_t multiplicative_offset, in gemm_q8_2_2_3_aligned()
13120 std::int32_t shift, std::uint8_t* result, in gemm_q8_2_2_3_aligned()
13121 std::int32_t result_stride) { in gemm_q8_2_2_3_aligned()
13122 const std::int32_t row_chunks = m / 3; in gemm_q8_2_2_3_aligned()
13123 const std::int32_t col_chunks = n / 3; in gemm_q8_2_2_3_aligned()
13124 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_2_3_aligned()
13125 const std::int32_t chunk_size = k * 3; in gemm_q8_2_2_3_aligned()
13126 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_2_3_aligned()
13127 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_2_3_aligned()
13131 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_2_3_aligned()
13132 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_2_3_aligned()
13133 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_2_3_aligned()
13134 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_2_3_aligned()
13137 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_2_3_aligned()
13139 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_2_3_aligned()
13140 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_2_3_aligned()
13141 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_2_3_aligned()
13144 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_2_3_aligned()
13145 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_2_3_aligned()
13190 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_2_2_4_aligned()
13191 std::int32_t n, std::int32_t k, in gemm_q8_2_2_4_aligned()
13192 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_2_2_4_aligned()
13193 std::int32_t result_offset, in gemm_q8_2_2_4_aligned()
13194 std::int32_t multiplicative_offset, in gemm_q8_2_2_4_aligned()
13195 std::int32_t shift, std::uint8_t* result, in gemm_q8_2_2_4_aligned()
13196 std::int32_t result_stride) { in gemm_q8_2_2_4_aligned()
13197 const std::int32_t row_chunks = m / 3; in gemm_q8_2_2_4_aligned()
13198 const std::int32_t col_chunks = n / 3; in gemm_q8_2_2_4_aligned()
13199 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_2_4_aligned()
13200 const std::int32_t chunk_size = k * 3; in gemm_q8_2_2_4_aligned()
13201 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_2_4_aligned()
13202 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_2_4_aligned()
13206 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_2_4_aligned()
13207 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_2_4_aligned()
13208 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_2_4_aligned()
13209 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_2_4_aligned()
13212 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_2_4_aligned()
13214 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_2_4_aligned()
13215 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_2_4_aligned()
13216 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_2_4_aligned()
13219 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_2_4_aligned()
13220 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_2_4_aligned()
13265 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_2_2_5_aligned()
13266 std::int32_t n, std::int32_t k, in gemm_q8_2_2_5_aligned()
13267 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_2_2_5_aligned()
13268 std::int32_t result_offset, in gemm_q8_2_2_5_aligned()
13269 std::int32_t multiplicative_offset, in gemm_q8_2_2_5_aligned()
13270 std::int32_t shift, std::uint8_t* result, in gemm_q8_2_2_5_aligned()
13271 std::int32_t result_stride) { in gemm_q8_2_2_5_aligned()
13272 const std::int32_t row_chunks = m / 3; in gemm_q8_2_2_5_aligned()
13273 const std::int32_t col_chunks = n / 3; in gemm_q8_2_2_5_aligned()
13274 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_2_5_aligned()
13275 const std::int32_t chunk_size = k * 3; in gemm_q8_2_2_5_aligned()
13276 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_2_5_aligned()
13277 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_2_5_aligned()
13281 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_2_5_aligned()
13282 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_2_5_aligned()
13283 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_2_5_aligned()
13284 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_2_5_aligned()
13287 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_2_5_aligned()
13289 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_2_5_aligned()
13290 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_2_5_aligned()
13291 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_2_5_aligned()
13294 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_2_5_aligned()
13295 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_2_5_aligned()
13340 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_2_2_6_aligned()
13341 std::int32_t n, std::int32_t k, in gemm_q8_2_2_6_aligned()
13342 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_2_2_6_aligned()
13343 std::int32_t result_offset, in gemm_q8_2_2_6_aligned()
13344 std::int32_t multiplicative_offset, in gemm_q8_2_2_6_aligned()
13345 std::int32_t shift, std::uint8_t* result, in gemm_q8_2_2_6_aligned()
13346 std::int32_t result_stride) { in gemm_q8_2_2_6_aligned()
13347 const std::int32_t row_chunks = m / 3; in gemm_q8_2_2_6_aligned()
13348 const std::int32_t col_chunks = n / 3; in gemm_q8_2_2_6_aligned()
13349 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_2_6_aligned()
13350 const std::int32_t chunk_size = k * 3; in gemm_q8_2_2_6_aligned()
13351 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_2_6_aligned()
13352 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_2_6_aligned()
13356 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_2_6_aligned()
13357 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_2_6_aligned()
13358 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_2_6_aligned()
13359 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_2_6_aligned()
13362 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_2_6_aligned()
13364 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_2_6_aligned()
13365 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_2_6_aligned()
13366 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_2_6_aligned()
13369 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_2_6_aligned()
13370 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_2_6_aligned()
13415 const std::uint8_t* rhs, std::int32_t m, in gemm_q8_2_2_7_aligned()
13416 std::int32_t n, std::int32_t k, in gemm_q8_2_2_7_aligned()
13417 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8_2_2_7_aligned()
13418 std::int32_t result_offset, in gemm_q8_2_2_7_aligned()
13419 std::int32_t multiplicative_offset, in gemm_q8_2_2_7_aligned()
13420 std::int32_t shift, std::uint8_t* result, in gemm_q8_2_2_7_aligned()
13421 std::int32_t result_stride) { in gemm_q8_2_2_7_aligned()
13422 const std::int32_t row_chunks = m / 3; in gemm_q8_2_2_7_aligned()
13423 const std::int32_t col_chunks = n / 3; in gemm_q8_2_2_7_aligned()
13424 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_2_7_aligned()
13425 const std::int32_t chunk_size = k * 3; in gemm_q8_2_2_7_aligned()
13426 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_2_7_aligned()
13427 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_2_7_aligned()
13431 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_2_7_aligned()
13432 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_2_7_aligned()
13433 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_2_7_aligned()
13434 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_2_7_aligned()
13437 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_2_7_aligned()
13439 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_2_7_aligned()
13440 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_2_7_aligned()
13441 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_2_7_aligned()
13444 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_2_7_aligned()
13445 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_2_7_aligned()
13490 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_0_0_0()
13491 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_0_0_0()
13492 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_0_0_0()
13493 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_0_0_0()
13494 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_0_0_0()
13495 const std::int32_t row_chunks = m / 3; in gemm_q8_0_0_0()
13496 const std::int32_t col_chunks = n / 3; in gemm_q8_0_0_0()
13497 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_0_0()
13498 const std::int32_t chunk_size = k * 3; in gemm_q8_0_0_0()
13499 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_0_0()
13500 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_0_0()
13504 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_0_0()
13505 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_0_0()
13508 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_0_0()
13510 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_0_0()
13511 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_0_0()
13512 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_0_0()
13515 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_0_0()
13516 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_0_0()
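gemm_q8_0_0_0 is the first variant without the _aligned suffix; from here the listing repeats the same setup for the unaligned family. The three digits in each name appear to encode the leftover sizes the 3x3x8 blocking must clean up, most plausibly m % 3, n % 3 and k % 8, which would account for 3 * 3 * 8 = 72 variants per family. Under that unconfirmed reading, a dispatcher would select the specialization roughly as sketched below; the call itself is left as a comment because the real entry point is not shown in this listing.

  #include <cstdint>

  // Hypothetical dispatch by leftover sizes; gemm_q8_<m%3>_<n%3>_<k%8>() is
  // how the variant names appear to be formed in this listing.
  void gemm_q8_dispatch_example(std::int32_t m, std::int32_t n, std::int32_t k) {
    const std::int32_t m_leftover = m % 3;
    const std::int32_t n_leftover = n % 3;
    const std::int32_t k_leftover = k % 8;
    // e.g. m_leftover == 0, n_leftover == 0, k_leftover == 1  ->  gemm_q8_0_0_1(...)
    (void)m_leftover; (void)n_leftover; (void)k_leftover;
  }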
13543 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_0_0_1()
13544 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_0_0_1()
13545 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_0_0_1()
13546 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_0_0_1()
13547 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_0_0_1()
13548 const std::int32_t row_chunks = m / 3; in gemm_q8_0_0_1()
13549 const std::int32_t col_chunks = n / 3; in gemm_q8_0_0_1()
13550 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_0_1()
13551 const std::int32_t chunk_size = k * 3; in gemm_q8_0_0_1()
13552 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_0_1()
13553 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_0_1()
13557 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_0_1()
13558 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_0_1()
13561 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_0_1()
13563 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_0_1()
13564 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_0_1()
13565 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_0_1()
13568 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_0_1()
13569 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_0_1()
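Note: in each listed function, temp_result is an int32 scratch buffer and mul_result_chunk walks it with mul_result_chunk_stride_bytes, a stride expressed in bytes rather than elements. A sketch of how a byte-denominated stride would typically be applied (the helper is ours; the matched lines only show the constant setup):

    #include <cstdint>

    // Advance a row pointer through an int32 buffer whose rows are spaced by a
    // byte count (already rounded up to a multiple of 8 above).
    inline std::int32_t* next_result_row(std::int32_t* row,
                                         std::int32_t stride_bytes) {
      return reinterpret_cast<std::int32_t*>(
          reinterpret_cast<std::uint8_t*>(row) + stride_bytes);
    }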
13596 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_0_0_2()
13597 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_0_0_2()
13598 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_0_0_2()
13599 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_0_0_2()
13600 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_0_0_2()
13601 const std::int32_t row_chunks = m / 3; in gemm_q8_0_0_2()
13602 const std::int32_t col_chunks = n / 3; in gemm_q8_0_0_2()
13603 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_0_2()
13604 const std::int32_t chunk_size = k * 3; in gemm_q8_0_0_2()
13605 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_0_2()
13606 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_0_2()
13610 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_0_2()
13611 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_0_2()
13614 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_0_2()
13616 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_0_2()
13617 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_0_2()
13618 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_0_2()
13621 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_0_2()
13622 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_0_2()
13649 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_0_0_3()
13650 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_0_0_3()
13651 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_0_0_3()
13652 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_0_0_3()
13653 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_0_0_3()
13654 const std::int32_t row_chunks = m / 3; in gemm_q8_0_0_3()
13655 const std::int32_t col_chunks = n / 3; in gemm_q8_0_0_3()
13656 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_0_3()
13657 const std::int32_t chunk_size = k * 3; in gemm_q8_0_0_3()
13658 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_0_3()
13659 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_0_3()
13663 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_0_3()
13664 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_0_3()
13667 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_0_3()
13669 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_0_3()
13670 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_0_3()
13671 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_0_3()
13674 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_0_3()
13675 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_0_3()
13702 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_0_0_4()
13703 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_0_0_4()
13704 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_0_0_4()
13705 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_0_0_4()
13706 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_0_0_4()
13707 const std::int32_t row_chunks = m / 3; in gemm_q8_0_0_4()
13708 const std::int32_t col_chunks = n / 3; in gemm_q8_0_0_4()
13709 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_0_4()
13710 const std::int32_t chunk_size = k * 3; in gemm_q8_0_0_4()
13711 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_0_4()
13712 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_0_4()
13716 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_0_4()
13717 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_0_4()
13720 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_0_4()
13722 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_0_4()
13723 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_0_4()
13724 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_0_4()
13727 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_0_4()
13728 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_0_4()
13755 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_0_0_5()
13756 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_0_0_5()
13757 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_0_0_5()
13758 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_0_0_5()
13759 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_0_0_5()
13760 const std::int32_t row_chunks = m / 3; in gemm_q8_0_0_5()
13761 const std::int32_t col_chunks = n / 3; in gemm_q8_0_0_5()
13762 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_0_5()
13763 const std::int32_t chunk_size = k * 3; in gemm_q8_0_0_5()
13764 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_0_5()
13765 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_0_5()
13769 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_0_5()
13770 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_0_5()
13773 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_0_5()
13775 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_0_5()
13776 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_0_5()
13777 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_0_5()
13780 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_0_5()
13781 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_0_5()
13808 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_0_0_6()
13809 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_0_0_6()
13810 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_0_0_6()
13811 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_0_0_6()
13812 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_0_0_6()
13813 const std::int32_t row_chunks = m / 3; in gemm_q8_0_0_6()
13814 const std::int32_t col_chunks = n / 3; in gemm_q8_0_0_6()
13815 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_0_6()
13816 const std::int32_t chunk_size = k * 3; in gemm_q8_0_0_6()
13817 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_0_6()
13818 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_0_6()
13822 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_0_6()
13823 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_0_6()
13826 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_0_6()
13828 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_0_6()
13829 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_0_6()
13830 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_0_6()
13833 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_0_6()
13834 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_0_6()
13861 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_0_0_7()
13862 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_0_0_7()
13863 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_0_0_7()
13864 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_0_0_7()
13865 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_0_0_7()
13866 const std::int32_t row_chunks = m / 3; in gemm_q8_0_0_7()
13867 const std::int32_t col_chunks = n / 3; in gemm_q8_0_0_7()
13868 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_0_7()
13869 const std::int32_t chunk_size = k * 3; in gemm_q8_0_0_7()
13870 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_0_7()
13871 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_0_7()
13875 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_0_7()
13876 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_0_7()
13879 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_0_7()
13881 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_0_7()
13882 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_0_7()
13883 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_0_7()
13886 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_0_7()
13887 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_0_7()
13914 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_0_1_0()
13915 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_0_1_0()
13916 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_0_1_0()
13917 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_0_1_0()
13918 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_0_1_0()
13919 const std::int32_t row_chunks = m / 3; in gemm_q8_0_1_0()
13920 const std::int32_t col_chunks = n / 3; in gemm_q8_0_1_0()
13921 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_1_0()
13922 const std::int32_t chunk_size = k * 3; in gemm_q8_0_1_0()
13923 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_1_0()
13924 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_1_0()
13928 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_1_0()
13929 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_1_0()
13932 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_1_0()
13934 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_1_0()
13935 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_1_0()
13936 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_1_0()
13939 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_1_0()
13940 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_1_0()
13970 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_0_1_1()
13971 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_0_1_1()
13972 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_0_1_1()
13973 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_0_1_1()
13974 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_0_1_1()
13975 const std::int32_t row_chunks = m / 3; in gemm_q8_0_1_1()
13976 const std::int32_t col_chunks = n / 3; in gemm_q8_0_1_1()
13977 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_1_1()
13978 const std::int32_t chunk_size = k * 3; in gemm_q8_0_1_1()
13979 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_1_1()
13980 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_1_1()
13984 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_1_1()
13985 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_1_1()
13988 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_1_1()
13990 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_1_1()
13991 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_1_1()
13992 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_1_1()
13995 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_1_1()
13996 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_1_1()
14026 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_0_1_2()
14027 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_0_1_2()
14028 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_0_1_2()
14029 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_0_1_2()
14030 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_0_1_2()
14031 const std::int32_t row_chunks = m / 3; in gemm_q8_0_1_2()
14032 const std::int32_t col_chunks = n / 3; in gemm_q8_0_1_2()
14033 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_1_2()
14034 const std::int32_t chunk_size = k * 3; in gemm_q8_0_1_2()
14035 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_1_2()
14036 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_1_2()
14040 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_1_2()
14041 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_1_2()
14044 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_1_2()
14046 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_1_2()
14047 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_1_2()
14048 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_1_2()
14051 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_1_2()
14052 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_1_2()
14082 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_0_1_3()
14083 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_0_1_3()
14084 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_0_1_3()
14085 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_0_1_3()
14086 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_0_1_3()
14087 const std::int32_t row_chunks = m / 3; in gemm_q8_0_1_3()
14088 const std::int32_t col_chunks = n / 3; in gemm_q8_0_1_3()
14089 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_1_3()
14090 const std::int32_t chunk_size = k * 3; in gemm_q8_0_1_3()
14091 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_1_3()
14092 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_1_3()
14096 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_1_3()
14097 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_1_3()
14100 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_1_3()
14102 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_1_3()
14103 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_1_3()
14104 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_1_3()
14107 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_1_3()
14108 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_1_3()
14138 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_0_1_4()
14139 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_0_1_4()
14140 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_0_1_4()
14141 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_0_1_4()
14142 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_0_1_4()
14143 const std::int32_t row_chunks = m / 3; in gemm_q8_0_1_4()
14144 const std::int32_t col_chunks = n / 3; in gemm_q8_0_1_4()
14145 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_1_4()
14146 const std::int32_t chunk_size = k * 3; in gemm_q8_0_1_4()
14147 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_1_4()
14148 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_1_4()
14152 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_1_4()
14153 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_1_4()
14156 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_1_4()
14158 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_1_4()
14159 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_1_4()
14160 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_1_4()
14163 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_1_4()
14164 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_1_4()
14194 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_0_1_5()
14195 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_0_1_5()
14196 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_0_1_5()
14197 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_0_1_5()
14198 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_0_1_5()
14199 const std::int32_t row_chunks = m / 3; in gemm_q8_0_1_5()
14200 const std::int32_t col_chunks = n / 3; in gemm_q8_0_1_5()
14201 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_1_5()
14202 const std::int32_t chunk_size = k * 3; in gemm_q8_0_1_5()
14203 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_1_5()
14204 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_1_5()
14208 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_1_5()
14209 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_1_5()
14212 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_1_5()
14214 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_1_5()
14215 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_1_5()
14216 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_1_5()
14219 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_1_5()
14220 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_1_5()
14250 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_0_1_6()
14251 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_0_1_6()
14252 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_0_1_6()
14253 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_0_1_6()
14254 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_0_1_6()
14255 const std::int32_t row_chunks = m / 3; in gemm_q8_0_1_6()
14256 const std::int32_t col_chunks = n / 3; in gemm_q8_0_1_6()
14257 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_1_6()
14258 const std::int32_t chunk_size = k * 3; in gemm_q8_0_1_6()
14259 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_1_6()
14260 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_1_6()
14264 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_1_6()
14265 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_1_6()
14268 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_1_6()
14270 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_1_6()
14271 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_1_6()
14272 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_1_6()
14275 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_1_6()
14276 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_1_6()
14306 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_0_1_7()
14307 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_0_1_7()
14308 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_0_1_7()
14309 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_0_1_7()
14310 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_0_1_7()
14311 const std::int32_t row_chunks = m / 3; in gemm_q8_0_1_7()
14312 const std::int32_t col_chunks = n / 3; in gemm_q8_0_1_7()
14313 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_1_7()
14314 const std::int32_t chunk_size = k * 3; in gemm_q8_0_1_7()
14315 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_1_7()
14316 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_1_7()
14320 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_1_7()
14321 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_1_7()
14324 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_1_7()
14326 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_1_7()
14327 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_1_7()
14328 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_1_7()
14331 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_1_7()
14332 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_1_7()
14362 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_0_2_0()
14363 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_0_2_0()
14364 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_0_2_0()
14365 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_0_2_0()
14366 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_0_2_0()
14367 const std::int32_t row_chunks = m / 3; in gemm_q8_0_2_0()
14368 const std::int32_t col_chunks = n / 3; in gemm_q8_0_2_0()
14369 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_2_0()
14370 const std::int32_t chunk_size = k * 3; in gemm_q8_0_2_0()
14371 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_2_0()
14372 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_2_0()
14376 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_2_0()
14377 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_2_0()
14380 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_2_0()
14382 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_2_0()
14383 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_2_0()
14384 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_2_0()
14387 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_2_0()
14388 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_2_0()
14418 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_0_2_1()
14419 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_0_2_1()
14420 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_0_2_1()
14421 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_0_2_1()
14422 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_0_2_1()
14423 const std::int32_t row_chunks = m / 3; in gemm_q8_0_2_1()
14424 const std::int32_t col_chunks = n / 3; in gemm_q8_0_2_1()
14425 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_2_1()
14426 const std::int32_t chunk_size = k * 3; in gemm_q8_0_2_1()
14427 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_2_1()
14428 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_2_1()
14432 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_2_1()
14433 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_2_1()
14436 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_2_1()
14438 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_2_1()
14439 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_2_1()
14440 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_2_1()
14443 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_2_1()
14444 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_2_1()
14474 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_0_2_2()
14475 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_0_2_2()
14476 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_0_2_2()
14477 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_0_2_2()
14478 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_0_2_2()
14479 const std::int32_t row_chunks = m / 3; in gemm_q8_0_2_2()
14480 const std::int32_t col_chunks = n / 3; in gemm_q8_0_2_2()
14481 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_2_2()
14482 const std::int32_t chunk_size = k * 3; in gemm_q8_0_2_2()
14483 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_2_2()
14484 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_2_2()
14488 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_2_2()
14489 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_2_2()
14492 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_2_2()
14494 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_2_2()
14495 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_2_2()
14496 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_2_2()
14499 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_2_2()
14500 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_2_2()
14530 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_0_2_3()
14531 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_0_2_3()
14532 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_0_2_3()
14533 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_0_2_3()
14534 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_0_2_3()
14535 const std::int32_t row_chunks = m / 3; in gemm_q8_0_2_3()
14536 const std::int32_t col_chunks = n / 3; in gemm_q8_0_2_3()
14537 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_2_3()
14538 const std::int32_t chunk_size = k * 3; in gemm_q8_0_2_3()
14539 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_2_3()
14540 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_2_3()
14544 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_2_3()
14545 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_2_3()
14548 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_2_3()
14550 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_2_3()
14551 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_2_3()
14552 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_2_3()
14555 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_2_3()
14556 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_2_3()
14586 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_0_2_4()
14587 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_0_2_4()
14588 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_0_2_4()
14589 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_0_2_4()
14590 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_0_2_4()
14591 const std::int32_t row_chunks = m / 3; in gemm_q8_0_2_4()
14592 const std::int32_t col_chunks = n / 3; in gemm_q8_0_2_4()
14593 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_2_4()
14594 const std::int32_t chunk_size = k * 3; in gemm_q8_0_2_4()
14595 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_2_4()
14596 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_2_4()
14600 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_2_4()
14601 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_2_4()
14604 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_2_4()
14606 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_2_4()
14607 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_2_4()
14608 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_2_4()
14611 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_2_4()
14612 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_2_4()
14642 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_0_2_5()
14643 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_0_2_5()
14644 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_0_2_5()
14645 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_0_2_5()
14646 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_0_2_5()
14647 const std::int32_t row_chunks = m / 3; in gemm_q8_0_2_5()
14648 const std::int32_t col_chunks = n / 3; in gemm_q8_0_2_5()
14649 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_2_5()
14650 const std::int32_t chunk_size = k * 3; in gemm_q8_0_2_5()
14651 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_2_5()
14652 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_2_5()
14656 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_2_5()
14657 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_2_5()
14660 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_2_5()
14662 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_2_5()
14663 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_2_5()
14664 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_2_5()
14667 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_2_5()
14668 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_2_5()
14698 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_0_2_6()
14699 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_0_2_6()
14700 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_0_2_6()
14701 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_0_2_6()
14702 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_0_2_6()
14703 const std::int32_t row_chunks = m / 3; in gemm_q8_0_2_6()
14704 const std::int32_t col_chunks = n / 3; in gemm_q8_0_2_6()
14705 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_2_6()
14706 const std::int32_t chunk_size = k * 3; in gemm_q8_0_2_6()
14707 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_2_6()
14708 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_2_6()
14712 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_2_6()
14713 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_2_6()
14716 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_2_6()
14718 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_2_6()
14719 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_2_6()
14720 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_2_6()
14723 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_2_6()
14724 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_2_6()
14754 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_0_2_7()
14755 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_0_2_7()
14756 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_0_2_7()
14757 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_0_2_7()
14758 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_0_2_7()
14759 const std::int32_t row_chunks = m / 3; in gemm_q8_0_2_7()
14760 const std::int32_t col_chunks = n / 3; in gemm_q8_0_2_7()
14761 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_0_2_7()
14762 const std::int32_t chunk_size = k * 3; in gemm_q8_0_2_7()
14763 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_0_2_7()
14764 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_0_2_7()
14768 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_0_2_7()
14769 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_0_2_7()
14772 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_0_2_7()
14774 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_0_2_7()
14775 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_0_2_7()
14776 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_0_2_7()
14779 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_0_2_7()
14780 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_0_2_7()
14810 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_1_0_0()
14811 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_1_0_0()
14812 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_1_0_0()
14813 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_1_0_0()
14814 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_1_0_0()
14815 const std::int32_t row_chunks = m / 3; in gemm_q8_1_0_0()
14816 const std::int32_t col_chunks = n / 3; in gemm_q8_1_0_0()
14817 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_0_0()
14818 const std::int32_t chunk_size = k * 3; in gemm_q8_1_0_0()
14819 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_0_0()
14820 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_0_0()
14824 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_0_0()
14825 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_0_0()
14826 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_0_0()
14827 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_0_0()
14830 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_0_0()
14832 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_0_0()
14833 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_0_0()
14834 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_0_0()
14837 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_0_0()
14838 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_0_0()
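Note: starting with gemm_q8_1_0_0, the functions compute both zipped_lhs_3_offsets and zipped_lhs_1_offsets, which appear to serve the full 3-row chunks and a leftover 1-row chunk respectively. The layout implied by the matched reinterpret_casts is that a packed LHS chunk of `rows` rows of padded_k bytes is followed immediately by its per-row int32 offset slots; a small helper mirroring those expressions (the function name chunk_offsets is ours):

    #include <cstdint>

    // Pointer to the int32 offset slots stored right after `rows` packed rows
    // of padded_k bytes each, exactly as in the matched casts
    // reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * rows).
    inline std::int32_t* chunk_offsets(std::uint8_t* zipped_lhs,
                                       std::int32_t padded_k, std::int32_t rows) {
      return reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * rows);
    }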
14878 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_1_0_1()
14879 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_1_0_1()
14880 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_1_0_1()
14881 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_1_0_1()
14882 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_1_0_1()
14883 const std::int32_t row_chunks = m / 3; in gemm_q8_1_0_1()
14884 const std::int32_t col_chunks = n / 3; in gemm_q8_1_0_1()
14885 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_0_1()
14886 const std::int32_t chunk_size = k * 3; in gemm_q8_1_0_1()
14887 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_0_1()
14888 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_0_1()
14892 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_0_1()
14893 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_0_1()
14894 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_0_1()
14895 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_0_1()
14898 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_0_1()
14900 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_0_1()
14901 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_0_1()
14902 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_0_1()
14905 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_0_1()
14906 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_0_1()
14946 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_1_0_2()
14947 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_1_0_2()
14948 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_1_0_2()
14949 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_1_0_2()
14950 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_1_0_2()
14951 const std::int32_t row_chunks = m / 3; in gemm_q8_1_0_2()
14952 const std::int32_t col_chunks = n / 3; in gemm_q8_1_0_2()
14953 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_0_2()
14954 const std::int32_t chunk_size = k * 3; in gemm_q8_1_0_2()
14955 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_0_2()
14956 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_0_2()
14960 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_0_2()
14961 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_0_2()
14962 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_0_2()
14963 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_0_2()
14966 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_0_2()
14968 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_0_2()
14969 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_0_2()
14970 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_0_2()
14973 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_0_2()
14974 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_0_2()
15014 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_1_0_3()
15015 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_1_0_3()
15016 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_1_0_3()
15017 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_1_0_3()
15018 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_1_0_3()
15019 const std::int32_t row_chunks = m / 3; in gemm_q8_1_0_3()
15020 const std::int32_t col_chunks = n / 3; in gemm_q8_1_0_3()
15021 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_0_3()
15022 const std::int32_t chunk_size = k * 3; in gemm_q8_1_0_3()
15023 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_0_3()
15024 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_0_3()
15028 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_0_3()
15029 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_0_3()
15030 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_0_3()
15031 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_0_3()
15034 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_0_3()
15036 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_0_3()
15037 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_0_3()
15038 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_0_3()
15041 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_0_3()
15042 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_0_3()
15082 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_1_0_4()
15083 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_1_0_4()
15084 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_1_0_4()
15085 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_1_0_4()
15086 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_1_0_4()
15087 const std::int32_t row_chunks = m / 3; in gemm_q8_1_0_4()
15088 const std::int32_t col_chunks = n / 3; in gemm_q8_1_0_4()
15089 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_0_4()
15090 const std::int32_t chunk_size = k * 3; in gemm_q8_1_0_4()
15091 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_0_4()
15092 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_0_4()
15096 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_0_4()
15097 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_0_4()
15098 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_0_4()
15099 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_0_4()
15102 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_0_4()
15104 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_0_4()
15105 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_0_4()
15106 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_0_4()
15109 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_0_4()
15110 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_0_4()
15150 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_1_0_5()
15151 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_1_0_5()
15152 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_1_0_5()
15153 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_1_0_5()
15154 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_1_0_5()
15155 const std::int32_t row_chunks = m / 3; in gemm_q8_1_0_5()
15156 const std::int32_t col_chunks = n / 3; in gemm_q8_1_0_5()
15157 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_0_5()
15158 const std::int32_t chunk_size = k * 3; in gemm_q8_1_0_5()
15159 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_0_5()
15160 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_0_5()
15164 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_0_5()
15165 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_0_5()
15166 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_0_5()
15167 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_0_5()
15170 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_0_5()
15172 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_0_5()
15173 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_0_5()
15174 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_0_5()
15177 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_0_5()
15178 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_0_5()
15218 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_1_0_6()
15219 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_1_0_6()
15220 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_1_0_6()
15221 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_1_0_6()
15222 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_1_0_6()
15223 const std::int32_t row_chunks = m / 3; in gemm_q8_1_0_6()
15224 const std::int32_t col_chunks = n / 3; in gemm_q8_1_0_6()
15225 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_0_6()
15226 const std::int32_t chunk_size = k * 3; in gemm_q8_1_0_6()
15227 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_0_6()
15228 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_0_6()
15232 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_0_6()
15233 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_0_6()
15234 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_0_6()
15235 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_0_6()
15238 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_0_6()
15240 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_0_6()
15241 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_0_6()
15242 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_0_6()
15245 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_0_6()
15246 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_0_6()
15286 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_1_0_7()
15287 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_1_0_7()
15288 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_1_0_7()
15289 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_1_0_7()
15290 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_1_0_7()
15291 const std::int32_t row_chunks = m / 3; in gemm_q8_1_0_7()
15292 const std::int32_t col_chunks = n / 3; in gemm_q8_1_0_7()
15293 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_0_7()
15294 const std::int32_t chunk_size = k * 3; in gemm_q8_1_0_7()
15295 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_0_7()
15296 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_0_7()
15300 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_0_7()
15301 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_0_7()
15302 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_0_7()
15303 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_0_7()
15306 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_0_7()
15308 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_0_7()
15309 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_0_7()
15310 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_0_7()
15313 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_0_7()
15314 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_0_7()
15354 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_1_1_0()
15355 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_1_1_0()
15356 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_1_1_0()
15357 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_1_1_0()
15358 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_1_1_0()
15359 const std::int32_t row_chunks = m / 3; in gemm_q8_1_1_0()
15360 const std::int32_t col_chunks = n / 3; in gemm_q8_1_1_0()
15361 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_1_0()
15362 const std::int32_t chunk_size = k * 3; in gemm_q8_1_1_0()
15363 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_1_0()
15364 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_1_0()
15368 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_1_0()
15369 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_1_0()
15370 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_1_0()
15371 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_1_0()
15374 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_1_0()
15376 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_1_0()
15377 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_1_0()
15378 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_1_0()
15381 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_1_0()
15382 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_1_0()
15427 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_1_1_1()
15428 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_1_1_1()
15429 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_1_1_1()
15430 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_1_1_1()
15431 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_1_1_1()
15432 const std::int32_t row_chunks = m / 3; in gemm_q8_1_1_1()
15433 const std::int32_t col_chunks = n / 3; in gemm_q8_1_1_1()
15434 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_1_1()
15435 const std::int32_t chunk_size = k * 3; in gemm_q8_1_1_1()
15436 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_1_1()
15437 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_1_1()
15441 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_1_1()
15442 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_1_1()
15443 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_1_1()
15444 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_1_1()
15447 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_1_1()
15449 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_1_1()
15450 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_1_1()
15451 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_1_1()
15454 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_1_1()
15455 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_1_1()
15500 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_1_1_2()
15501 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_1_1_2()
15502 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_1_1_2()
15503 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_1_1_2()
15504 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_1_1_2()
15505 const std::int32_t row_chunks = m / 3; in gemm_q8_1_1_2()
15506 const std::int32_t col_chunks = n / 3; in gemm_q8_1_1_2()
15507 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_1_2()
15508 const std::int32_t chunk_size = k * 3; in gemm_q8_1_1_2()
15509 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_1_2()
15510 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_1_2()
15514 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_1_2()
15515 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_1_2()
15516 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_1_2()
15517 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_1_2()
15520 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_1_2()
15522 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_1_2()
15523 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_1_2()
15524 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_1_2()
15527 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_1_2()
15528 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_1_2()
15573 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_1_1_3()
15574 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_1_1_3()
15575 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_1_1_3()
15576 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_1_1_3()
15577 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_1_1_3()
15578 const std::int32_t row_chunks = m / 3; in gemm_q8_1_1_3()
15579 const std::int32_t col_chunks = n / 3; in gemm_q8_1_1_3()
15580 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_1_3()
15581 const std::int32_t chunk_size = k * 3; in gemm_q8_1_1_3()
15582 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_1_3()
15583 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_1_3()
15587 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_1_3()
15588 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_1_3()
15589 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_1_3()
15590 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_1_3()
15593 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_1_3()
15595 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_1_3()
15596 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_1_3()
15597 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_1_3()
15600 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_1_3()
15601 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_1_3()
15646 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_1_1_4()
15647 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_1_1_4()
15648 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_1_1_4()
15649 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_1_1_4()
15650 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_1_1_4()
15651 const std::int32_t row_chunks = m / 3; in gemm_q8_1_1_4()
15652 const std::int32_t col_chunks = n / 3; in gemm_q8_1_1_4()
15653 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_1_4()
15654 const std::int32_t chunk_size = k * 3; in gemm_q8_1_1_4()
15655 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_1_4()
15656 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_1_4()
15660 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_1_4()
15661 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_1_4()
15662 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_1_4()
15663 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_1_4()
15666 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_1_4()
15668 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_1_4()
15669 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_1_4()
15670 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_1_4()
15673 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_1_4()
15674 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_1_4()
15719 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_1_1_5()
15720 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_1_1_5()
15721 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_1_1_5()
15722 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_1_1_5()
15723 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_1_1_5()
15724 const std::int32_t row_chunks = m / 3; in gemm_q8_1_1_5()
15725 const std::int32_t col_chunks = n / 3; in gemm_q8_1_1_5()
15726 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_1_5()
15727 const std::int32_t chunk_size = k * 3; in gemm_q8_1_1_5()
15728 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_1_5()
15729 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_1_5()
15733 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_1_5()
15734 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_1_5()
15735 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_1_5()
15736 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_1_5()
15739 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_1_5()
15741 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_1_5()
15742 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_1_5()
15743 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_1_5()
15746 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_1_5()
15747 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_1_5()
15792 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_1_1_6()
15793 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_1_1_6()
15794 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_1_1_6()
15795 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_1_1_6()
15796 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_1_1_6()
15797 const std::int32_t row_chunks = m / 3; in gemm_q8_1_1_6()
15798 const std::int32_t col_chunks = n / 3; in gemm_q8_1_1_6()
15799 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_1_6()
15800 const std::int32_t chunk_size = k * 3; in gemm_q8_1_1_6()
15801 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_1_6()
15802 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_1_6()
15806 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_1_6()
15807 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_1_6()
15808 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_1_6()
15809 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_1_6()
15812 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_1_6()
15814 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_1_6()
15815 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_1_6()
15816 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_1_6()
15819 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_1_6()
15820 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_1_6()
15865 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_1_1_7()
15866 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_1_1_7()
15867 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_1_1_7()
15868 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_1_1_7()
15869 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_1_1_7()
15870 const std::int32_t row_chunks = m / 3; in gemm_q8_1_1_7()
15871 const std::int32_t col_chunks = n / 3; in gemm_q8_1_1_7()
15872 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_1_7()
15873 const std::int32_t chunk_size = k * 3; in gemm_q8_1_1_7()
15874 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_1_7()
15875 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_1_7()
15879 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_1_7()
15880 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_1_7()
15881 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_1_7()
15882 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_1_7()
15885 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_1_7()
15887 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_1_7()
15888 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_1_7()
15889 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_1_7()
15892 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_1_7()
15893 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_1_7()
15938 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_1_2_0()
15939 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_1_2_0()
15940 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_1_2_0()
15941 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_1_2_0()
15942 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_1_2_0()
15943 const std::int32_t row_chunks = m / 3; in gemm_q8_1_2_0()
15944 const std::int32_t col_chunks = n / 3; in gemm_q8_1_2_0()
15945 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_2_0()
15946 const std::int32_t chunk_size = k * 3; in gemm_q8_1_2_0()
15947 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_2_0()
15948 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_2_0()
15952 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_2_0()
15953 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_2_0()
15954 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_2_0()
15955 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_2_0()
15958 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_2_0()
15960 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_2_0()
15961 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_2_0()
15962 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_2_0()
15965 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_2_0()
15966 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_2_0()
16011 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_1_2_1()
16012 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_1_2_1()
16013 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_1_2_1()
16014 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_1_2_1()
16015 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_1_2_1()
16016 const std::int32_t row_chunks = m / 3; in gemm_q8_1_2_1()
16017 const std::int32_t col_chunks = n / 3; in gemm_q8_1_2_1()
16018 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_2_1()
16019 const std::int32_t chunk_size = k * 3; in gemm_q8_1_2_1()
16020 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_2_1()
16021 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_2_1()
16025 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_2_1()
16026 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_2_1()
16027 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_2_1()
16028 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_2_1()
16031 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_2_1()
16033 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_2_1()
16034 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_2_1()
16035 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_2_1()
16038 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_2_1()
16039 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_2_1()
16084 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_1_2_2()
16085 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_1_2_2()
16086 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_1_2_2()
16087 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_1_2_2()
16088 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_1_2_2()
16089 const std::int32_t row_chunks = m / 3; in gemm_q8_1_2_2()
16090 const std::int32_t col_chunks = n / 3; in gemm_q8_1_2_2()
16091 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_2_2()
16092 const std::int32_t chunk_size = k * 3; in gemm_q8_1_2_2()
16093 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_2_2()
16094 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_2_2()
16098 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_2_2()
16099 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_2_2()
16100 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_2_2()
16101 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_2_2()
16104 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_2_2()
16106 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_2_2()
16107 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_2_2()
16108 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_2_2()
16111 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_2_2()
16112 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_2_2()
16157 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_1_2_3()
16158 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_1_2_3()
16159 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_1_2_3()
16160 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_1_2_3()
16161 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_1_2_3()
16162 const std::int32_t row_chunks = m / 3; in gemm_q8_1_2_3()
16163 const std::int32_t col_chunks = n / 3; in gemm_q8_1_2_3()
16164 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_2_3()
16165 const std::int32_t chunk_size = k * 3; in gemm_q8_1_2_3()
16166 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_2_3()
16167 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_2_3()
16171 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_2_3()
16172 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_2_3()
16173 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_2_3()
16174 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_2_3()
16177 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_2_3()
16179 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_2_3()
16180 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_2_3()
16181 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_2_3()
16184 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_2_3()
16185 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_2_3()
16230 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_1_2_4()
16231 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_1_2_4()
16232 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_1_2_4()
16233 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_1_2_4()
16234 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_1_2_4()
16235 const std::int32_t row_chunks = m / 3; in gemm_q8_1_2_4()
16236 const std::int32_t col_chunks = n / 3; in gemm_q8_1_2_4()
16237 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_2_4()
16238 const std::int32_t chunk_size = k * 3; in gemm_q8_1_2_4()
16239 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_2_4()
16240 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_2_4()
16244 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_2_4()
16245 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_2_4()
16246 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_2_4()
16247 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_2_4()
16250 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_2_4()
16252 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_2_4()
16253 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_2_4()
16254 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_2_4()
16257 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_2_4()
16258 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_2_4()
16303 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_1_2_5()
16304 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_1_2_5()
16305 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_1_2_5()
16306 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_1_2_5()
16307 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_1_2_5()
16308 const std::int32_t row_chunks = m / 3; in gemm_q8_1_2_5()
16309 const std::int32_t col_chunks = n / 3; in gemm_q8_1_2_5()
16310 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_2_5()
16311 const std::int32_t chunk_size = k * 3; in gemm_q8_1_2_5()
16312 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_2_5()
16313 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_2_5()
16317 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_2_5()
16318 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_2_5()
16319 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_2_5()
16320 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_2_5()
16323 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_2_5()
16325 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_2_5()
16326 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_2_5()
16327 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_2_5()
16330 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_2_5()
16331 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_2_5()
16376 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_1_2_6()
16377 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_1_2_6()
16378 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_1_2_6()
16379 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_1_2_6()
16380 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_1_2_6()
16381 const std::int32_t row_chunks = m / 3; in gemm_q8_1_2_6()
16382 const std::int32_t col_chunks = n / 3; in gemm_q8_1_2_6()
16383 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_2_6()
16384 const std::int32_t chunk_size = k * 3; in gemm_q8_1_2_6()
16385 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_2_6()
16386 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_2_6()
16390 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_2_6()
16391 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_2_6()
16392 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_2_6()
16393 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_2_6()
16396 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_2_6()
16398 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_2_6()
16399 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_2_6()
16400 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_2_6()
16403 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_2_6()
16404 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_2_6()
16449 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_1_2_7()
16450 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_1_2_7()
16451 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_1_2_7()
16452 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_1_2_7()
16453 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_1_2_7()
16454 const std::int32_t row_chunks = m / 3; in gemm_q8_1_2_7()
16455 const std::int32_t col_chunks = n / 3; in gemm_q8_1_2_7()
16456 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_1_2_7()
16457 const std::int32_t chunk_size = k * 3; in gemm_q8_1_2_7()
16458 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_1_2_7()
16459 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_1_2_7()
16463 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_1_2_7()
16464 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_1_2_7()
16465 std::int32_t* zipped_lhs_1_offsets = in gemm_q8_1_2_7()
16466 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_q8_1_2_7()
16469 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_1_2_7()
16471 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_1_2_7()
16472 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_1_2_7()
16473 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_1_2_7()
16476 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_1_2_7()
16477 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_1_2_7()
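Starting with gemm_q8_2_0_0 below, the only visible change in this prologue is that the LHS offset words move from zipped_lhs + padded_k * 1 to zipped_lhs + padded_k * 2, which suggests the first numeric suffix in the kernel name counts leftover LHS rows in the final partial chunk. A hedged sketch of that placement under that assumption; the helper name is illustrative only.

    #include <cstdint>

    // Where the per-row offset words appear to live inside the zipped LHS buffer,
    // assuming leftover_rows matches the first numeric suffix of the kernel name
    // (1 for gemm_q8_1_*_*, 2 for gemm_q8_2_*_*). Not part of the original API.
    inline std::int32_t* LeftoverRowOffsets(std::uint8_t* zipped_lhs,
                                            std::int32_t padded_k,
                                            std::int32_t leftover_rows) {
      return reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * leftover_rows);
    }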
16522 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_2_0_0()
16523 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_2_0_0()
16524 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_2_0_0()
16525 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_2_0_0()
16526 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_2_0_0()
16527 const std::int32_t row_chunks = m / 3; in gemm_q8_2_0_0()
16528 const std::int32_t col_chunks = n / 3; in gemm_q8_2_0_0()
16529 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_0_0()
16530 const std::int32_t chunk_size = k * 3; in gemm_q8_2_0_0()
16531 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_0_0()
16532 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_0_0()
16536 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_0_0()
16537 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_0_0()
16538 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_0_0()
16539 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_0_0()
16542 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_0_0()
16544 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_0_0()
16545 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_0_0()
16546 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_0_0()
16549 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_0_0()
16550 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_0_0()
16590 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_2_0_1()
16591 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_2_0_1()
16592 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_2_0_1()
16593 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_2_0_1()
16594 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_2_0_1()
16595 const std::int32_t row_chunks = m / 3; in gemm_q8_2_0_1()
16596 const std::int32_t col_chunks = n / 3; in gemm_q8_2_0_1()
16597 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_0_1()
16598 const std::int32_t chunk_size = k * 3; in gemm_q8_2_0_1()
16599 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_0_1()
16600 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_0_1()
16604 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_0_1()
16605 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_0_1()
16606 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_0_1()
16607 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_0_1()
16610 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_0_1()
16612 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_0_1()
16613 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_0_1()
16614 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_0_1()
16617 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_0_1()
16618 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_0_1()
16658 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_2_0_2()
16659 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_2_0_2()
16660 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_2_0_2()
16661 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_2_0_2()
16662 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_2_0_2()
16663 const std::int32_t row_chunks = m / 3; in gemm_q8_2_0_2()
16664 const std::int32_t col_chunks = n / 3; in gemm_q8_2_0_2()
16665 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_0_2()
16666 const std::int32_t chunk_size = k * 3; in gemm_q8_2_0_2()
16667 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_0_2()
16668 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_0_2()
16672 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_0_2()
16673 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_0_2()
16674 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_0_2()
16675 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_0_2()
16678 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_0_2()
16680 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_0_2()
16681 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_0_2()
16682 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_0_2()
16685 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_0_2()
16686 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_0_2()
16726 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_2_0_3()
16727 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_2_0_3()
16728 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_2_0_3()
16729 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_2_0_3()
16730 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_2_0_3()
16731 const std::int32_t row_chunks = m / 3; in gemm_q8_2_0_3()
16732 const std::int32_t col_chunks = n / 3; in gemm_q8_2_0_3()
16733 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_0_3()
16734 const std::int32_t chunk_size = k * 3; in gemm_q8_2_0_3()
16735 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_0_3()
16736 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_0_3()
16740 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_0_3()
16741 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_0_3()
16742 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_0_3()
16743 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_0_3()
16746 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_0_3()
16748 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_0_3()
16749 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_0_3()
16750 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_0_3()
16753 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_0_3()
16754 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_0_3()
16794 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_2_0_4()
16795 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_2_0_4()
16796 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_2_0_4()
16797 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_2_0_4()
16798 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_2_0_4()
16799 const std::int32_t row_chunks = m / 3; in gemm_q8_2_0_4()
16800 const std::int32_t col_chunks = n / 3; in gemm_q8_2_0_4()
16801 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_0_4()
16802 const std::int32_t chunk_size = k * 3; in gemm_q8_2_0_4()
16803 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_0_4()
16804 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_0_4()
16808 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_0_4()
16809 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_0_4()
16810 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_0_4()
16811 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_0_4()
16814 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_0_4()
16816 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_0_4()
16817 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_0_4()
16818 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_0_4()
16821 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_0_4()
16822 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_0_4()
16862 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_2_0_5()
16863 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_2_0_5()
16864 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_2_0_5()
16865 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_2_0_5()
16866 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_2_0_5()
16867 const std::int32_t row_chunks = m / 3; in gemm_q8_2_0_5()
16868 const std::int32_t col_chunks = n / 3; in gemm_q8_2_0_5()
16869 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_0_5()
16870 const std::int32_t chunk_size = k * 3; in gemm_q8_2_0_5()
16871 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_0_5()
16872 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_0_5()
16876 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_0_5()
16877 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_0_5()
16878 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_0_5()
16879 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_0_5()
16882 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_0_5()
16884 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_0_5()
16885 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_0_5()
16886 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_0_5()
16889 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_0_5()
16890 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_0_5()
16930 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_2_0_6()
16931 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_2_0_6()
16932 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_2_0_6()
16933 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_2_0_6()
16934 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_2_0_6()
16935 const std::int32_t row_chunks = m / 3; in gemm_q8_2_0_6()
16936 const std::int32_t col_chunks = n / 3; in gemm_q8_2_0_6()
16937 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_0_6()
16938 const std::int32_t chunk_size = k * 3; in gemm_q8_2_0_6()
16939 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_0_6()
16940 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_0_6()
16944 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_0_6()
16945 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_0_6()
16946 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_0_6()
16947 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_0_6()
16950 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_0_6()
16952 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_0_6()
16953 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_0_6()
16954 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_0_6()
16957 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_0_6()
16958 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_0_6()
16998 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_2_0_7()
16999 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_2_0_7()
17000 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_2_0_7()
17001 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_2_0_7()
17002 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_2_0_7()
17003 const std::int32_t row_chunks = m / 3; in gemm_q8_2_0_7()
17004 const std::int32_t col_chunks = n / 3; in gemm_q8_2_0_7()
17005 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_0_7()
17006 const std::int32_t chunk_size = k * 3; in gemm_q8_2_0_7()
17007 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_0_7()
17008 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_0_7()
17012 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_0_7()
17013 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_0_7()
17014 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_0_7()
17015 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_0_7()
17018 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_0_7()
17020 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_0_7()
17021 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_0_7()
17022 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_0_7()
17025 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_0_7()
17026 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_0_7()
17066 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_2_1_0()
17067 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_2_1_0()
17068 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_2_1_0()
17069 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_2_1_0()
17070 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_2_1_0()
17071 const std::int32_t row_chunks = m / 3; in gemm_q8_2_1_0()
17072 const std::int32_t col_chunks = n / 3; in gemm_q8_2_1_0()
17073 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_1_0()
17074 const std::int32_t chunk_size = k * 3; in gemm_q8_2_1_0()
17075 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_1_0()
17076 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_1_0()
17080 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_1_0()
17081 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_1_0()
17082 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_1_0()
17083 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_1_0()
17086 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_1_0()
17088 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_1_0()
17089 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_1_0()
17090 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_1_0()
17093 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_1_0()
17094 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_1_0()
17139 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_2_1_1()
17140 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_2_1_1()
17141 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_2_1_1()
17142 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_2_1_1()
17143 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_2_1_1()
17144 const std::int32_t row_chunks = m / 3; in gemm_q8_2_1_1()
17145 const std::int32_t col_chunks = n / 3; in gemm_q8_2_1_1()
17146 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_1_1()
17147 const std::int32_t chunk_size = k * 3; in gemm_q8_2_1_1()
17148 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_1_1()
17149 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_1_1()
17153 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_1_1()
17154 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_1_1()
17155 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_1_1()
17156 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_1_1()
17159 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_1_1()
17161 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_1_1()
17162 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_1_1()
17163 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_1_1()
17166 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_1_1()
17167 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_1_1()
17212 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_2_1_2()
17213 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_2_1_2()
17214 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_2_1_2()
17215 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_2_1_2()
17216 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_2_1_2()
17217 const std::int32_t row_chunks = m / 3; in gemm_q8_2_1_2()
17218 const std::int32_t col_chunks = n / 3; in gemm_q8_2_1_2()
17219 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_1_2()
17220 const std::int32_t chunk_size = k * 3; in gemm_q8_2_1_2()
17221 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_1_2()
17222 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_1_2()
17226 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_1_2()
17227 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_1_2()
17228 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_1_2()
17229 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_1_2()
17232 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_1_2()
17234 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_1_2()
17235 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_1_2()
17236 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_1_2()
17239 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_1_2()
17240 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_1_2()
17285 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_2_1_3()
17286 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_2_1_3()
17287 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_2_1_3()
17288 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_2_1_3()
17289 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_2_1_3()
17290 const std::int32_t row_chunks = m / 3; in gemm_q8_2_1_3()
17291 const std::int32_t col_chunks = n / 3; in gemm_q8_2_1_3()
17292 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_1_3()
17293 const std::int32_t chunk_size = k * 3; in gemm_q8_2_1_3()
17294 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_1_3()
17295 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_1_3()
17299 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_1_3()
17300 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_1_3()
17301 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_1_3()
17302 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_1_3()
17305 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_1_3()
17307 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_1_3()
17308 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_1_3()
17309 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_1_3()
17312 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_1_3()
17313 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_1_3()
17358 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_2_1_4()
17359 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_2_1_4()
17360 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_2_1_4()
17361 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_2_1_4()
17362 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_2_1_4()
17363 const std::int32_t row_chunks = m / 3; in gemm_q8_2_1_4()
17364 const std::int32_t col_chunks = n / 3; in gemm_q8_2_1_4()
17365 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_1_4()
17366 const std::int32_t chunk_size = k * 3; in gemm_q8_2_1_4()
17367 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_1_4()
17368 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_1_4()
17372 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_1_4()
17373 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_1_4()
17374 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_1_4()
17375 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_1_4()
17378 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_1_4()
17380 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_1_4()
17381 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_1_4()
17382 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_1_4()
17385 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_1_4()
17386 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_1_4()
17431 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_2_1_5()
17432 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_2_1_5()
17433 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_2_1_5()
17434 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_2_1_5()
17435 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_2_1_5()
17436 const std::int32_t row_chunks = m / 3; in gemm_q8_2_1_5()
17437 const std::int32_t col_chunks = n / 3; in gemm_q8_2_1_5()
17438 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_1_5()
17439 const std::int32_t chunk_size = k * 3; in gemm_q8_2_1_5()
17440 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_1_5()
17441 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_1_5()
17445 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_1_5()
17446 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_1_5()
17447 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_1_5()
17448 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_1_5()
17451 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_1_5()
17453 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_1_5()
17454 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_1_5()
17455 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_1_5()
17458 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_1_5()
17459 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_1_5()
17504 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_2_1_6()
17505 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_2_1_6()
17506 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_2_1_6()
17507 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_2_1_6()
17508 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_2_1_6()
17509 const std::int32_t row_chunks = m / 3; in gemm_q8_2_1_6()
17510 const std::int32_t col_chunks = n / 3; in gemm_q8_2_1_6()
17511 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_1_6()
17512 const std::int32_t chunk_size = k * 3; in gemm_q8_2_1_6()
17513 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_1_6()
17514 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_1_6()
17518 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_1_6()
17519 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_1_6()
17520 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_1_6()
17521 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_1_6()
17524 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_1_6()
17526 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_1_6()
17527 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_1_6()
17528 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_1_6()
17531 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_1_6()
17532 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_1_6()
17577 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_2_1_7()
17578 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_2_1_7()
17579 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_2_1_7()
17580 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_2_1_7()
17581 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_2_1_7()
17582 const std::int32_t row_chunks = m / 3; in gemm_q8_2_1_7()
17583 const std::int32_t col_chunks = n / 3; in gemm_q8_2_1_7()
17584 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_1_7()
17585 const std::int32_t chunk_size = k * 3; in gemm_q8_2_1_7()
17586 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_1_7()
17587 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_1_7()
17591 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_1_7()
17592 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_1_7()
17593 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_1_7()
17594 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_1_7()
17597 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_1_7()
17599 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_1_7()
17600 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_1_7()
17601 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_1_7()
17604 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_1_7()
17605 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_1_7()
17650 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_2_2_0()
17651 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_2_2_0()
17652 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_2_2_0()
17653 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_2_2_0()
17654 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_2_2_0()
17655 const std::int32_t row_chunks = m / 3; in gemm_q8_2_2_0()
17656 const std::int32_t col_chunks = n / 3; in gemm_q8_2_2_0()
17657 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_2_0()
17658 const std::int32_t chunk_size = k * 3; in gemm_q8_2_2_0()
17659 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_2_0()
17660 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_2_0()
17664 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_2_0()
17665 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_2_0()
17666 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_2_0()
17667 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_2_0()
17670 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_2_0()
17672 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_2_0()
17673 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_2_0()
17674 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_2_0()
17677 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_2_0()
17678 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_2_0()
17723 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_2_2_1()
17724 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_2_2_1()
17725 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_2_2_1()
17726 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_2_2_1()
17727 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_2_2_1()
17728 const std::int32_t row_chunks = m / 3; in gemm_q8_2_2_1()
17729 const std::int32_t col_chunks = n / 3; in gemm_q8_2_2_1()
17730 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_2_1()
17731 const std::int32_t chunk_size = k * 3; in gemm_q8_2_2_1()
17732 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_2_1()
17733 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_2_1()
17737 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_2_1()
17738 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_2_1()
17739 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_2_1()
17740 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_2_1()
17743 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_2_1()
17745 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_2_1()
17746 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_2_1()
17747 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_2_1()
17750 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_2_1()
17751 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_2_1()
17796 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_2_2_2()
17797 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_2_2_2()
17798 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_2_2_2()
17799 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_2_2_2()
17800 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_2_2_2()
17801 const std::int32_t row_chunks = m / 3; in gemm_q8_2_2_2()
17802 const std::int32_t col_chunks = n / 3; in gemm_q8_2_2_2()
17803 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_2_2()
17804 const std::int32_t chunk_size = k * 3; in gemm_q8_2_2_2()
17805 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_2_2()
17806 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_2_2()
17810 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_2_2()
17811 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_2_2()
17812 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_2_2()
17813 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_2_2()
17816 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_2_2()
17818 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_2_2()
17819 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_2_2()
17820 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_2_2()
17823 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_2_2()
17824 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_2_2()
17869 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_2_2_3()
17870 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_2_2_3()
17871 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_2_2_3()
17872 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_2_2_3()
17873 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_2_2_3()
17874 const std::int32_t row_chunks = m / 3; in gemm_q8_2_2_3()
17875 const std::int32_t col_chunks = n / 3; in gemm_q8_2_2_3()
17876 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_2_3()
17877 const std::int32_t chunk_size = k * 3; in gemm_q8_2_2_3()
17878 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_2_3()
17879 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_2_3()
17883 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_2_3()
17884 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_2_3()
17885 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_2_3()
17886 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_2_3()
17889 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_2_3()
17891 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_2_3()
17892 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_2_3()
17893 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_2_3()
17896 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_2_3()
17897 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_2_3()
17942 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_2_2_4()
17943 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_2_2_4()
17944 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_2_2_4()
17945 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_2_2_4()
17946 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_2_2_4()
17947 const std::int32_t row_chunks = m / 3; in gemm_q8_2_2_4()
17948 const std::int32_t col_chunks = n / 3; in gemm_q8_2_2_4()
17949 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_2_4()
17950 const std::int32_t chunk_size = k * 3; in gemm_q8_2_2_4()
17951 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_2_4()
17952 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_2_4()
17956 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_2_4()
17957 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_2_4()
17958 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_2_4()
17959 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_2_4()
17962 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_2_4()
17964 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_2_4()
17965 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_2_4()
17966 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_2_4()
17969 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_2_4()
17970 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_2_4()
18015 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_2_2_5()
18016 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_2_2_5()
18017 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_2_2_5()
18018 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_2_2_5()
18019 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_2_2_5()
18020 const std::int32_t row_chunks = m / 3; in gemm_q8_2_2_5()
18021 const std::int32_t col_chunks = n / 3; in gemm_q8_2_2_5()
18022 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_2_5()
18023 const std::int32_t chunk_size = k * 3; in gemm_q8_2_2_5()
18024 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_2_5()
18025 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_2_5()
18029 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_2_5()
18030 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_2_5()
18031 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_2_5()
18032 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_2_5()
18035 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_2_5()
18037 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_2_5()
18038 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_2_5()
18039 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_2_5()
18042 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_2_5()
18043 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_2_5()
18088 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_2_2_6()
18089 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_2_2_6()
18090 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_2_2_6()
18091 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_2_2_6()
18092 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_2_2_6()
18093 const std::int32_t row_chunks = m / 3; in gemm_q8_2_2_6()
18094 const std::int32_t col_chunks = n / 3; in gemm_q8_2_2_6()
18095 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_2_6()
18096 const std::int32_t chunk_size = k * 3; in gemm_q8_2_2_6()
18097 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_2_6()
18098 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_2_6()
18102 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_2_6()
18103 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_2_6()
18104 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_2_6()
18105 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_2_6()
18108 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_2_6()
18110 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_2_6()
18111 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_2_6()
18112 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_2_6()
18115 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_2_6()
18116 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_2_6()
18161 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_2_2_7()
18162 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_2_2_7()
18163 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_2_2_7()
18164 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_2_2_7()
18165 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_2_2_7()
18166 const std::int32_t row_chunks = m / 3; in gemm_q8_2_2_7()
18167 const std::int32_t col_chunks = n / 3; in gemm_q8_2_2_7()
18168 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_q8_2_2_7()
18169 const std::int32_t chunk_size = k * 3; in gemm_q8_2_2_7()
18170 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_q8_2_2_7()
18171 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_q8_2_2_7()
18175 std::int32_t* zipped_lhs_3_offsets = in gemm_q8_2_2_7()
18176 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_q8_2_2_7()
18177 std::int32_t* zipped_lhs_2_offsets = in gemm_q8_2_2_7()
18178 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_q8_2_2_7()
18181 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_q8_2_2_7()
18183 const std::int32_t const_offset = lhs_offset * rhs_offset * k + result_offset; in gemm_q8_2_2_7()
18184 const std::int32_t rounding_offset = (1 << (shift - 1)); in gemm_q8_2_2_7()
18185 std::int32_t* temp_result = reinterpret_cast<std::int32_t*>( in gemm_q8_2_2_7()
18188 std::int32_t* mul_result_chunk = temp_result; in gemm_q8_2_2_7()
18189 const std::int32_t mul_result_chunk_stride_bytes = ((n * 4 + 7) / 8) * 8; in gemm_q8_2_2_7()
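The gemm_q8_* kernels end here. They carry const_offset, multiplicative_offset, rounding_offset, and shift through the prologue, but this listing does not show how the NEON code applies them per lane. A plausible scalar reconstruction, stated purely as an assumption based on the names and values of those constants, would be:

    #include <cstdint>

    // Assumed, not taken from the original file: one way the visible constants
    // could turn a 32-bit accumulator into a saturated uint8 result.
    inline std::uint8_t RequantizeGuess(std::int32_t acc, std::int32_t const_offset,
                                        std::int32_t multiplicative_offset,
                                        std::int32_t rounding_offset,
                                        std::int32_t shift) {
      std::int32_t value = acc + const_offset;
      value = (value * multiplicative_offset + rounding_offset) >> shift;
      if (value < 0) value = 0;       // clamp to the uint8 range
      if (value > 255) value = 255;
      return static_cast<std::uint8_t>(value);
    }

The gemm_i32_*_aligned kernels that follow keep only const_offset and skip this step, writing the 32-bit accumulators out directly.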
18234 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_0_0_0_aligned()
18235 std::int32_t n, std::int32_t k, in gemm_i32_0_0_0_aligned()
18236 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_0_0_0_aligned()
18237 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_0_0_0_aligned()
18238 const std::int32_t row_chunks = m / 3; in gemm_i32_0_0_0_aligned()
18239 const std::int32_t col_chunks = n / 3; in gemm_i32_0_0_0_aligned()
18240 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_0_0_aligned()
18241 const std::int32_t chunk_size = k * 3; in gemm_i32_0_0_0_aligned()
18242 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_0_0_aligned()
18243 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_0_0_aligned()
18247 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_0_0_aligned()
18248 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_0_0_aligned()
18251 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_0_0_aligned()
18253 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_0_0_aligned()
18254 std::int32_t* result_chunk = result; in gemm_i32_0_0_0_aligned()
18255 std::int32_t* mul_result_chunk = result; in gemm_i32_0_0_0_aligned()
18256 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_0_0_aligned()
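One further difference is visible in the prologues above: the gemm_q8_* kernels stage int32 rows in a temporary buffer whose byte stride is padded up to a multiple of 8, while gemm_i32_*_aligned writes straight into the caller's std::int32_t result with a byte stride of result_stride * 4. The comparison below is purely illustrative; the helper is not part of the original file.

    #include <cstdint>

    // Per-row output stride, in bytes, for the two kernel families in this listing.
    inline std::int32_t OutputStrideBytes(bool writes_uint8, std::int32_t n,
                                          std::int32_t result_stride) {
      if (writes_uint8) {
        return ((n * 4 + 7) / 8) * 8;  // gemm_q8_*: padded temp rows of int32
      }
      return result_stride * 4;        // gemm_i32_*_aligned: caller's int32 buffer
    }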
18281 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_0_0_1_aligned()
18282 std::int32_t n, std::int32_t k, in gemm_i32_0_0_1_aligned()
18283 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_0_0_1_aligned()
18284 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_0_0_1_aligned()
18285 const std::int32_t row_chunks = m / 3; in gemm_i32_0_0_1_aligned()
18286 const std::int32_t col_chunks = n / 3; in gemm_i32_0_0_1_aligned()
18287 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_0_1_aligned()
18288 const std::int32_t chunk_size = k * 3; in gemm_i32_0_0_1_aligned()
18289 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_0_1_aligned()
18290 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_0_1_aligned()
18294 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_0_1_aligned()
18295 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_0_1_aligned()
18298 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_0_1_aligned()
18300 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_0_1_aligned()
18301 std::int32_t* result_chunk = result; in gemm_i32_0_0_1_aligned()
18302 std::int32_t* mul_result_chunk = result; in gemm_i32_0_0_1_aligned()
18303 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_0_1_aligned()
18328 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_0_0_2_aligned()
18329 std::int32_t n, std::int32_t k, in gemm_i32_0_0_2_aligned()
18330 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_0_0_2_aligned()
18331 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_0_0_2_aligned()
18332 const std::int32_t row_chunks = m / 3; in gemm_i32_0_0_2_aligned()
18333 const std::int32_t col_chunks = n / 3; in gemm_i32_0_0_2_aligned()
18334 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_0_2_aligned()
18335 const std::int32_t chunk_size = k * 3; in gemm_i32_0_0_2_aligned()
18336 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_0_2_aligned()
18337 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_0_2_aligned()
18341 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_0_2_aligned()
18342 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_0_2_aligned()
18345 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_0_2_aligned()
18347 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_0_2_aligned()
18348 std::int32_t* result_chunk = result; in gemm_i32_0_0_2_aligned()
18349 std::int32_t* mul_result_chunk = result; in gemm_i32_0_0_2_aligned()
18350 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_0_2_aligned()
18375 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_0_0_3_aligned()
18376 std::int32_t n, std::int32_t k, in gemm_i32_0_0_3_aligned()
18377 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_0_0_3_aligned()
18378 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_0_0_3_aligned()
18379 const std::int32_t row_chunks = m / 3; in gemm_i32_0_0_3_aligned()
18380 const std::int32_t col_chunks = n / 3; in gemm_i32_0_0_3_aligned()
18381 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_0_3_aligned()
18382 const std::int32_t chunk_size = k * 3; in gemm_i32_0_0_3_aligned()
18383 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_0_3_aligned()
18384 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_0_3_aligned()
18388 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_0_3_aligned()
18389 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_0_3_aligned()
18392 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_0_3_aligned()
18394 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_0_3_aligned()
18395 std::int32_t* result_chunk = result; in gemm_i32_0_0_3_aligned()
18396 std::int32_t* mul_result_chunk = result; in gemm_i32_0_0_3_aligned()
18397 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_0_3_aligned()
18422 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_0_0_4_aligned()
18423 std::int32_t n, std::int32_t k, in gemm_i32_0_0_4_aligned()
18424 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_0_0_4_aligned()
18425 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_0_0_4_aligned()
18426 const std::int32_t row_chunks = m / 3; in gemm_i32_0_0_4_aligned()
18427 const std::int32_t col_chunks = n / 3; in gemm_i32_0_0_4_aligned()
18428 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_0_4_aligned()
18429 const std::int32_t chunk_size = k * 3; in gemm_i32_0_0_4_aligned()
18430 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_0_4_aligned()
18431 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_0_4_aligned()
18435 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_0_4_aligned()
18436 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_0_4_aligned()
18439 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_0_4_aligned()
18441 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_0_4_aligned()
18442 std::int32_t* result_chunk = result; in gemm_i32_0_0_4_aligned()
18443 std::int32_t* mul_result_chunk = result; in gemm_i32_0_0_4_aligned()
18444 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_0_4_aligned()
18469 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_0_0_5_aligned()
18470 std::int32_t n, std::int32_t k, in gemm_i32_0_0_5_aligned()
18471 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_0_0_5_aligned()
18472 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_0_0_5_aligned()
18473 const std::int32_t row_chunks = m / 3; in gemm_i32_0_0_5_aligned()
18474 const std::int32_t col_chunks = n / 3; in gemm_i32_0_0_5_aligned()
18475 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_0_5_aligned()
18476 const std::int32_t chunk_size = k * 3; in gemm_i32_0_0_5_aligned()
18477 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_0_5_aligned()
18478 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_0_5_aligned()
18482 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_0_5_aligned()
18483 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_0_5_aligned()
18486 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_0_5_aligned()
18488 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_0_5_aligned()
18489 std::int32_t* result_chunk = result; in gemm_i32_0_0_5_aligned()
18490 std::int32_t* mul_result_chunk = result; in gemm_i32_0_0_5_aligned()
18491 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_0_5_aligned()
18516 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_0_0_6_aligned()
18517 std::int32_t n, std::int32_t k, in gemm_i32_0_0_6_aligned()
18518 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_0_0_6_aligned()
18519 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_0_0_6_aligned()
18520 const std::int32_t row_chunks = m / 3; in gemm_i32_0_0_6_aligned()
18521 const std::int32_t col_chunks = n / 3; in gemm_i32_0_0_6_aligned()
18522 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_0_6_aligned()
18523 const std::int32_t chunk_size = k * 3; in gemm_i32_0_0_6_aligned()
18524 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_0_6_aligned()
18525 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_0_6_aligned()
18529 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_0_6_aligned()
18530 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_0_6_aligned()
18533 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_0_6_aligned()
18535 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_0_6_aligned()
18536 std::int32_t* result_chunk = result; in gemm_i32_0_0_6_aligned()
18537 std::int32_t* mul_result_chunk = result; in gemm_i32_0_0_6_aligned()
18538 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_0_6_aligned()
18563 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_0_0_7_aligned()
18564 std::int32_t n, std::int32_t k, in gemm_i32_0_0_7_aligned()
18565 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_0_0_7_aligned()
18566 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_0_0_7_aligned()
18567 const std::int32_t row_chunks = m / 3; in gemm_i32_0_0_7_aligned()
18568 const std::int32_t col_chunks = n / 3; in gemm_i32_0_0_7_aligned()
18569 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_0_7_aligned()
18570 const std::int32_t chunk_size = k * 3; in gemm_i32_0_0_7_aligned()
18571 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_0_7_aligned()
18572 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_0_7_aligned()
18576 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_0_7_aligned()
18577 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_0_7_aligned()
18580 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_0_7_aligned()
18582 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_0_7_aligned()
18583 std::int32_t* result_chunk = result; in gemm_i32_0_0_7_aligned()
18584 std::int32_t* mul_result_chunk = result; in gemm_i32_0_0_7_aligned()
18585 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_0_7_aligned()
18610 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_0_1_0_aligned()
18611 std::int32_t n, std::int32_t k, in gemm_i32_0_1_0_aligned()
18612 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_0_1_0_aligned()
18613 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_0_1_0_aligned()
18614 const std::int32_t row_chunks = m / 3; in gemm_i32_0_1_0_aligned()
18615 const std::int32_t col_chunks = n / 3; in gemm_i32_0_1_0_aligned()
18616 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_1_0_aligned()
18617 const std::int32_t chunk_size = k * 3; in gemm_i32_0_1_0_aligned()
18618 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_1_0_aligned()
18619 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_1_0_aligned()
18623 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_1_0_aligned()
18624 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_1_0_aligned()
18627 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_1_0_aligned()
18629 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_1_0_aligned()
18630 std::int32_t* result_chunk = result; in gemm_i32_0_1_0_aligned()
18631 std::int32_t* mul_result_chunk = result; in gemm_i32_0_1_0_aligned()
18632 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_1_0_aligned()
18661 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_0_1_1_aligned()
18662 std::int32_t n, std::int32_t k, in gemm_i32_0_1_1_aligned()
18663 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_0_1_1_aligned()
18664 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_0_1_1_aligned()
18665 const std::int32_t row_chunks = m / 3; in gemm_i32_0_1_1_aligned()
18666 const std::int32_t col_chunks = n / 3; in gemm_i32_0_1_1_aligned()
18667 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_1_1_aligned()
18668 const std::int32_t chunk_size = k * 3; in gemm_i32_0_1_1_aligned()
18669 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_1_1_aligned()
18670 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_1_1_aligned()
18674 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_1_1_aligned()
18675 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_1_1_aligned()
18678 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_1_1_aligned()
18680 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_1_1_aligned()
18681 std::int32_t* result_chunk = result; in gemm_i32_0_1_1_aligned()
18682 std::int32_t* mul_result_chunk = result; in gemm_i32_0_1_1_aligned()
18683 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_1_1_aligned()
18712 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_0_1_2_aligned()
18713 std::int32_t n, std::int32_t k, in gemm_i32_0_1_2_aligned()
18714 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_0_1_2_aligned()
18715 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_0_1_2_aligned()
18716 const std::int32_t row_chunks = m / 3; in gemm_i32_0_1_2_aligned()
18717 const std::int32_t col_chunks = n / 3; in gemm_i32_0_1_2_aligned()
18718 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_1_2_aligned()
18719 const std::int32_t chunk_size = k * 3; in gemm_i32_0_1_2_aligned()
18720 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_1_2_aligned()
18721 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_1_2_aligned()
18725 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_1_2_aligned()
18726 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_1_2_aligned()
18729 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_1_2_aligned()
18731 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_1_2_aligned()
18732 std::int32_t* result_chunk = result; in gemm_i32_0_1_2_aligned()
18733 std::int32_t* mul_result_chunk = result; in gemm_i32_0_1_2_aligned()
18734 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_1_2_aligned()
18763 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_0_1_3_aligned()
18764 std::int32_t n, std::int32_t k, in gemm_i32_0_1_3_aligned()
18765 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_0_1_3_aligned()
18766 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_0_1_3_aligned()
18767 const std::int32_t row_chunks = m / 3; in gemm_i32_0_1_3_aligned()
18768 const std::int32_t col_chunks = n / 3; in gemm_i32_0_1_3_aligned()
18769 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_1_3_aligned()
18770 const std::int32_t chunk_size = k * 3; in gemm_i32_0_1_3_aligned()
18771 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_1_3_aligned()
18772 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_1_3_aligned()
18776 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_1_3_aligned()
18777 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_1_3_aligned()
18780 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_1_3_aligned()
18782 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_1_3_aligned()
18783 std::int32_t* result_chunk = result; in gemm_i32_0_1_3_aligned()
18784 std::int32_t* mul_result_chunk = result; in gemm_i32_0_1_3_aligned()
18785 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_1_3_aligned()
18814 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_0_1_4_aligned()
18815 std::int32_t n, std::int32_t k, in gemm_i32_0_1_4_aligned()
18816 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_0_1_4_aligned()
18817 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_0_1_4_aligned()
18818 const std::int32_t row_chunks = m / 3; in gemm_i32_0_1_4_aligned()
18819 const std::int32_t col_chunks = n / 3; in gemm_i32_0_1_4_aligned()
18820 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_1_4_aligned()
18821 const std::int32_t chunk_size = k * 3; in gemm_i32_0_1_4_aligned()
18822 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_1_4_aligned()
18823 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_1_4_aligned()
18827 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_1_4_aligned()
18828 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_1_4_aligned()
18831 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_1_4_aligned()
18833 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_1_4_aligned()
18834 std::int32_t* result_chunk = result; in gemm_i32_0_1_4_aligned()
18835 std::int32_t* mul_result_chunk = result; in gemm_i32_0_1_4_aligned()
18836 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_1_4_aligned()
18865 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_0_1_5_aligned()
18866 std::int32_t n, std::int32_t k, in gemm_i32_0_1_5_aligned()
18867 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_0_1_5_aligned()
18868 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_0_1_5_aligned()
18869 const std::int32_t row_chunks = m / 3; in gemm_i32_0_1_5_aligned()
18870 const std::int32_t col_chunks = n / 3; in gemm_i32_0_1_5_aligned()
18871 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_1_5_aligned()
18872 const std::int32_t chunk_size = k * 3; in gemm_i32_0_1_5_aligned()
18873 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_1_5_aligned()
18874 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_1_5_aligned()
18878 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_1_5_aligned()
18879 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_1_5_aligned()
18882 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_1_5_aligned()
18884 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_1_5_aligned()
18885 std::int32_t* result_chunk = result; in gemm_i32_0_1_5_aligned()
18886 std::int32_t* mul_result_chunk = result; in gemm_i32_0_1_5_aligned()
18887 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_1_5_aligned()
18916 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_0_1_6_aligned()
18917 std::int32_t n, std::int32_t k, in gemm_i32_0_1_6_aligned()
18918 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_0_1_6_aligned()
18919 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_0_1_6_aligned()
18920 const std::int32_t row_chunks = m / 3; in gemm_i32_0_1_6_aligned()
18921 const std::int32_t col_chunks = n / 3; in gemm_i32_0_1_6_aligned()
18922 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_1_6_aligned()
18923 const std::int32_t chunk_size = k * 3; in gemm_i32_0_1_6_aligned()
18924 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_1_6_aligned()
18925 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_1_6_aligned()
18929 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_1_6_aligned()
18930 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_1_6_aligned()
18933 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_1_6_aligned()
18935 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_1_6_aligned()
18936 std::int32_t* result_chunk = result; in gemm_i32_0_1_6_aligned()
18937 std::int32_t* mul_result_chunk = result; in gemm_i32_0_1_6_aligned()
18938 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_1_6_aligned()
18967 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_0_1_7_aligned()
18968 std::int32_t n, std::int32_t k, in gemm_i32_0_1_7_aligned()
18969 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_0_1_7_aligned()
18970 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_0_1_7_aligned()
18971 const std::int32_t row_chunks = m / 3; in gemm_i32_0_1_7_aligned()
18972 const std::int32_t col_chunks = n / 3; in gemm_i32_0_1_7_aligned()
18973 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_1_7_aligned()
18974 const std::int32_t chunk_size = k * 3; in gemm_i32_0_1_7_aligned()
18975 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_1_7_aligned()
18976 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_1_7_aligned()
18980 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_1_7_aligned()
18981 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_1_7_aligned()
18984 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_1_7_aligned()
18986 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_1_7_aligned()
18987 std::int32_t* result_chunk = result; in gemm_i32_0_1_7_aligned()
18988 std::int32_t* mul_result_chunk = result; in gemm_i32_0_1_7_aligned()
18989 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_1_7_aligned()
19018 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_0_2_0_aligned()
19019 std::int32_t n, std::int32_t k, in gemm_i32_0_2_0_aligned()
19020 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_0_2_0_aligned()
19021 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_0_2_0_aligned()
19022 const std::int32_t row_chunks = m / 3; in gemm_i32_0_2_0_aligned()
19023 const std::int32_t col_chunks = n / 3; in gemm_i32_0_2_0_aligned()
19024 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_2_0_aligned()
19025 const std::int32_t chunk_size = k * 3; in gemm_i32_0_2_0_aligned()
19026 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_2_0_aligned()
19027 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_2_0_aligned()
19031 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_2_0_aligned()
19032 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_2_0_aligned()
19035 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_2_0_aligned()
19037 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_2_0_aligned()
19038 std::int32_t* result_chunk = result; in gemm_i32_0_2_0_aligned()
19039 std::int32_t* mul_result_chunk = result; in gemm_i32_0_2_0_aligned()
19040 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_2_0_aligned()
19069 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_0_2_1_aligned()
19070 std::int32_t n, std::int32_t k, in gemm_i32_0_2_1_aligned()
19071 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_0_2_1_aligned()
19072 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_0_2_1_aligned()
19073 const std::int32_t row_chunks = m / 3; in gemm_i32_0_2_1_aligned()
19074 const std::int32_t col_chunks = n / 3; in gemm_i32_0_2_1_aligned()
19075 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_2_1_aligned()
19076 const std::int32_t chunk_size = k * 3; in gemm_i32_0_2_1_aligned()
19077 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_2_1_aligned()
19078 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_2_1_aligned()
19082 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_2_1_aligned()
19083 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_2_1_aligned()
19086 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_2_1_aligned()
19088 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_2_1_aligned()
19089 std::int32_t* result_chunk = result; in gemm_i32_0_2_1_aligned()
19090 std::int32_t* mul_result_chunk = result; in gemm_i32_0_2_1_aligned()
19091 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_2_1_aligned()
19120 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_0_2_2_aligned()
19121 std::int32_t n, std::int32_t k, in gemm_i32_0_2_2_aligned()
19122 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_0_2_2_aligned()
19123 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_0_2_2_aligned()
19124 const std::int32_t row_chunks = m / 3; in gemm_i32_0_2_2_aligned()
19125 const std::int32_t col_chunks = n / 3; in gemm_i32_0_2_2_aligned()
19126 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_2_2_aligned()
19127 const std::int32_t chunk_size = k * 3; in gemm_i32_0_2_2_aligned()
19128 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_2_2_aligned()
19129 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_2_2_aligned()
19133 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_2_2_aligned()
19134 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_2_2_aligned()
19137 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_2_2_aligned()
19139 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_2_2_aligned()
19140 std::int32_t* result_chunk = result; in gemm_i32_0_2_2_aligned()
19141 std::int32_t* mul_result_chunk = result; in gemm_i32_0_2_2_aligned()
19142 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_2_2_aligned()
19171 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_0_2_3_aligned()
19172 std::int32_t n, std::int32_t k, in gemm_i32_0_2_3_aligned()
19173 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_0_2_3_aligned()
19174 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_0_2_3_aligned()
19175 const std::int32_t row_chunks = m / 3; in gemm_i32_0_2_3_aligned()
19176 const std::int32_t col_chunks = n / 3; in gemm_i32_0_2_3_aligned()
19177 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_2_3_aligned()
19178 const std::int32_t chunk_size = k * 3; in gemm_i32_0_2_3_aligned()
19179 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_2_3_aligned()
19180 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_2_3_aligned()
19184 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_2_3_aligned()
19185 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_2_3_aligned()
19188 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_2_3_aligned()
19190 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_2_3_aligned()
19191 std::int32_t* result_chunk = result; in gemm_i32_0_2_3_aligned()
19192 std::int32_t* mul_result_chunk = result; in gemm_i32_0_2_3_aligned()
19193 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_2_3_aligned()
19222 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_0_2_4_aligned()
19223 std::int32_t n, std::int32_t k, in gemm_i32_0_2_4_aligned()
19224 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_0_2_4_aligned()
19225 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_0_2_4_aligned()
19226 const std::int32_t row_chunks = m / 3; in gemm_i32_0_2_4_aligned()
19227 const std::int32_t col_chunks = n / 3; in gemm_i32_0_2_4_aligned()
19228 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_2_4_aligned()
19229 const std::int32_t chunk_size = k * 3; in gemm_i32_0_2_4_aligned()
19230 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_2_4_aligned()
19231 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_2_4_aligned()
19235 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_2_4_aligned()
19236 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_2_4_aligned()
19239 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_2_4_aligned()
19241 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_2_4_aligned()
19242 std::int32_t* result_chunk = result; in gemm_i32_0_2_4_aligned()
19243 std::int32_t* mul_result_chunk = result; in gemm_i32_0_2_4_aligned()
19244 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_2_4_aligned()
19273 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_0_2_5_aligned()
19274 std::int32_t n, std::int32_t k, in gemm_i32_0_2_5_aligned()
19275 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_0_2_5_aligned()
19276 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_0_2_5_aligned()
19277 const std::int32_t row_chunks = m / 3; in gemm_i32_0_2_5_aligned()
19278 const std::int32_t col_chunks = n / 3; in gemm_i32_0_2_5_aligned()
19279 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_2_5_aligned()
19280 const std::int32_t chunk_size = k * 3; in gemm_i32_0_2_5_aligned()
19281 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_2_5_aligned()
19282 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_2_5_aligned()
19286 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_2_5_aligned()
19287 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_2_5_aligned()
19290 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_2_5_aligned()
19292 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_2_5_aligned()
19293 std::int32_t* result_chunk = result; in gemm_i32_0_2_5_aligned()
19294 std::int32_t* mul_result_chunk = result; in gemm_i32_0_2_5_aligned()
19295 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_2_5_aligned()
19324 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_0_2_6_aligned()
19325 std::int32_t n, std::int32_t k, in gemm_i32_0_2_6_aligned()
19326 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_0_2_6_aligned()
19327 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_0_2_6_aligned()
19328 const std::int32_t row_chunks = m / 3; in gemm_i32_0_2_6_aligned()
19329 const std::int32_t col_chunks = n / 3; in gemm_i32_0_2_6_aligned()
19330 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_2_6_aligned()
19331 const std::int32_t chunk_size = k * 3; in gemm_i32_0_2_6_aligned()
19332 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_2_6_aligned()
19333 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_2_6_aligned()
19337 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_2_6_aligned()
19338 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_2_6_aligned()
19341 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_2_6_aligned()
19343 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_2_6_aligned()
19344 std::int32_t* result_chunk = result; in gemm_i32_0_2_6_aligned()
19345 std::int32_t* mul_result_chunk = result; in gemm_i32_0_2_6_aligned()
19346 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_2_6_aligned()
19375 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_0_2_7_aligned()
19376 std::int32_t n, std::int32_t k, in gemm_i32_0_2_7_aligned()
19377 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_0_2_7_aligned()
19378 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_0_2_7_aligned()
19379 const std::int32_t row_chunks = m / 3; in gemm_i32_0_2_7_aligned()
19380 const std::int32_t col_chunks = n / 3; in gemm_i32_0_2_7_aligned()
19381 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_2_7_aligned()
19382 const std::int32_t chunk_size = k * 3; in gemm_i32_0_2_7_aligned()
19383 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_2_7_aligned()
19384 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_2_7_aligned()
19388 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_2_7_aligned()
19389 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_2_7_aligned()
19392 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_2_7_aligned()
19394 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_2_7_aligned()
19395 std::int32_t* result_chunk = result; in gemm_i32_0_2_7_aligned()
19396 std::int32_t* mul_result_chunk = result; in gemm_i32_0_2_7_aligned()
19397 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_2_7_aligned()
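The naming pattern is consistent across the listing: row_chunks = m / 3, col_chunks = n / 3 and padded_k = ((k + 7) / 8) * 8 appear in every variant, and the three suffix digits in gemm_i32_<r>_<c>_<d>_aligned run over r, c in 0..2 and d in 0..7. That strongly suggests each specialization handles one combination of leftover rows (m mod 3), leftover columns (n mod 3) and leftover depth (k mod 8); the code that actually selects a variant is not part of this listing, so the sketch below is only a hypothetical illustration of that mapping.

    // Hypothetical: shows how the suffix digits line up with the leftovers
    // implied by the m / 3, n / 3 and ((k + 7) / 8) * 8 lines above.
    #include <cstdint>

    struct KernelLeftovers {
      std::int32_t rows;   // first digit, 0..2  (m % 3)
      std::int32_t cols;   // second digit, 0..2 (n % 3)
      std::int32_t depth;  // third digit, 0..7  (k % 8)
    };

    inline KernelLeftovers kernel_leftovers(std::int32_t m, std::int32_t n,
                                            std::int32_t k) {
      return KernelLeftovers{m % 3, n % 3, k % 8};
    }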
19426 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_1_0_0_aligned()
19427 std::int32_t n, std::int32_t k, in gemm_i32_1_0_0_aligned()
19428 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_1_0_0_aligned()
19429 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_1_0_0_aligned()
19430 const std::int32_t row_chunks = m / 3; in gemm_i32_1_0_0_aligned()
19431 const std::int32_t col_chunks = n / 3; in gemm_i32_1_0_0_aligned()
19432 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_0_0_aligned()
19433 const std::int32_t chunk_size = k * 3; in gemm_i32_1_0_0_aligned()
19434 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_0_0_aligned()
19435 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_0_0_aligned()
19439 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_0_0_aligned()
19440 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_0_0_aligned()
19441 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_0_0_aligned()
19442 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_0_0_aligned()
19445 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_0_0_aligned()
19447 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_0_0_aligned()
19448 std::int32_t* result_chunk = result; in gemm_i32_1_0_0_aligned()
19449 std::int32_t* mul_result_chunk = result; in gemm_i32_1_0_0_aligned()
19450 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_0_0_aligned()
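gemm_i32_1_0_0_aligned is the first variant above that takes two offset pointers into the packed left-hand side: zipped_lhs_3_offsets sits at zipped_lhs + padded_k * 3 and zipped_lhs_1_offsets at zipped_lhs + padded_k * 1. The reinterpret_casts imply a layout in which each packed chunk stores its rows of padded_k bytes first, followed immediately by one std::int32_t offset slot per row: full chunks pack 3 rows, while this family also packs a final 1-row chunk for the leftover row, hence the second pointer. A minimal sketch of that assumed layout (the packing code itself is not among the matching lines):

    // Assumed packed-LHS layout: 'rows' rows of padded_k bytes, then one
    // std::int32_t offset slot per row. Inferred from the casts above only.
    #include <cstdint>

    inline std::int32_t* packed_row_offsets(std::uint8_t* zipped_lhs,
                                            std::int32_t padded_k,
                                            std::int32_t rows) {
      return reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * rows);
    }

    // packed_row_offsets(zipped_lhs, padded_k, 3) -> zipped_lhs_3_offsets
    // packed_row_offsets(zipped_lhs, padded_k, 1) -> zipped_lhs_1_offsets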
19486 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_1_0_1_aligned()
19487 std::int32_t n, std::int32_t k, in gemm_i32_1_0_1_aligned()
19488 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_1_0_1_aligned()
19489 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_1_0_1_aligned()
19490 const std::int32_t row_chunks = m / 3; in gemm_i32_1_0_1_aligned()
19491 const std::int32_t col_chunks = n / 3; in gemm_i32_1_0_1_aligned()
19492 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_0_1_aligned()
19493 const std::int32_t chunk_size = k * 3; in gemm_i32_1_0_1_aligned()
19494 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_0_1_aligned()
19495 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_0_1_aligned()
19499 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_0_1_aligned()
19500 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_0_1_aligned()
19501 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_0_1_aligned()
19502 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_0_1_aligned()
19505 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_0_1_aligned()
19507 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_0_1_aligned()
19508 std::int32_t* result_chunk = result; in gemm_i32_1_0_1_aligned()
19509 std::int32_t* mul_result_chunk = result; in gemm_i32_1_0_1_aligned()
19510 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_0_1_aligned()
19546 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_1_0_2_aligned()
19547 std::int32_t n, std::int32_t k, in gemm_i32_1_0_2_aligned()
19548 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_1_0_2_aligned()
19549 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_1_0_2_aligned()
19550 const std::int32_t row_chunks = m / 3; in gemm_i32_1_0_2_aligned()
19551 const std::int32_t col_chunks = n / 3; in gemm_i32_1_0_2_aligned()
19552 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_0_2_aligned()
19553 const std::int32_t chunk_size = k * 3; in gemm_i32_1_0_2_aligned()
19554 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_0_2_aligned()
19555 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_0_2_aligned()
19559 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_0_2_aligned()
19560 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_0_2_aligned()
19561 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_0_2_aligned()
19562 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_0_2_aligned()
19565 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_0_2_aligned()
19567 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_0_2_aligned()
19568 std::int32_t* result_chunk = result; in gemm_i32_1_0_2_aligned()
19569 std::int32_t* mul_result_chunk = result; in gemm_i32_1_0_2_aligned()
19570 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_0_2_aligned()
19606 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_1_0_3_aligned()
19607 std::int32_t n, std::int32_t k, in gemm_i32_1_0_3_aligned()
19608 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_1_0_3_aligned()
19609 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_1_0_3_aligned()
19610 const std::int32_t row_chunks = m / 3; in gemm_i32_1_0_3_aligned()
19611 const std::int32_t col_chunks = n / 3; in gemm_i32_1_0_3_aligned()
19612 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_0_3_aligned()
19613 const std::int32_t chunk_size = k * 3; in gemm_i32_1_0_3_aligned()
19614 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_0_3_aligned()
19615 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_0_3_aligned()
19619 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_0_3_aligned()
19620 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_0_3_aligned()
19621 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_0_3_aligned()
19622 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_0_3_aligned()
19625 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_0_3_aligned()
19627 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_0_3_aligned()
19628 std::int32_t* result_chunk = result; in gemm_i32_1_0_3_aligned()
19629 std::int32_t* mul_result_chunk = result; in gemm_i32_1_0_3_aligned()
19630 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_0_3_aligned()
19666 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_1_0_4_aligned()
19667 std::int32_t n, std::int32_t k, in gemm_i32_1_0_4_aligned()
19668 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_1_0_4_aligned()
19669 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_1_0_4_aligned()
19670 const std::int32_t row_chunks = m / 3; in gemm_i32_1_0_4_aligned()
19671 const std::int32_t col_chunks = n / 3; in gemm_i32_1_0_4_aligned()
19672 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_0_4_aligned()
19673 const std::int32_t chunk_size = k * 3; in gemm_i32_1_0_4_aligned()
19674 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_0_4_aligned()
19675 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_0_4_aligned()
19679 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_0_4_aligned()
19680 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_0_4_aligned()
19681 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_0_4_aligned()
19682 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_0_4_aligned()
19685 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_0_4_aligned()
19687 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_0_4_aligned()
19688 std::int32_t* result_chunk = result; in gemm_i32_1_0_4_aligned()
19689 std::int32_t* mul_result_chunk = result; in gemm_i32_1_0_4_aligned()
19690 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_0_4_aligned()
19726 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_1_0_5_aligned()
19727 std::int32_t n, std::int32_t k, in gemm_i32_1_0_5_aligned()
19728 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_1_0_5_aligned()
19729 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_1_0_5_aligned()
19730 const std::int32_t row_chunks = m / 3; in gemm_i32_1_0_5_aligned()
19731 const std::int32_t col_chunks = n / 3; in gemm_i32_1_0_5_aligned()
19732 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_0_5_aligned()
19733 const std::int32_t chunk_size = k * 3; in gemm_i32_1_0_5_aligned()
19734 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_0_5_aligned()
19735 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_0_5_aligned()
19739 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_0_5_aligned()
19740 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_0_5_aligned()
19741 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_0_5_aligned()
19742 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_0_5_aligned()
19745 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_0_5_aligned()
19747 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_0_5_aligned()
19748 std::int32_t* result_chunk = result; in gemm_i32_1_0_5_aligned()
19749 std::int32_t* mul_result_chunk = result; in gemm_i32_1_0_5_aligned()
19750 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_0_5_aligned()
19786 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_1_0_6_aligned()
19787 std::int32_t n, std::int32_t k, in gemm_i32_1_0_6_aligned()
19788 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_1_0_6_aligned()
19789 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_1_0_6_aligned()
19790 const std::int32_t row_chunks = m / 3; in gemm_i32_1_0_6_aligned()
19791 const std::int32_t col_chunks = n / 3; in gemm_i32_1_0_6_aligned()
19792 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_0_6_aligned()
19793 const std::int32_t chunk_size = k * 3; in gemm_i32_1_0_6_aligned()
19794 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_0_6_aligned()
19795 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_0_6_aligned()
19799 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_0_6_aligned()
19800 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_0_6_aligned()
19801 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_0_6_aligned()
19802 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_0_6_aligned()
19805 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_0_6_aligned()
19807 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_0_6_aligned()
19808 std::int32_t* result_chunk = result; in gemm_i32_1_0_6_aligned()
19809 std::int32_t* mul_result_chunk = result; in gemm_i32_1_0_6_aligned()
19810 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_0_6_aligned()
19846 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_1_0_7_aligned()
19847 std::int32_t n, std::int32_t k, in gemm_i32_1_0_7_aligned()
19848 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_1_0_7_aligned()
19849 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_1_0_7_aligned()
19850 const std::int32_t row_chunks = m / 3; in gemm_i32_1_0_7_aligned()
19851 const std::int32_t col_chunks = n / 3; in gemm_i32_1_0_7_aligned()
19852 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_0_7_aligned()
19853 const std::int32_t chunk_size = k * 3; in gemm_i32_1_0_7_aligned()
19854 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_0_7_aligned()
19855 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_0_7_aligned()
19859 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_0_7_aligned()
19860 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_0_7_aligned()
19861 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_0_7_aligned()
19862 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_0_7_aligned()
19865 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_0_7_aligned()
19867 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_0_7_aligned()
19868 std::int32_t* result_chunk = result; in gemm_i32_1_0_7_aligned()
19869 std::int32_t* mul_result_chunk = result; in gemm_i32_1_0_7_aligned()
19870 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_0_7_aligned()
19906 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_1_1_0_aligned()
19907 std::int32_t n, std::int32_t k, in gemm_i32_1_1_0_aligned()
19908 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_1_1_0_aligned()
19909 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_1_1_0_aligned()
19910 const std::int32_t row_chunks = m / 3; in gemm_i32_1_1_0_aligned()
19911 const std::int32_t col_chunks = n / 3; in gemm_i32_1_1_0_aligned()
19912 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_1_0_aligned()
19913 const std::int32_t chunk_size = k * 3; in gemm_i32_1_1_0_aligned()
19914 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_1_0_aligned()
19915 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_1_0_aligned()
19919 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_1_0_aligned()
19920 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_1_0_aligned()
19921 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_1_0_aligned()
19922 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_1_0_aligned()
19925 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_1_0_aligned()
19927 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_1_0_aligned()
19928 std::int32_t* result_chunk = result; in gemm_i32_1_1_0_aligned()
19929 std::int32_t* mul_result_chunk = result; in gemm_i32_1_1_0_aligned()
19930 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_1_0_aligned()
19973 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_1_1_1_aligned()
19974 std::int32_t n, std::int32_t k, in gemm_i32_1_1_1_aligned()
19975 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_1_1_1_aligned()
19976 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_1_1_1_aligned()
19977 const std::int32_t row_chunks = m / 3; in gemm_i32_1_1_1_aligned()
19978 const std::int32_t col_chunks = n / 3; in gemm_i32_1_1_1_aligned()
19979 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_1_1_aligned()
19980 const std::int32_t chunk_size = k * 3; in gemm_i32_1_1_1_aligned()
19981 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_1_1_aligned()
19982 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_1_1_aligned()
19986 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_1_1_aligned()
19987 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_1_1_aligned()
19988 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_1_1_aligned()
19989 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_1_1_aligned()
19992 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_1_1_aligned()
19994 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_1_1_aligned()
19995 std::int32_t* result_chunk = result; in gemm_i32_1_1_1_aligned()
19996 std::int32_t* mul_result_chunk = result; in gemm_i32_1_1_1_aligned()
19997 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_1_1_aligned()
20040 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_1_1_2_aligned()
20041 std::int32_t n, std::int32_t k, in gemm_i32_1_1_2_aligned()
20042 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_1_1_2_aligned()
20043 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_1_1_2_aligned()
20044 const std::int32_t row_chunks = m / 3; in gemm_i32_1_1_2_aligned()
20045 const std::int32_t col_chunks = n / 3; in gemm_i32_1_1_2_aligned()
20046 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_1_2_aligned()
20047 const std::int32_t chunk_size = k * 3; in gemm_i32_1_1_2_aligned()
20048 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_1_2_aligned()
20049 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_1_2_aligned()
20053 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_1_2_aligned()
20054 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_1_2_aligned()
20055 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_1_2_aligned()
20056 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_1_2_aligned()
20059 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_1_2_aligned()
20061 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_1_2_aligned()
20062 std::int32_t* result_chunk = result; in gemm_i32_1_1_2_aligned()
20063 std::int32_t* mul_result_chunk = result; in gemm_i32_1_1_2_aligned()
20064 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_1_2_aligned()
20107 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_1_1_3_aligned()
20108 std::int32_t n, std::int32_t k, in gemm_i32_1_1_3_aligned()
20109 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_1_1_3_aligned()
20110 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_1_1_3_aligned()
20111 const std::int32_t row_chunks = m / 3; in gemm_i32_1_1_3_aligned()
20112 const std::int32_t col_chunks = n / 3; in gemm_i32_1_1_3_aligned()
20113 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_1_3_aligned()
20114 const std::int32_t chunk_size = k * 3; in gemm_i32_1_1_3_aligned()
20115 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_1_3_aligned()
20116 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_1_3_aligned()
20120 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_1_3_aligned()
20121 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_1_3_aligned()
20122 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_1_3_aligned()
20123 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_1_3_aligned()
20126 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_1_3_aligned()
20128 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_1_3_aligned()
20129 std::int32_t* result_chunk = result; in gemm_i32_1_1_3_aligned()
20130 std::int32_t* mul_result_chunk = result; in gemm_i32_1_1_3_aligned()
20131 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_1_3_aligned()
20174 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_1_1_4_aligned()
20175 std::int32_t n, std::int32_t k, in gemm_i32_1_1_4_aligned()
20176 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_1_1_4_aligned()
20177 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_1_1_4_aligned()
20178 const std::int32_t row_chunks = m / 3; in gemm_i32_1_1_4_aligned()
20179 const std::int32_t col_chunks = n / 3; in gemm_i32_1_1_4_aligned()
20180 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_1_4_aligned()
20181 const std::int32_t chunk_size = k * 3; in gemm_i32_1_1_4_aligned()
20182 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_1_4_aligned()
20183 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_1_4_aligned()
20187 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_1_4_aligned()
20188 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_1_4_aligned()
20189 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_1_4_aligned()
20190 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_1_4_aligned()
20193 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_1_4_aligned()
20195 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_1_4_aligned()
20196 std::int32_t* result_chunk = result; in gemm_i32_1_1_4_aligned()
20197 std::int32_t* mul_result_chunk = result; in gemm_i32_1_1_4_aligned()
20198 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_1_4_aligned()
20241 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_1_1_5_aligned()
20242 std::int32_t n, std::int32_t k, in gemm_i32_1_1_5_aligned()
20243 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_1_1_5_aligned()
20244 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_1_1_5_aligned()
20245 const std::int32_t row_chunks = m / 3; in gemm_i32_1_1_5_aligned()
20246 const std::int32_t col_chunks = n / 3; in gemm_i32_1_1_5_aligned()
20247 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_1_5_aligned()
20248 const std::int32_t chunk_size = k * 3; in gemm_i32_1_1_5_aligned()
20249 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_1_5_aligned()
20250 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_1_5_aligned()
20254 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_1_5_aligned()
20255 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_1_5_aligned()
20256 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_1_5_aligned()
20257 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_1_5_aligned()
20260 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_1_5_aligned()
20262 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_1_5_aligned()
20263 std::int32_t* result_chunk = result; in gemm_i32_1_1_5_aligned()
20264 std::int32_t* mul_result_chunk = result; in gemm_i32_1_1_5_aligned()
20265 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_1_5_aligned()
20308 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_1_1_6_aligned()
20309 std::int32_t n, std::int32_t k, in gemm_i32_1_1_6_aligned()
20310 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_1_1_6_aligned()
20311 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_1_1_6_aligned()
20312 const std::int32_t row_chunks = m / 3; in gemm_i32_1_1_6_aligned()
20313 const std::int32_t col_chunks = n / 3; in gemm_i32_1_1_6_aligned()
20314 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_1_6_aligned()
20315 const std::int32_t chunk_size = k * 3; in gemm_i32_1_1_6_aligned()
20316 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_1_6_aligned()
20317 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_1_6_aligned()
20321 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_1_6_aligned()
20322 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_1_6_aligned()
20323 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_1_6_aligned()
20324 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_1_6_aligned()
20327 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_1_6_aligned()
20329 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_1_6_aligned()
20330 std::int32_t* result_chunk = result; in gemm_i32_1_1_6_aligned()
20331 std::int32_t* mul_result_chunk = result; in gemm_i32_1_1_6_aligned()
20332 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_1_6_aligned()
20375 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_1_1_7_aligned()
20376 std::int32_t n, std::int32_t k, in gemm_i32_1_1_7_aligned()
20377 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_1_1_7_aligned()
20378 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_1_1_7_aligned()
20379 const std::int32_t row_chunks = m / 3; in gemm_i32_1_1_7_aligned()
20380 const std::int32_t col_chunks = n / 3; in gemm_i32_1_1_7_aligned()
20381 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_1_7_aligned()
20382 const std::int32_t chunk_size = k * 3; in gemm_i32_1_1_7_aligned()
20383 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_1_7_aligned()
20384 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_1_7_aligned()
20388 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_1_7_aligned()
20389 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_1_7_aligned()
20390 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_1_7_aligned()
20391 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_1_7_aligned()
20394 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_1_7_aligned()
20396 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_1_7_aligned()
20397 std::int32_t* result_chunk = result; in gemm_i32_1_1_7_aligned()
20398 std::int32_t* mul_result_chunk = result; in gemm_i32_1_1_7_aligned()
20399 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_1_7_aligned()
20442 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_1_2_0_aligned()
20443 std::int32_t n, std::int32_t k, in gemm_i32_1_2_0_aligned()
20444 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_1_2_0_aligned()
20445 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_1_2_0_aligned()
20446 const std::int32_t row_chunks = m / 3; in gemm_i32_1_2_0_aligned()
20447 const std::int32_t col_chunks = n / 3; in gemm_i32_1_2_0_aligned()
20448 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_2_0_aligned()
20449 const std::int32_t chunk_size = k * 3; in gemm_i32_1_2_0_aligned()
20450 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_2_0_aligned()
20451 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_2_0_aligned()
20455 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_2_0_aligned()
20456 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_2_0_aligned()
20457 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_2_0_aligned()
20458 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_2_0_aligned()
20461 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_2_0_aligned()
20463 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_2_0_aligned()
20464 std::int32_t* result_chunk = result; in gemm_i32_1_2_0_aligned()
20465 std::int32_t* mul_result_chunk = result; in gemm_i32_1_2_0_aligned()
20466 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_2_0_aligned()
20509 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_1_2_1_aligned()
20510 std::int32_t n, std::int32_t k, in gemm_i32_1_2_1_aligned()
20511 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_1_2_1_aligned()
20512 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_1_2_1_aligned()
20513 const std::int32_t row_chunks = m / 3; in gemm_i32_1_2_1_aligned()
20514 const std::int32_t col_chunks = n / 3; in gemm_i32_1_2_1_aligned()
20515 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_2_1_aligned()
20516 const std::int32_t chunk_size = k * 3; in gemm_i32_1_2_1_aligned()
20517 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_2_1_aligned()
20518 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_2_1_aligned()
20522 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_2_1_aligned()
20523 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_2_1_aligned()
20524 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_2_1_aligned()
20525 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_2_1_aligned()
20528 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_2_1_aligned()
20530 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_2_1_aligned()
20531 std::int32_t* result_chunk = result; in gemm_i32_1_2_1_aligned()
20532 std::int32_t* mul_result_chunk = result; in gemm_i32_1_2_1_aligned()
20533 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_2_1_aligned()
20576 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_1_2_2_aligned()
20577 std::int32_t n, std::int32_t k, in gemm_i32_1_2_2_aligned()
20578 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_1_2_2_aligned()
20579 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_1_2_2_aligned()
20580 const std::int32_t row_chunks = m / 3; in gemm_i32_1_2_2_aligned()
20581 const std::int32_t col_chunks = n / 3; in gemm_i32_1_2_2_aligned()
20582 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_2_2_aligned()
20583 const std::int32_t chunk_size = k * 3; in gemm_i32_1_2_2_aligned()
20584 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_2_2_aligned()
20585 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_2_2_aligned()
20589 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_2_2_aligned()
20590 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_2_2_aligned()
20591 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_2_2_aligned()
20592 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_2_2_aligned()
20595 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_2_2_aligned()
20597 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_2_2_aligned()
20598 std::int32_t* result_chunk = result; in gemm_i32_1_2_2_aligned()
20599 std::int32_t* mul_result_chunk = result; in gemm_i32_1_2_2_aligned()
20600 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_2_2_aligned()
20643 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_1_2_3_aligned()
20644 std::int32_t n, std::int32_t k, in gemm_i32_1_2_3_aligned()
20645 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_1_2_3_aligned()
20646 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_1_2_3_aligned()
20647 const std::int32_t row_chunks = m / 3; in gemm_i32_1_2_3_aligned()
20648 const std::int32_t col_chunks = n / 3; in gemm_i32_1_2_3_aligned()
20649 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_2_3_aligned()
20650 const std::int32_t chunk_size = k * 3; in gemm_i32_1_2_3_aligned()
20651 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_2_3_aligned()
20652 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_2_3_aligned()
20656 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_2_3_aligned()
20657 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_2_3_aligned()
20658 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_2_3_aligned()
20659 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_2_3_aligned()
20662 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_2_3_aligned()
20664 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_2_3_aligned()
20665 std::int32_t* result_chunk = result; in gemm_i32_1_2_3_aligned()
20666 std::int32_t* mul_result_chunk = result; in gemm_i32_1_2_3_aligned()
20667 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_2_3_aligned()
20710 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_1_2_4_aligned()
20711 std::int32_t n, std::int32_t k, in gemm_i32_1_2_4_aligned()
20712 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_1_2_4_aligned()
20713 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_1_2_4_aligned()
20714 const std::int32_t row_chunks = m / 3; in gemm_i32_1_2_4_aligned()
20715 const std::int32_t col_chunks = n / 3; in gemm_i32_1_2_4_aligned()
20716 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_2_4_aligned()
20717 const std::int32_t chunk_size = k * 3; in gemm_i32_1_2_4_aligned()
20718 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_2_4_aligned()
20719 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_2_4_aligned()
20723 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_2_4_aligned()
20724 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_2_4_aligned()
20725 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_2_4_aligned()
20726 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_2_4_aligned()
20729 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_2_4_aligned()
20731 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_2_4_aligned()
20732 std::int32_t* result_chunk = result; in gemm_i32_1_2_4_aligned()
20733 std::int32_t* mul_result_chunk = result; in gemm_i32_1_2_4_aligned()
20734 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_2_4_aligned()
20777 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_1_2_5_aligned()
20778 std::int32_t n, std::int32_t k, in gemm_i32_1_2_5_aligned()
20779 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_1_2_5_aligned()
20780 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_1_2_5_aligned()
20781 const std::int32_t row_chunks = m / 3; in gemm_i32_1_2_5_aligned()
20782 const std::int32_t col_chunks = n / 3; in gemm_i32_1_2_5_aligned()
20783 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_2_5_aligned()
20784 const std::int32_t chunk_size = k * 3; in gemm_i32_1_2_5_aligned()
20785 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_2_5_aligned()
20786 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_2_5_aligned()
20790 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_2_5_aligned()
20791 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_2_5_aligned()
20792 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_2_5_aligned()
20793 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_2_5_aligned()
20796 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_2_5_aligned()
20798 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_2_5_aligned()
20799 std::int32_t* result_chunk = result; in gemm_i32_1_2_5_aligned()
20800 std::int32_t* mul_result_chunk = result; in gemm_i32_1_2_5_aligned()
20801 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_2_5_aligned()
20844 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_1_2_6_aligned()
20845 std::int32_t n, std::int32_t k, in gemm_i32_1_2_6_aligned()
20846 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_1_2_6_aligned()
20847 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_1_2_6_aligned()
20848 const std::int32_t row_chunks = m / 3; in gemm_i32_1_2_6_aligned()
20849 const std::int32_t col_chunks = n / 3; in gemm_i32_1_2_6_aligned()
20850 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_2_6_aligned()
20851 const std::int32_t chunk_size = k * 3; in gemm_i32_1_2_6_aligned()
20852 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_2_6_aligned()
20853 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_2_6_aligned()
20857 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_2_6_aligned()
20858 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_2_6_aligned()
20859 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_2_6_aligned()
20860 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_2_6_aligned()
20863 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_2_6_aligned()
20865 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_2_6_aligned()
20866 std::int32_t* result_chunk = result; in gemm_i32_1_2_6_aligned()
20867 std::int32_t* mul_result_chunk = result; in gemm_i32_1_2_6_aligned()
20868 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_2_6_aligned()
20911 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_1_2_7_aligned()
20912 std::int32_t n, std::int32_t k, in gemm_i32_1_2_7_aligned()
20913 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_1_2_7_aligned()
20914 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_1_2_7_aligned()
20915 const std::int32_t row_chunks = m / 3; in gemm_i32_1_2_7_aligned()
20916 const std::int32_t col_chunks = n / 3; in gemm_i32_1_2_7_aligned()
20917 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_2_7_aligned()
20918 const std::int32_t chunk_size = k * 3; in gemm_i32_1_2_7_aligned()
20919 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_2_7_aligned()
20920 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_2_7_aligned()
20924 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_2_7_aligned()
20925 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_2_7_aligned()
20926 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_2_7_aligned()
20927 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_2_7_aligned()
20930 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_2_7_aligned()
20932 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_2_7_aligned()
20933 std::int32_t* result_chunk = result; in gemm_i32_1_2_7_aligned()
20934 std::int32_t* mul_result_chunk = result; in gemm_i32_1_2_7_aligned()
20935 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_2_7_aligned()
20978 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_2_0_0_aligned()
20979 std::int32_t n, std::int32_t k, in gemm_i32_2_0_0_aligned()
20980 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_2_0_0_aligned()
20981 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_2_0_0_aligned()
20982 const std::int32_t row_chunks = m / 3; in gemm_i32_2_0_0_aligned()
20983 const std::int32_t col_chunks = n / 3; in gemm_i32_2_0_0_aligned()
20984 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_0_0_aligned()
20985 const std::int32_t chunk_size = k * 3; in gemm_i32_2_0_0_aligned()
20986 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_0_0_aligned()
20987 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_0_0_aligned()
20991 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_0_0_aligned()
20992 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_0_0_aligned()
20993 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_0_0_aligned()
20994 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_0_0_aligned()
20997 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_0_0_aligned()
20999 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_0_0_aligned()
21000 std::int32_t* result_chunk = result; in gemm_i32_2_0_0_aligned()
21001 std::int32_t* mul_result_chunk = result; in gemm_i32_2_0_0_aligned()
21002 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_0_0_aligned()
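
The matches also show where each variant parks its per-row offset words inside the packed LHS buffer: the gemm_i32_1_* functions cast zipped_lhs + padded_k * 1 to obtain zipped_lhs_1_offsets, the gemm_i32_2_* functions cast zipped_lhs + padded_k * 2 for zipped_lhs_2_offsets, and every variant additionally casts zipped_lhs + padded_k * 3 for zipped_lhs_3_offsets. A plausible reading, not stated anywhere in the listing, is that the leading digit counts leftover LHS rows handled by that specialization. The helper below only restates the pointer arithmetic; its name and the rows parameter are illustrative.

    #include <cstdint>

    // Restates the reinterpret_cast pattern from the matched lines; the
    // helper name and the 'rows' parameter are illustrative assumptions.
    inline std::int32_t* zipped_lhs_offsets_sketch(std::uint8_t* zipped_lhs,
                                                   std::int32_t padded_k,
                                                   std::int32_t rows) {
      // rows == 1: zipped_lhs_1_offsets   (gemm_i32_1_* variants)
      // rows == 2: zipped_lhs_2_offsets   (gemm_i32_2_* variants)
      // rows == 3: zipped_lhs_3_offsets   (all variants above)
      return reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * rows);
    }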
21038 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_2_0_1_aligned()
21039 std::int32_t n, std::int32_t k, in gemm_i32_2_0_1_aligned()
21040 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_2_0_1_aligned()
21041 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_2_0_1_aligned()
21042 const std::int32_t row_chunks = m / 3; in gemm_i32_2_0_1_aligned()
21043 const std::int32_t col_chunks = n / 3; in gemm_i32_2_0_1_aligned()
21044 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_0_1_aligned()
21045 const std::int32_t chunk_size = k * 3; in gemm_i32_2_0_1_aligned()
21046 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_0_1_aligned()
21047 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_0_1_aligned()
21051 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_0_1_aligned()
21052 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_0_1_aligned()
21053 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_0_1_aligned()
21054 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_0_1_aligned()
21057 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_0_1_aligned()
21059 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_0_1_aligned()
21060 std::int32_t* result_chunk = result; in gemm_i32_2_0_1_aligned()
21061 std::int32_t* mul_result_chunk = result; in gemm_i32_2_0_1_aligned()
21062 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_0_1_aligned()
21098 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_2_0_2_aligned()
21099 std::int32_t n, std::int32_t k, in gemm_i32_2_0_2_aligned()
21100 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_2_0_2_aligned()
21101 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_2_0_2_aligned()
21102 const std::int32_t row_chunks = m / 3; in gemm_i32_2_0_2_aligned()
21103 const std::int32_t col_chunks = n / 3; in gemm_i32_2_0_2_aligned()
21104 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_0_2_aligned()
21105 const std::int32_t chunk_size = k * 3; in gemm_i32_2_0_2_aligned()
21106 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_0_2_aligned()
21107 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_0_2_aligned()
21111 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_0_2_aligned()
21112 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_0_2_aligned()
21113 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_0_2_aligned()
21114 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_0_2_aligned()
21117 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_0_2_aligned()
21119 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_0_2_aligned()
21120 std::int32_t* result_chunk = result; in gemm_i32_2_0_2_aligned()
21121 std::int32_t* mul_result_chunk = result; in gemm_i32_2_0_2_aligned()
21122 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_0_2_aligned()
21158 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_2_0_3_aligned()
21159 std::int32_t n, std::int32_t k, in gemm_i32_2_0_3_aligned()
21160 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_2_0_3_aligned()
21161 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_2_0_3_aligned()
21162 const std::int32_t row_chunks = m / 3; in gemm_i32_2_0_3_aligned()
21163 const std::int32_t col_chunks = n / 3; in gemm_i32_2_0_3_aligned()
21164 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_0_3_aligned()
21165 const std::int32_t chunk_size = k * 3; in gemm_i32_2_0_3_aligned()
21166 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_0_3_aligned()
21167 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_0_3_aligned()
21171 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_0_3_aligned()
21172 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_0_3_aligned()
21173 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_0_3_aligned()
21174 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_0_3_aligned()
21177 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_0_3_aligned()
21179 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_0_3_aligned()
21180 std::int32_t* result_chunk = result; in gemm_i32_2_0_3_aligned()
21181 std::int32_t* mul_result_chunk = result; in gemm_i32_2_0_3_aligned()
21182 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_0_3_aligned()
21218 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_2_0_4_aligned()
21219 std::int32_t n, std::int32_t k, in gemm_i32_2_0_4_aligned()
21220 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_2_0_4_aligned()
21221 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_2_0_4_aligned()
21222 const std::int32_t row_chunks = m / 3; in gemm_i32_2_0_4_aligned()
21223 const std::int32_t col_chunks = n / 3; in gemm_i32_2_0_4_aligned()
21224 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_0_4_aligned()
21225 const std::int32_t chunk_size = k * 3; in gemm_i32_2_0_4_aligned()
21226 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_0_4_aligned()
21227 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_0_4_aligned()
21231 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_0_4_aligned()
21232 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_0_4_aligned()
21233 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_0_4_aligned()
21234 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_0_4_aligned()
21237 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_0_4_aligned()
21239 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_0_4_aligned()
21240 std::int32_t* result_chunk = result; in gemm_i32_2_0_4_aligned()
21241 std::int32_t* mul_result_chunk = result; in gemm_i32_2_0_4_aligned()
21242 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_0_4_aligned()
21278 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_2_0_5_aligned()
21279 std::int32_t n, std::int32_t k, in gemm_i32_2_0_5_aligned()
21280 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_2_0_5_aligned()
21281 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_2_0_5_aligned()
21282 const std::int32_t row_chunks = m / 3; in gemm_i32_2_0_5_aligned()
21283 const std::int32_t col_chunks = n / 3; in gemm_i32_2_0_5_aligned()
21284 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_0_5_aligned()
21285 const std::int32_t chunk_size = k * 3; in gemm_i32_2_0_5_aligned()
21286 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_0_5_aligned()
21287 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_0_5_aligned()
21291 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_0_5_aligned()
21292 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_0_5_aligned()
21293 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_0_5_aligned()
21294 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_0_5_aligned()
21297 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_0_5_aligned()
21299 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_0_5_aligned()
21300 std::int32_t* result_chunk = result; in gemm_i32_2_0_5_aligned()
21301 std::int32_t* mul_result_chunk = result; in gemm_i32_2_0_5_aligned()
21302 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_0_5_aligned()
21338 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_2_0_6_aligned()
21339 std::int32_t n, std::int32_t k, in gemm_i32_2_0_6_aligned()
21340 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_2_0_6_aligned()
21341 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_2_0_6_aligned()
21342 const std::int32_t row_chunks = m / 3; in gemm_i32_2_0_6_aligned()
21343 const std::int32_t col_chunks = n / 3; in gemm_i32_2_0_6_aligned()
21344 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_0_6_aligned()
21345 const std::int32_t chunk_size = k * 3; in gemm_i32_2_0_6_aligned()
21346 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_0_6_aligned()
21347 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_0_6_aligned()
21351 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_0_6_aligned()
21352 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_0_6_aligned()
21353 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_0_6_aligned()
21354 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_0_6_aligned()
21357 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_0_6_aligned()
21359 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_0_6_aligned()
21360 std::int32_t* result_chunk = result; in gemm_i32_2_0_6_aligned()
21361 std::int32_t* mul_result_chunk = result; in gemm_i32_2_0_6_aligned()
21362 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_0_6_aligned()
21398 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_2_0_7_aligned()
21399 std::int32_t n, std::int32_t k, in gemm_i32_2_0_7_aligned()
21400 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_2_0_7_aligned()
21401 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_2_0_7_aligned()
21402 const std::int32_t row_chunks = m / 3; in gemm_i32_2_0_7_aligned()
21403 const std::int32_t col_chunks = n / 3; in gemm_i32_2_0_7_aligned()
21404 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_0_7_aligned()
21405 const std::int32_t chunk_size = k * 3; in gemm_i32_2_0_7_aligned()
21406 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_0_7_aligned()
21407 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_0_7_aligned()
21411 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_0_7_aligned()
21412 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_0_7_aligned()
21413 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_0_7_aligned()
21414 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_0_7_aligned()
21417 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_0_7_aligned()
21419 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_0_7_aligned()
21420 std::int32_t* result_chunk = result; in gemm_i32_2_0_7_aligned()
21421 std::int32_t* mul_result_chunk = result; in gemm_i32_2_0_7_aligned()
21422 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_0_7_aligned()
21458 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_2_1_0_aligned()
21459 std::int32_t n, std::int32_t k, in gemm_i32_2_1_0_aligned()
21460 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_2_1_0_aligned()
21461 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_2_1_0_aligned()
21462 const std::int32_t row_chunks = m / 3; in gemm_i32_2_1_0_aligned()
21463 const std::int32_t col_chunks = n / 3; in gemm_i32_2_1_0_aligned()
21464 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_1_0_aligned()
21465 const std::int32_t chunk_size = k * 3; in gemm_i32_2_1_0_aligned()
21466 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_1_0_aligned()
21467 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_1_0_aligned()
21471 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_1_0_aligned()
21472 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_1_0_aligned()
21473 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_1_0_aligned()
21474 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_1_0_aligned()
21477 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_1_0_aligned()
21479 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_1_0_aligned()
21480 std::int32_t* result_chunk = result; in gemm_i32_2_1_0_aligned()
21481 std::int32_t* mul_result_chunk = result; in gemm_i32_2_1_0_aligned()
21482 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_1_0_aligned()
21525 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_2_1_1_aligned()
21526 std::int32_t n, std::int32_t k, in gemm_i32_2_1_1_aligned()
21527 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_2_1_1_aligned()
21528 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_2_1_1_aligned()
21529 const std::int32_t row_chunks = m / 3; in gemm_i32_2_1_1_aligned()
21530 const std::int32_t col_chunks = n / 3; in gemm_i32_2_1_1_aligned()
21531 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_1_1_aligned()
21532 const std::int32_t chunk_size = k * 3; in gemm_i32_2_1_1_aligned()
21533 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_1_1_aligned()
21534 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_1_1_aligned()
21538 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_1_1_aligned()
21539 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_1_1_aligned()
21540 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_1_1_aligned()
21541 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_1_1_aligned()
21544 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_1_1_aligned()
21546 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_1_1_aligned()
21547 std::int32_t* result_chunk = result; in gemm_i32_2_1_1_aligned()
21548 std::int32_t* mul_result_chunk = result; in gemm_i32_2_1_1_aligned()
21549 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_1_1_aligned()
21592 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_2_1_2_aligned()
21593 std::int32_t n, std::int32_t k, in gemm_i32_2_1_2_aligned()
21594 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_2_1_2_aligned()
21595 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_2_1_2_aligned()
21596 const std::int32_t row_chunks = m / 3; in gemm_i32_2_1_2_aligned()
21597 const std::int32_t col_chunks = n / 3; in gemm_i32_2_1_2_aligned()
21598 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_1_2_aligned()
21599 const std::int32_t chunk_size = k * 3; in gemm_i32_2_1_2_aligned()
21600 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_1_2_aligned()
21601 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_1_2_aligned()
21605 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_1_2_aligned()
21606 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_1_2_aligned()
21607 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_1_2_aligned()
21608 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_1_2_aligned()
21611 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_1_2_aligned()
21613 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_1_2_aligned()
21614 std::int32_t* result_chunk = result; in gemm_i32_2_1_2_aligned()
21615 std::int32_t* mul_result_chunk = result; in gemm_i32_2_1_2_aligned()
21616 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_1_2_aligned()
21659 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_2_1_3_aligned()
21660 std::int32_t n, std::int32_t k, in gemm_i32_2_1_3_aligned()
21661 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_2_1_3_aligned()
21662 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_2_1_3_aligned()
21663 const std::int32_t row_chunks = m / 3; in gemm_i32_2_1_3_aligned()
21664 const std::int32_t col_chunks = n / 3; in gemm_i32_2_1_3_aligned()
21665 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_1_3_aligned()
21666 const std::int32_t chunk_size = k * 3; in gemm_i32_2_1_3_aligned()
21667 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_1_3_aligned()
21668 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_1_3_aligned()
21672 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_1_3_aligned()
21673 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_1_3_aligned()
21674 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_1_3_aligned()
21675 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_1_3_aligned()
21678 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_1_3_aligned()
21680 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_1_3_aligned()
21681 std::int32_t* result_chunk = result; in gemm_i32_2_1_3_aligned()
21682 std::int32_t* mul_result_chunk = result; in gemm_i32_2_1_3_aligned()
21683 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_1_3_aligned()
21726 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_2_1_4_aligned()
21727 std::int32_t n, std::int32_t k, in gemm_i32_2_1_4_aligned()
21728 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_2_1_4_aligned()
21729 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_2_1_4_aligned()
21730 const std::int32_t row_chunks = m / 3; in gemm_i32_2_1_4_aligned()
21731 const std::int32_t col_chunks = n / 3; in gemm_i32_2_1_4_aligned()
21732 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_1_4_aligned()
21733 const std::int32_t chunk_size = k * 3; in gemm_i32_2_1_4_aligned()
21734 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_1_4_aligned()
21735 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_1_4_aligned()
21739 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_1_4_aligned()
21740 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_1_4_aligned()
21741 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_1_4_aligned()
21742 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_1_4_aligned()
21745 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_1_4_aligned()
21747 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_1_4_aligned()
21748 std::int32_t* result_chunk = result; in gemm_i32_2_1_4_aligned()
21749 std::int32_t* mul_result_chunk = result; in gemm_i32_2_1_4_aligned()
21750 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_1_4_aligned()
21793 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_2_1_5_aligned()
21794 std::int32_t n, std::int32_t k, in gemm_i32_2_1_5_aligned()
21795 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_2_1_5_aligned()
21796 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_2_1_5_aligned()
21797 const std::int32_t row_chunks = m / 3; in gemm_i32_2_1_5_aligned()
21798 const std::int32_t col_chunks = n / 3; in gemm_i32_2_1_5_aligned()
21799 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_1_5_aligned()
21800 const std::int32_t chunk_size = k * 3; in gemm_i32_2_1_5_aligned()
21801 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_1_5_aligned()
21802 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_1_5_aligned()
21806 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_1_5_aligned()
21807 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_1_5_aligned()
21808 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_1_5_aligned()
21809 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_1_5_aligned()
21812 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_1_5_aligned()
21814 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_1_5_aligned()
21815 std::int32_t* result_chunk = result; in gemm_i32_2_1_5_aligned()
21816 std::int32_t* mul_result_chunk = result; in gemm_i32_2_1_5_aligned()
21817 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_1_5_aligned()
21860 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_2_1_6_aligned()
21861 std::int32_t n, std::int32_t k, in gemm_i32_2_1_6_aligned()
21862 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_2_1_6_aligned()
21863 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_2_1_6_aligned()
21864 const std::int32_t row_chunks = m / 3; in gemm_i32_2_1_6_aligned()
21865 const std::int32_t col_chunks = n / 3; in gemm_i32_2_1_6_aligned()
21866 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_1_6_aligned()
21867 const std::int32_t chunk_size = k * 3; in gemm_i32_2_1_6_aligned()
21868 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_1_6_aligned()
21869 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_1_6_aligned()
21873 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_1_6_aligned()
21874 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_1_6_aligned()
21875 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_1_6_aligned()
21876 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_1_6_aligned()
21879 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_1_6_aligned()
21881 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_1_6_aligned()
21882 std::int32_t* result_chunk = result; in gemm_i32_2_1_6_aligned()
21883 std::int32_t* mul_result_chunk = result; in gemm_i32_2_1_6_aligned()
21884 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_1_6_aligned()
21927 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_2_1_7_aligned()
21928 std::int32_t n, std::int32_t k, in gemm_i32_2_1_7_aligned()
21929 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_2_1_7_aligned()
21930 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_2_1_7_aligned()
21931 const std::int32_t row_chunks = m / 3; in gemm_i32_2_1_7_aligned()
21932 const std::int32_t col_chunks = n / 3; in gemm_i32_2_1_7_aligned()
21933 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_1_7_aligned()
21934 const std::int32_t chunk_size = k * 3; in gemm_i32_2_1_7_aligned()
21935 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_1_7_aligned()
21936 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_1_7_aligned()
21940 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_1_7_aligned()
21941 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_1_7_aligned()
21942 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_1_7_aligned()
21943 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_1_7_aligned()
21946 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_1_7_aligned()
21948 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_1_7_aligned()
21949 std::int32_t* result_chunk = result; in gemm_i32_2_1_7_aligned()
21950 std::int32_t* mul_result_chunk = result; in gemm_i32_2_1_7_aligned()
21951 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_1_7_aligned()
21994 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_2_2_0_aligned()
21995 std::int32_t n, std::int32_t k, in gemm_i32_2_2_0_aligned()
21996 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_2_2_0_aligned()
21997 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_2_2_0_aligned()
21998 const std::int32_t row_chunks = m / 3; in gemm_i32_2_2_0_aligned()
21999 const std::int32_t col_chunks = n / 3; in gemm_i32_2_2_0_aligned()
22000 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_2_0_aligned()
22001 const std::int32_t chunk_size = k * 3; in gemm_i32_2_2_0_aligned()
22002 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_2_0_aligned()
22003 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_2_0_aligned()
22007 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_2_0_aligned()
22008 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_2_0_aligned()
22009 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_2_0_aligned()
22010 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_2_0_aligned()
22013 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_2_0_aligned()
22015 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_2_0_aligned()
22016 std::int32_t* result_chunk = result; in gemm_i32_2_2_0_aligned()
22017 std::int32_t* mul_result_chunk = result; in gemm_i32_2_2_0_aligned()
22018 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_2_0_aligned()
22061 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_2_2_1_aligned()
22062 std::int32_t n, std::int32_t k, in gemm_i32_2_2_1_aligned()
22063 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_2_2_1_aligned()
22064 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_2_2_1_aligned()
22065 const std::int32_t row_chunks = m / 3; in gemm_i32_2_2_1_aligned()
22066 const std::int32_t col_chunks = n / 3; in gemm_i32_2_2_1_aligned()
22067 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_2_1_aligned()
22068 const std::int32_t chunk_size = k * 3; in gemm_i32_2_2_1_aligned()
22069 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_2_1_aligned()
22070 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_2_1_aligned()
22074 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_2_1_aligned()
22075 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_2_1_aligned()
22076 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_2_1_aligned()
22077 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_2_1_aligned()
22080 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_2_1_aligned()
22082 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_2_1_aligned()
22083 std::int32_t* result_chunk = result; in gemm_i32_2_2_1_aligned()
22084 std::int32_t* mul_result_chunk = result; in gemm_i32_2_2_1_aligned()
22085 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_2_1_aligned()
22128 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_2_2_2_aligned()
22129 std::int32_t n, std::int32_t k, in gemm_i32_2_2_2_aligned()
22130 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_2_2_2_aligned()
22131 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_2_2_2_aligned()
22132 const std::int32_t row_chunks = m / 3; in gemm_i32_2_2_2_aligned()
22133 const std::int32_t col_chunks = n / 3; in gemm_i32_2_2_2_aligned()
22134 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_2_2_aligned()
22135 const std::int32_t chunk_size = k * 3; in gemm_i32_2_2_2_aligned()
22136 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_2_2_aligned()
22137 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_2_2_aligned()
22141 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_2_2_aligned()
22142 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_2_2_aligned()
22143 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_2_2_aligned()
22144 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_2_2_aligned()
22147 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_2_2_aligned()
22149 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_2_2_aligned()
22150 std::int32_t* result_chunk = result; in gemm_i32_2_2_2_aligned()
22151 std::int32_t* mul_result_chunk = result; in gemm_i32_2_2_2_aligned()
22152 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_2_2_aligned()
22195 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_2_2_3_aligned()
22196 std::int32_t n, std::int32_t k, in gemm_i32_2_2_3_aligned()
22197 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_2_2_3_aligned()
22198 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_2_2_3_aligned()
22199 const std::int32_t row_chunks = m / 3; in gemm_i32_2_2_3_aligned()
22200 const std::int32_t col_chunks = n / 3; in gemm_i32_2_2_3_aligned()
22201 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_2_3_aligned()
22202 const std::int32_t chunk_size = k * 3; in gemm_i32_2_2_3_aligned()
22203 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_2_3_aligned()
22204 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_2_3_aligned()
22208 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_2_3_aligned()
22209 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_2_3_aligned()
22210 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_2_3_aligned()
22211 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_2_3_aligned()
22214 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_2_3_aligned()
22216 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_2_3_aligned()
22217 std::int32_t* result_chunk = result; in gemm_i32_2_2_3_aligned()
22218 std::int32_t* mul_result_chunk = result; in gemm_i32_2_2_3_aligned()
22219 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_2_3_aligned()
22262 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_2_2_4_aligned()
22263 std::int32_t n, std::int32_t k, in gemm_i32_2_2_4_aligned()
22264 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_2_2_4_aligned()
22265 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_2_2_4_aligned()
22266 const std::int32_t row_chunks = m / 3; in gemm_i32_2_2_4_aligned()
22267 const std::int32_t col_chunks = n / 3; in gemm_i32_2_2_4_aligned()
22268 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_2_4_aligned()
22269 const std::int32_t chunk_size = k * 3; in gemm_i32_2_2_4_aligned()
22270 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_2_4_aligned()
22271 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_2_4_aligned()
22275 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_2_4_aligned()
22276 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_2_4_aligned()
22277 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_2_4_aligned()
22278 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_2_4_aligned()
22281 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_2_4_aligned()
22283 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_2_4_aligned()
22284 std::int32_t* result_chunk = result; in gemm_i32_2_2_4_aligned()
22285 std::int32_t* mul_result_chunk = result; in gemm_i32_2_2_4_aligned()
22286 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_2_4_aligned()
22329 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_2_2_5_aligned()
22330 std::int32_t n, std::int32_t k, in gemm_i32_2_2_5_aligned()
22331 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_2_2_5_aligned()
22332 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_2_2_5_aligned()
22333 const std::int32_t row_chunks = m / 3; in gemm_i32_2_2_5_aligned()
22334 const std::int32_t col_chunks = n / 3; in gemm_i32_2_2_5_aligned()
22335 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_2_5_aligned()
22336 const std::int32_t chunk_size = k * 3; in gemm_i32_2_2_5_aligned()
22337 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_2_5_aligned()
22338 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_2_5_aligned()
22342 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_2_5_aligned()
22343 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_2_5_aligned()
22344 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_2_5_aligned()
22345 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_2_5_aligned()
22348 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_2_5_aligned()
22350 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_2_5_aligned()
22351 std::int32_t* result_chunk = result; in gemm_i32_2_2_5_aligned()
22352 std::int32_t* mul_result_chunk = result; in gemm_i32_2_2_5_aligned()
22353 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_2_5_aligned()
22396 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_2_2_6_aligned()
22397 std::int32_t n, std::int32_t k, in gemm_i32_2_2_6_aligned()
22398 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_2_2_6_aligned()
22399 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_2_2_6_aligned()
22400 const std::int32_t row_chunks = m / 3; in gemm_i32_2_2_6_aligned()
22401 const std::int32_t col_chunks = n / 3; in gemm_i32_2_2_6_aligned()
22402 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_2_6_aligned()
22403 const std::int32_t chunk_size = k * 3; in gemm_i32_2_2_6_aligned()
22404 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_2_6_aligned()
22405 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_2_6_aligned()
22409 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_2_6_aligned()
22410 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_2_6_aligned()
22411 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_2_6_aligned()
22412 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_2_6_aligned()
22415 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_2_6_aligned()
22417 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_2_6_aligned()
22418 std::int32_t* result_chunk = result; in gemm_i32_2_2_6_aligned()
22419 std::int32_t* mul_result_chunk = result; in gemm_i32_2_2_6_aligned()
22420 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_2_6_aligned()
22463 const std::uint8_t* rhs, std::int32_t m, in gemm_i32_2_2_7_aligned()
22464 std::int32_t n, std::int32_t k, in gemm_i32_2_2_7_aligned()
22465 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32_2_2_7_aligned()
22466 std::int32_t* result, std::int32_t result_stride) { in gemm_i32_2_2_7_aligned()
22467 const std::int32_t row_chunks = m / 3; in gemm_i32_2_2_7_aligned()
22468 const std::int32_t col_chunks = n / 3; in gemm_i32_2_2_7_aligned()
22469 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_2_7_aligned()
22470 const std::int32_t chunk_size = k * 3; in gemm_i32_2_2_7_aligned()
22471 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_2_7_aligned()
22472 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_2_7_aligned()
22476 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_2_7_aligned()
22477 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_2_7_aligned()
22478 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_2_7_aligned()
22479 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_2_7_aligned()
22482 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_2_7_aligned()
22484 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_2_7_aligned()
22485 std::int32_t* result_chunk = result; in gemm_i32_2_2_7_aligned()
22486 std::int32_t* mul_result_chunk = result; in gemm_i32_2_2_7_aligned()
22487 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_2_7_aligned()
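
Every variant also precomputes const_offset = lhs_offset * rhs_offset * k. The listing does not show how the kernels consume it, but this is exactly the constant term that falls out when a dot product over k zero-point-adjusted uint8 values is expanded, so it can be added once per output rather than recomputed in the inner loop; that interpretation is an inference. The small self-contained check below only verifies the algebraic identity on made-up numbers; nothing in it beyond the const_offset expression comes from the original source.

    #include <cassert>
    #include <cstdint>

    int main() {
      // Expanding sum_i (a_i + lhs_offset) * (b_i + rhs_offset) gives
      //   sum_i a_i*b_i + lhs_offset*sum_i b_i + rhs_offset*sum_i a_i
      //   + lhs_offset*rhs_offset*k,
      // and the last term is the const_offset computed in every match above.
      const std::int32_t k = 8;
      const std::int32_t lhs_offset = 3;
      const std::int32_t rhs_offset = -5;
      const std::uint8_t a[k] = {1, 2, 3, 4, 5, 6, 7, 8};
      const std::uint8_t b[k] = {8, 7, 6, 5, 4, 3, 2, 1};
      std::int32_t full = 0, raw = 0, sum_a = 0, sum_b = 0;
      for (std::int32_t i = 0; i < k; ++i) {
        full += (a[i] + lhs_offset) * (b[i] + rhs_offset);
        raw += a[i] * b[i];
        sum_a += a[i];
        sum_b += b[i];
      }
      const std::int32_t const_offset = lhs_offset * rhs_offset * k;
      assert(full ==
             raw + lhs_offset * sum_b + rhs_offset * sum_a + const_offset);
      return 0;
    }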
22530 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_0_0_0()
22531 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_0_0_0()
22532 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_0_0_0()
22533 std::int32_t result_stride) { in gemm_i32_0_0_0()
22534 const std::int32_t row_chunks = m / 3; in gemm_i32_0_0_0()
22535 const std::int32_t col_chunks = n / 3; in gemm_i32_0_0_0()
22536 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_0_0()
22537 const std::int32_t chunk_size = k * 3; in gemm_i32_0_0_0()
22538 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_0_0()
22539 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_0_0()
22543 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_0_0()
22544 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_0_0()
22547 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_0_0()
22549 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_0_0()
22550 std::int32_t* result_chunk = result; in gemm_i32_0_0_0()
22551 std::int32_t* mul_result_chunk = result; in gemm_i32_0_0_0()
22552 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_0_0()
22577 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_0_0_1()
22578 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_0_0_1()
22579 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_0_0_1()
22580 std::int32_t result_stride) { in gemm_i32_0_0_1()
22581 const std::int32_t row_chunks = m / 3; in gemm_i32_0_0_1()
22582 const std::int32_t col_chunks = n / 3; in gemm_i32_0_0_1()
22583 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_0_1()
22584 const std::int32_t chunk_size = k * 3; in gemm_i32_0_0_1()
22585 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_0_1()
22586 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_0_1()
22590 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_0_1()
22591 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_0_1()
22594 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_0_1()
22596 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_0_1()
22597 std::int32_t* result_chunk = result; in gemm_i32_0_0_1()
22598 std::int32_t* mul_result_chunk = result; in gemm_i32_0_0_1()
22599 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_0_1()
22624 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_0_0_2()
22625 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_0_0_2()
22626 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_0_0_2()
22627 std::int32_t result_stride) { in gemm_i32_0_0_2()
22628 const std::int32_t row_chunks = m / 3; in gemm_i32_0_0_2()
22629 const std::int32_t col_chunks = n / 3; in gemm_i32_0_0_2()
22630 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_0_2()
22631 const std::int32_t chunk_size = k * 3; in gemm_i32_0_0_2()
22632 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_0_2()
22633 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_0_2()
22637 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_0_2()
22638 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_0_2()
22641 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_0_2()
22643 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_0_2()
22644 std::int32_t* result_chunk = result; in gemm_i32_0_0_2()
22645 std::int32_t* mul_result_chunk = result; in gemm_i32_0_0_2()
22646 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_0_2()
22671 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_0_0_3()
22672 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_0_0_3()
22673 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_0_0_3()
22674 std::int32_t result_stride) { in gemm_i32_0_0_3()
22675 const std::int32_t row_chunks = m / 3; in gemm_i32_0_0_3()
22676 const std::int32_t col_chunks = n / 3; in gemm_i32_0_0_3()
22677 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_0_3()
22678 const std::int32_t chunk_size = k * 3; in gemm_i32_0_0_3()
22679 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_0_3()
22680 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_0_3()
22684 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_0_3()
22685 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_0_3()
22688 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_0_3()
22690 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_0_3()
22691 std::int32_t* result_chunk = result; in gemm_i32_0_0_3()
22692 std::int32_t* mul_result_chunk = result; in gemm_i32_0_0_3()
22693 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_0_3()
22718 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_0_0_4()
22719 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_0_0_4()
22720 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_0_0_4()
22721 std::int32_t result_stride) { in gemm_i32_0_0_4()
22722 const std::int32_t row_chunks = m / 3; in gemm_i32_0_0_4()
22723 const std::int32_t col_chunks = n / 3; in gemm_i32_0_0_4()
22724 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_0_4()
22725 const std::int32_t chunk_size = k * 3; in gemm_i32_0_0_4()
22726 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_0_4()
22727 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_0_4()
22731 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_0_4()
22732 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_0_4()
22735 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_0_4()
22737 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_0_4()
22738 std::int32_t* result_chunk = result; in gemm_i32_0_0_4()
22739 std::int32_t* mul_result_chunk = result; in gemm_i32_0_0_4()
22740 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_0_4()
22765 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_0_0_5()
22766 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_0_0_5()
22767 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_0_0_5()
22768 std::int32_t result_stride) { in gemm_i32_0_0_5()
22769 const std::int32_t row_chunks = m / 3; in gemm_i32_0_0_5()
22770 const std::int32_t col_chunks = n / 3; in gemm_i32_0_0_5()
22771 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_0_5()
22772 const std::int32_t chunk_size = k * 3; in gemm_i32_0_0_5()
22773 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_0_5()
22774 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_0_5()
22778 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_0_5()
22779 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_0_5()
22782 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_0_5()
22784 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_0_5()
22785 std::int32_t* result_chunk = result; in gemm_i32_0_0_5()
22786 std::int32_t* mul_result_chunk = result; in gemm_i32_0_0_5()
22787 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_0_5()
22812 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_0_0_6()
22813 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_0_0_6()
22814 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_0_0_6()
22815 std::int32_t result_stride) { in gemm_i32_0_0_6()
22816 const std::int32_t row_chunks = m / 3; in gemm_i32_0_0_6()
22817 const std::int32_t col_chunks = n / 3; in gemm_i32_0_0_6()
22818 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_0_6()
22819 const std::int32_t chunk_size = k * 3; in gemm_i32_0_0_6()
22820 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_0_6()
22821 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_0_6()
22825 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_0_6()
22826 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_0_6()
22829 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_0_6()
22831 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_0_6()
22832 std::int32_t* result_chunk = result; in gemm_i32_0_0_6()
22833 std::int32_t* mul_result_chunk = result; in gemm_i32_0_0_6()
22834 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_0_6()
22859 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_0_0_7()
22860 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_0_0_7()
22861 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_0_0_7()
22862 std::int32_t result_stride) { in gemm_i32_0_0_7()
22863 const std::int32_t row_chunks = m / 3; in gemm_i32_0_0_7()
22864 const std::int32_t col_chunks = n / 3; in gemm_i32_0_0_7()
22865 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_0_7()
22866 const std::int32_t chunk_size = k * 3; in gemm_i32_0_0_7()
22867 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_0_7()
22868 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_0_7()
22872 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_0_7()
22873 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_0_7()
22876 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_0_7()
22878 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_0_7()
22879 std::int32_t* result_chunk = result; in gemm_i32_0_0_7()
22880 std::int32_t* mul_result_chunk = result; in gemm_i32_0_0_7()
22881 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_0_7()
22906 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_0_1_0()
22907 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_0_1_0()
22908 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_0_1_0()
22909 std::int32_t result_stride) { in gemm_i32_0_1_0()
22910 const std::int32_t row_chunks = m / 3; in gemm_i32_0_1_0()
22911 const std::int32_t col_chunks = n / 3; in gemm_i32_0_1_0()
22912 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_1_0()
22913 const std::int32_t chunk_size = k * 3; in gemm_i32_0_1_0()
22914 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_1_0()
22915 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_1_0()
22919 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_1_0()
22920 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_1_0()
22923 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_1_0()
22925 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_1_0()
22926 std::int32_t* result_chunk = result; in gemm_i32_0_1_0()
22927 std::int32_t* mul_result_chunk = result; in gemm_i32_0_1_0()
22928 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_1_0()
22957 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_0_1_1()
22958 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_0_1_1()
22959 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_0_1_1()
22960 std::int32_t result_stride) { in gemm_i32_0_1_1()
22961 const std::int32_t row_chunks = m / 3; in gemm_i32_0_1_1()
22962 const std::int32_t col_chunks = n / 3; in gemm_i32_0_1_1()
22963 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_1_1()
22964 const std::int32_t chunk_size = k * 3; in gemm_i32_0_1_1()
22965 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_1_1()
22966 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_1_1()
22970 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_1_1()
22971 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_1_1()
22974 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_1_1()
22976 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_1_1()
22977 std::int32_t* result_chunk = result; in gemm_i32_0_1_1()
22978 std::int32_t* mul_result_chunk = result; in gemm_i32_0_1_1()
22979 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_1_1()
23008 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_0_1_2()
23009 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_0_1_2()
23010 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_0_1_2()
23011 std::int32_t result_stride) { in gemm_i32_0_1_2()
23012 const std::int32_t row_chunks = m / 3; in gemm_i32_0_1_2()
23013 const std::int32_t col_chunks = n / 3; in gemm_i32_0_1_2()
23014 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_1_2()
23015 const std::int32_t chunk_size = k * 3; in gemm_i32_0_1_2()
23016 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_1_2()
23017 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_1_2()
23021 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_1_2()
23022 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_1_2()
23025 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_1_2()
23027 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_1_2()
23028 std::int32_t* result_chunk = result; in gemm_i32_0_1_2()
23029 std::int32_t* mul_result_chunk = result; in gemm_i32_0_1_2()
23030 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_1_2()
23059 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_0_1_3()
23060 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_0_1_3()
23061 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_0_1_3()
23062 std::int32_t result_stride) { in gemm_i32_0_1_3()
23063 const std::int32_t row_chunks = m / 3; in gemm_i32_0_1_3()
23064 const std::int32_t col_chunks = n / 3; in gemm_i32_0_1_3()
23065 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_1_3()
23066 const std::int32_t chunk_size = k * 3; in gemm_i32_0_1_3()
23067 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_1_3()
23068 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_1_3()
23072 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_1_3()
23073 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_1_3()
23076 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_1_3()
23078 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_1_3()
23079 std::int32_t* result_chunk = result; in gemm_i32_0_1_3()
23080 std::int32_t* mul_result_chunk = result; in gemm_i32_0_1_3()
23081 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_1_3()
23110 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_0_1_4()
23111 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_0_1_4()
23112 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_0_1_4()
23113 std::int32_t result_stride) { in gemm_i32_0_1_4()
23114 const std::int32_t row_chunks = m / 3; in gemm_i32_0_1_4()
23115 const std::int32_t col_chunks = n / 3; in gemm_i32_0_1_4()
23116 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_1_4()
23117 const std::int32_t chunk_size = k * 3; in gemm_i32_0_1_4()
23118 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_1_4()
23119 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_1_4()
23123 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_1_4()
23124 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_1_4()
23127 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_1_4()
23129 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_1_4()
23130 std::int32_t* result_chunk = result; in gemm_i32_0_1_4()
23131 std::int32_t* mul_result_chunk = result; in gemm_i32_0_1_4()
23132 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_1_4()
23161 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_0_1_5()
23162 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_0_1_5()
23163 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_0_1_5()
23164 std::int32_t result_stride) { in gemm_i32_0_1_5()
23165 const std::int32_t row_chunks = m / 3; in gemm_i32_0_1_5()
23166 const std::int32_t col_chunks = n / 3; in gemm_i32_0_1_5()
23167 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_1_5()
23168 const std::int32_t chunk_size = k * 3; in gemm_i32_0_1_5()
23169 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_1_5()
23170 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_1_5()
23174 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_1_5()
23175 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_1_5()
23178 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_1_5()
23180 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_1_5()
23181 std::int32_t* result_chunk = result; in gemm_i32_0_1_5()
23182 std::int32_t* mul_result_chunk = result; in gemm_i32_0_1_5()
23183 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_1_5()
23212 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_0_1_6()
23213 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_0_1_6()
23214 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_0_1_6()
23215 std::int32_t result_stride) { in gemm_i32_0_1_6()
23216 const std::int32_t row_chunks = m / 3; in gemm_i32_0_1_6()
23217 const std::int32_t col_chunks = n / 3; in gemm_i32_0_1_6()
23218 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_1_6()
23219 const std::int32_t chunk_size = k * 3; in gemm_i32_0_1_6()
23220 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_1_6()
23221 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_1_6()
23225 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_1_6()
23226 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_1_6()
23229 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_1_6()
23231 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_1_6()
23232 std::int32_t* result_chunk = result; in gemm_i32_0_1_6()
23233 std::int32_t* mul_result_chunk = result; in gemm_i32_0_1_6()
23234 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_1_6()
23263 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_0_1_7()
23264 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_0_1_7()
23265 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_0_1_7()
23266 std::int32_t result_stride) { in gemm_i32_0_1_7()
23267 const std::int32_t row_chunks = m / 3; in gemm_i32_0_1_7()
23268 const std::int32_t col_chunks = n / 3; in gemm_i32_0_1_7()
23269 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_1_7()
23270 const std::int32_t chunk_size = k * 3; in gemm_i32_0_1_7()
23271 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_1_7()
23272 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_1_7()
23276 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_1_7()
23277 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_1_7()
23280 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_1_7()
23282 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_1_7()
23283 std::int32_t* result_chunk = result; in gemm_i32_0_1_7()
23284 std::int32_t* mul_result_chunk = result; in gemm_i32_0_1_7()
23285 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_1_7()
23314 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_0_2_0()
23315 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_0_2_0()
23316 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_0_2_0()
23317 std::int32_t result_stride) { in gemm_i32_0_2_0()
23318 const std::int32_t row_chunks = m / 3; in gemm_i32_0_2_0()
23319 const std::int32_t col_chunks = n / 3; in gemm_i32_0_2_0()
23320 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_2_0()
23321 const std::int32_t chunk_size = k * 3; in gemm_i32_0_2_0()
23322 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_2_0()
23323 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_2_0()
23327 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_2_0()
23328 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_2_0()
23331 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_2_0()
23333 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_2_0()
23334 std::int32_t* result_chunk = result; in gemm_i32_0_2_0()
23335 std::int32_t* mul_result_chunk = result; in gemm_i32_0_2_0()
23336 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_2_0()
23365 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_0_2_1()
23366 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_0_2_1()
23367 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_0_2_1()
23368 std::int32_t result_stride) { in gemm_i32_0_2_1()
23369 const std::int32_t row_chunks = m / 3; in gemm_i32_0_2_1()
23370 const std::int32_t col_chunks = n / 3; in gemm_i32_0_2_1()
23371 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_2_1()
23372 const std::int32_t chunk_size = k * 3; in gemm_i32_0_2_1()
23373 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_2_1()
23374 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_2_1()
23378 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_2_1()
23379 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_2_1()
23382 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_2_1()
23384 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_2_1()
23385 std::int32_t* result_chunk = result; in gemm_i32_0_2_1()
23386 std::int32_t* mul_result_chunk = result; in gemm_i32_0_2_1()
23387 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_2_1()
23416 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_0_2_2()
23417 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_0_2_2()
23418 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_0_2_2()
23419 std::int32_t result_stride) { in gemm_i32_0_2_2()
23420 const std::int32_t row_chunks = m / 3; in gemm_i32_0_2_2()
23421 const std::int32_t col_chunks = n / 3; in gemm_i32_0_2_2()
23422 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_2_2()
23423 const std::int32_t chunk_size = k * 3; in gemm_i32_0_2_2()
23424 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_2_2()
23425 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_2_2()
23429 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_2_2()
23430 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_2_2()
23433 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_2_2()
23435 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_2_2()
23436 std::int32_t* result_chunk = result; in gemm_i32_0_2_2()
23437 std::int32_t* mul_result_chunk = result; in gemm_i32_0_2_2()
23438 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_2_2()
23467 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_0_2_3()
23468 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_0_2_3()
23469 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_0_2_3()
23470 std::int32_t result_stride) { in gemm_i32_0_2_3()
23471 const std::int32_t row_chunks = m / 3; in gemm_i32_0_2_3()
23472 const std::int32_t col_chunks = n / 3; in gemm_i32_0_2_3()
23473 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_2_3()
23474 const std::int32_t chunk_size = k * 3; in gemm_i32_0_2_3()
23475 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_2_3()
23476 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_2_3()
23480 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_2_3()
23481 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_2_3()
23484 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_2_3()
23486 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_2_3()
23487 std::int32_t* result_chunk = result; in gemm_i32_0_2_3()
23488 std::int32_t* mul_result_chunk = result; in gemm_i32_0_2_3()
23489 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_2_3()
23518 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_0_2_4()
23519 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_0_2_4()
23520 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_0_2_4()
23521 std::int32_t result_stride) { in gemm_i32_0_2_4()
23522 const std::int32_t row_chunks = m / 3; in gemm_i32_0_2_4()
23523 const std::int32_t col_chunks = n / 3; in gemm_i32_0_2_4()
23524 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_2_4()
23525 const std::int32_t chunk_size = k * 3; in gemm_i32_0_2_4()
23526 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_2_4()
23527 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_2_4()
23531 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_2_4()
23532 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_2_4()
23535 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_2_4()
23537 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_2_4()
23538 std::int32_t* result_chunk = result; in gemm_i32_0_2_4()
23539 std::int32_t* mul_result_chunk = result; in gemm_i32_0_2_4()
23540 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_2_4()
23569 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_0_2_5()
23570 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_0_2_5()
23571 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_0_2_5()
23572 std::int32_t result_stride) { in gemm_i32_0_2_5()
23573 const std::int32_t row_chunks = m / 3; in gemm_i32_0_2_5()
23574 const std::int32_t col_chunks = n / 3; in gemm_i32_0_2_5()
23575 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_2_5()
23576 const std::int32_t chunk_size = k * 3; in gemm_i32_0_2_5()
23577 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_2_5()
23578 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_2_5()
23582 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_2_5()
23583 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_2_5()
23586 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_2_5()
23588 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_2_5()
23589 std::int32_t* result_chunk = result; in gemm_i32_0_2_5()
23590 std::int32_t* mul_result_chunk = result; in gemm_i32_0_2_5()
23591 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_2_5()
23620 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_0_2_6()
23621 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_0_2_6()
23622 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_0_2_6()
23623 std::int32_t result_stride) { in gemm_i32_0_2_6()
23624 const std::int32_t row_chunks = m / 3; in gemm_i32_0_2_6()
23625 const std::int32_t col_chunks = n / 3; in gemm_i32_0_2_6()
23626 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_2_6()
23627 const std::int32_t chunk_size = k * 3; in gemm_i32_0_2_6()
23628 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_2_6()
23629 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_2_6()
23633 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_2_6()
23634 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_2_6()
23637 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_2_6()
23639 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_2_6()
23640 std::int32_t* result_chunk = result; in gemm_i32_0_2_6()
23641 std::int32_t* mul_result_chunk = result; in gemm_i32_0_2_6()
23642 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_2_6()
23671 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_0_2_7()
23672 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_0_2_7()
23673 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_0_2_7()
23674 std::int32_t result_stride) { in gemm_i32_0_2_7()
23675 const std::int32_t row_chunks = m / 3; in gemm_i32_0_2_7()
23676 const std::int32_t col_chunks = n / 3; in gemm_i32_0_2_7()
23677 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_0_2_7()
23678 const std::int32_t chunk_size = k * 3; in gemm_i32_0_2_7()
23679 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_0_2_7()
23680 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_0_2_7()
23684 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_0_2_7()
23685 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_0_2_7()
23688 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_0_2_7()
23690 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_0_2_7()
23691 std::int32_t* result_chunk = result; in gemm_i32_0_2_7()
23692 std::int32_t* mul_result_chunk = result; in gemm_i32_0_2_7()
23693 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_0_2_7()
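The gemm_i32_1_* kernels that follow add a second pointer, zipped_lhs_1_offsets, at zipped_lhs + padded_k * 1, and the later gemm_i32_2_* kernels add zipped_lhs_2_offsets at padded_k * 2. A plausible reading, offered only as an assumption drawn from the naming pattern, is that a zipped chunk stores its per-row offset slots immediately after its packed rows, so a trailing chunk with 1 or 2 leftover rows places them after 1 or 2 rows of padded_k bytes instead of 3. The helper below is hypothetical and simply restates that address computation.

#include <cstdint>

// Hypothetical helper (not in the indexed source): the assumed location of
// the per-row offset slots inside a zipped LHS chunk that packs `rows` rows
// of `padded_k` bytes each, matching the reinterpret_cast lines above.
inline std::int32_t* ZippedLhsOffsets(std::uint8_t* zipped_lhs,
                                      std::int32_t padded_k,
                                      std::int32_t rows) {
  return reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * rows);
}

// Usage mirroring the matched lines:
//   zipped_lhs_3_offsets = ZippedLhsOffsets(zipped_lhs, padded_k, 3);
//   zipped_lhs_1_offsets = ZippedLhsOffsets(zipped_lhs, padded_k, 1);  // gemm_i32_1_*
//   zipped_lhs_2_offsets = ZippedLhsOffsets(zipped_lhs, padded_k, 2);  // gemm_i32_2_*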
23722 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_1_0_0()
23723 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_1_0_0()
23724 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_1_0_0()
23725 std::int32_t result_stride) { in gemm_i32_1_0_0()
23726 const std::int32_t row_chunks = m / 3; in gemm_i32_1_0_0()
23727 const std::int32_t col_chunks = n / 3; in gemm_i32_1_0_0()
23728 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_0_0()
23729 const std::int32_t chunk_size = k * 3; in gemm_i32_1_0_0()
23730 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_0_0()
23731 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_0_0()
23735 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_0_0()
23736 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_0_0()
23737 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_0_0()
23738 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_0_0()
23741 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_0_0()
23743 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_0_0()
23744 std::int32_t* result_chunk = result; in gemm_i32_1_0_0()
23745 std::int32_t* mul_result_chunk = result; in gemm_i32_1_0_0()
23746 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_0_0()
23782 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_1_0_1()
23783 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_1_0_1()
23784 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_1_0_1()
23785 std::int32_t result_stride) { in gemm_i32_1_0_1()
23786 const std::int32_t row_chunks = m / 3; in gemm_i32_1_0_1()
23787 const std::int32_t col_chunks = n / 3; in gemm_i32_1_0_1()
23788 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_0_1()
23789 const std::int32_t chunk_size = k * 3; in gemm_i32_1_0_1()
23790 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_0_1()
23791 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_0_1()
23795 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_0_1()
23796 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_0_1()
23797 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_0_1()
23798 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_0_1()
23801 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_0_1()
23803 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_0_1()
23804 std::int32_t* result_chunk = result; in gemm_i32_1_0_1()
23805 std::int32_t* mul_result_chunk = result; in gemm_i32_1_0_1()
23806 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_0_1()
23842 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_1_0_2()
23843 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_1_0_2()
23844 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_1_0_2()
23845 std::int32_t result_stride) { in gemm_i32_1_0_2()
23846 const std::int32_t row_chunks = m / 3; in gemm_i32_1_0_2()
23847 const std::int32_t col_chunks = n / 3; in gemm_i32_1_0_2()
23848 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_0_2()
23849 const std::int32_t chunk_size = k * 3; in gemm_i32_1_0_2()
23850 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_0_2()
23851 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_0_2()
23855 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_0_2()
23856 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_0_2()
23857 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_0_2()
23858 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_0_2()
23861 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_0_2()
23863 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_0_2()
23864 std::int32_t* result_chunk = result; in gemm_i32_1_0_2()
23865 std::int32_t* mul_result_chunk = result; in gemm_i32_1_0_2()
23866 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_0_2()
23902 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_1_0_3()
23903 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_1_0_3()
23904 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_1_0_3()
23905 std::int32_t result_stride) { in gemm_i32_1_0_3()
23906 const std::int32_t row_chunks = m / 3; in gemm_i32_1_0_3()
23907 const std::int32_t col_chunks = n / 3; in gemm_i32_1_0_3()
23908 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_0_3()
23909 const std::int32_t chunk_size = k * 3; in gemm_i32_1_0_3()
23910 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_0_3()
23911 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_0_3()
23915 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_0_3()
23916 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_0_3()
23917 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_0_3()
23918 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_0_3()
23921 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_0_3()
23923 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_0_3()
23924 std::int32_t* result_chunk = result; in gemm_i32_1_0_3()
23925 std::int32_t* mul_result_chunk = result; in gemm_i32_1_0_3()
23926 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_0_3()
23962 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_1_0_4()
23963 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_1_0_4()
23964 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_1_0_4()
23965 std::int32_t result_stride) { in gemm_i32_1_0_4()
23966 const std::int32_t row_chunks = m / 3; in gemm_i32_1_0_4()
23967 const std::int32_t col_chunks = n / 3; in gemm_i32_1_0_4()
23968 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_0_4()
23969 const std::int32_t chunk_size = k * 3; in gemm_i32_1_0_4()
23970 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_0_4()
23971 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_0_4()
23975 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_0_4()
23976 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_0_4()
23977 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_0_4()
23978 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_0_4()
23981 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_0_4()
23983 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_0_4()
23984 std::int32_t* result_chunk = result; in gemm_i32_1_0_4()
23985 std::int32_t* mul_result_chunk = result; in gemm_i32_1_0_4()
23986 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_0_4()
24022 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_1_0_5()
24023 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_1_0_5()
24024 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_1_0_5()
24025 std::int32_t result_stride) { in gemm_i32_1_0_5()
24026 const std::int32_t row_chunks = m / 3; in gemm_i32_1_0_5()
24027 const std::int32_t col_chunks = n / 3; in gemm_i32_1_0_5()
24028 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_0_5()
24029 const std::int32_t chunk_size = k * 3; in gemm_i32_1_0_5()
24030 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_0_5()
24031 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_0_5()
24035 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_0_5()
24036 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_0_5()
24037 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_0_5()
24038 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_0_5()
24041 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_0_5()
24043 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_0_5()
24044 std::int32_t* result_chunk = result; in gemm_i32_1_0_5()
24045 std::int32_t* mul_result_chunk = result; in gemm_i32_1_0_5()
24046 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_0_5()
24082 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_1_0_6()
24083 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_1_0_6()
24084 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_1_0_6()
24085 std::int32_t result_stride) { in gemm_i32_1_0_6()
24086 const std::int32_t row_chunks = m / 3; in gemm_i32_1_0_6()
24087 const std::int32_t col_chunks = n / 3; in gemm_i32_1_0_6()
24088 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_0_6()
24089 const std::int32_t chunk_size = k * 3; in gemm_i32_1_0_6()
24090 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_0_6()
24091 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_0_6()
24095 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_0_6()
24096 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_0_6()
24097 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_0_6()
24098 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_0_6()
24101 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_0_6()
24103 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_0_6()
24104 std::int32_t* result_chunk = result; in gemm_i32_1_0_6()
24105 std::int32_t* mul_result_chunk = result; in gemm_i32_1_0_6()
24106 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_0_6()
24142 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_1_0_7()
24143 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_1_0_7()
24144 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_1_0_7()
24145 std::int32_t result_stride) { in gemm_i32_1_0_7()
24146 const std::int32_t row_chunks = m / 3; in gemm_i32_1_0_7()
24147 const std::int32_t col_chunks = n / 3; in gemm_i32_1_0_7()
24148 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_0_7()
24149 const std::int32_t chunk_size = k * 3; in gemm_i32_1_0_7()
24150 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_0_7()
24151 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_0_7()
24155 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_0_7()
24156 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_0_7()
24157 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_0_7()
24158 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_0_7()
24161 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_0_7()
24163 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_0_7()
24164 std::int32_t* result_chunk = result; in gemm_i32_1_0_7()
24165 std::int32_t* mul_result_chunk = result; in gemm_i32_1_0_7()
24166 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_0_7()
24202 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_1_1_0()
24203 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_1_1_0()
24204 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_1_1_0()
24205 std::int32_t result_stride) { in gemm_i32_1_1_0()
24206 const std::int32_t row_chunks = m / 3; in gemm_i32_1_1_0()
24207 const std::int32_t col_chunks = n / 3; in gemm_i32_1_1_0()
24208 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_1_0()
24209 const std::int32_t chunk_size = k * 3; in gemm_i32_1_1_0()
24210 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_1_0()
24211 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_1_0()
24215 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_1_0()
24216 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_1_0()
24217 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_1_0()
24218 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_1_0()
24221 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_1_0()
24223 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_1_0()
24224 std::int32_t* result_chunk = result; in gemm_i32_1_1_0()
24225 std::int32_t* mul_result_chunk = result; in gemm_i32_1_1_0()
24226 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_1_0()
24269 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_1_1_1()
24270 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_1_1_1()
24271 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_1_1_1()
24272 std::int32_t result_stride) { in gemm_i32_1_1_1()
24273 const std::int32_t row_chunks = m / 3; in gemm_i32_1_1_1()
24274 const std::int32_t col_chunks = n / 3; in gemm_i32_1_1_1()
24275 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_1_1()
24276 const std::int32_t chunk_size = k * 3; in gemm_i32_1_1_1()
24277 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_1_1()
24278 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_1_1()
24282 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_1_1()
24283 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_1_1()
24284 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_1_1()
24285 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_1_1()
24288 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_1_1()
24290 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_1_1()
24291 std::int32_t* result_chunk = result; in gemm_i32_1_1_1()
24292 std::int32_t* mul_result_chunk = result; in gemm_i32_1_1_1()
24293 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_1_1()
24336 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_1_1_2()
24337 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_1_1_2()
24338 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_1_1_2()
24339 std::int32_t result_stride) { in gemm_i32_1_1_2()
24340 const std::int32_t row_chunks = m / 3; in gemm_i32_1_1_2()
24341 const std::int32_t col_chunks = n / 3; in gemm_i32_1_1_2()
24342 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_1_2()
24343 const std::int32_t chunk_size = k * 3; in gemm_i32_1_1_2()
24344 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_1_2()
24345 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_1_2()
24349 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_1_2()
24350 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_1_2()
24351 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_1_2()
24352 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_1_2()
24355 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_1_2()
24357 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_1_2()
24358 std::int32_t* result_chunk = result; in gemm_i32_1_1_2()
24359 std::int32_t* mul_result_chunk = result; in gemm_i32_1_1_2()
24360 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_1_2()
24403 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_1_1_3()
24404 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_1_1_3()
24405 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_1_1_3()
24406 std::int32_t result_stride) { in gemm_i32_1_1_3()
24407 const std::int32_t row_chunks = m / 3; in gemm_i32_1_1_3()
24408 const std::int32_t col_chunks = n / 3; in gemm_i32_1_1_3()
24409 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_1_3()
24410 const std::int32_t chunk_size = k * 3; in gemm_i32_1_1_3()
24411 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_1_3()
24412 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_1_3()
24416 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_1_3()
24417 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_1_3()
24418 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_1_3()
24419 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_1_3()
24422 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_1_3()
24424 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_1_3()
24425 std::int32_t* result_chunk = result; in gemm_i32_1_1_3()
24426 std::int32_t* mul_result_chunk = result; in gemm_i32_1_1_3()
24427 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_1_3()
24470 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_1_1_4()
24471 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_1_1_4()
24472 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_1_1_4()
24473 std::int32_t result_stride) { in gemm_i32_1_1_4()
24474 const std::int32_t row_chunks = m / 3; in gemm_i32_1_1_4()
24475 const std::int32_t col_chunks = n / 3; in gemm_i32_1_1_4()
24476 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_1_4()
24477 const std::int32_t chunk_size = k * 3; in gemm_i32_1_1_4()
24478 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_1_4()
24479 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_1_4()
24483 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_1_4()
24484 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_1_4()
24485 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_1_4()
24486 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_1_4()
24489 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_1_4()
24491 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_1_4()
24492 std::int32_t* result_chunk = result; in gemm_i32_1_1_4()
24493 std::int32_t* mul_result_chunk = result; in gemm_i32_1_1_4()
24494 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_1_4()
24537 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_1_1_5()
24538 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_1_1_5()
24539 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_1_1_5()
24540 std::int32_t result_stride) { in gemm_i32_1_1_5()
24541 const std::int32_t row_chunks = m / 3; in gemm_i32_1_1_5()
24542 const std::int32_t col_chunks = n / 3; in gemm_i32_1_1_5()
24543 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_1_5()
24544 const std::int32_t chunk_size = k * 3; in gemm_i32_1_1_5()
24545 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_1_5()
24546 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_1_5()
24550 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_1_5()
24551 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_1_5()
24552 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_1_5()
24553 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_1_5()
24556 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_1_5()
24558 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_1_5()
24559 std::int32_t* result_chunk = result; in gemm_i32_1_1_5()
24560 std::int32_t* mul_result_chunk = result; in gemm_i32_1_1_5()
24561 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_1_5()
24604 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_1_1_6()
24605 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_1_1_6()
24606 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_1_1_6()
24607 std::int32_t result_stride) { in gemm_i32_1_1_6()
24608 const std::int32_t row_chunks = m / 3; in gemm_i32_1_1_6()
24609 const std::int32_t col_chunks = n / 3; in gemm_i32_1_1_6()
24610 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_1_6()
24611 const std::int32_t chunk_size = k * 3; in gemm_i32_1_1_6()
24612 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_1_6()
24613 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_1_6()
24617 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_1_6()
24618 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_1_6()
24619 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_1_6()
24620 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_1_6()
24623 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_1_6()
24625 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_1_6()
24626 std::int32_t* result_chunk = result; in gemm_i32_1_1_6()
24627 std::int32_t* mul_result_chunk = result; in gemm_i32_1_1_6()
24628 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_1_6()
24671 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_1_1_7()
24672 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_1_1_7()
24673 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_1_1_7()
24674 std::int32_t result_stride) { in gemm_i32_1_1_7()
24675 const std::int32_t row_chunks = m / 3; in gemm_i32_1_1_7()
24676 const std::int32_t col_chunks = n / 3; in gemm_i32_1_1_7()
24677 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_1_7()
24678 const std::int32_t chunk_size = k * 3; in gemm_i32_1_1_7()
24679 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_1_7()
24680 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_1_7()
24684 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_1_7()
24685 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_1_7()
24686 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_1_7()
24687 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_1_7()
24690 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_1_7()
24692 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_1_7()
24693 std::int32_t* result_chunk = result; in gemm_i32_1_1_7()
24694 std::int32_t* mul_result_chunk = result; in gemm_i32_1_1_7()
24695 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_1_7()
24738 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_1_2_0()
24739 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_1_2_0()
24740 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_1_2_0()
24741 std::int32_t result_stride) { in gemm_i32_1_2_0()
24742 const std::int32_t row_chunks = m / 3; in gemm_i32_1_2_0()
24743 const std::int32_t col_chunks = n / 3; in gemm_i32_1_2_0()
24744 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_2_0()
24745 const std::int32_t chunk_size = k * 3; in gemm_i32_1_2_0()
24746 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_2_0()
24747 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_2_0()
24751 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_2_0()
24752 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_2_0()
24753 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_2_0()
24754 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_2_0()
24757 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_2_0()
24759 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_2_0()
24760 std::int32_t* result_chunk = result; in gemm_i32_1_2_0()
24761 std::int32_t* mul_result_chunk = result; in gemm_i32_1_2_0()
24762 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_2_0()
24805 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_1_2_1()
24806 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_1_2_1()
24807 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_1_2_1()
24808 std::int32_t result_stride) { in gemm_i32_1_2_1()
24809 const std::int32_t row_chunks = m / 3; in gemm_i32_1_2_1()
24810 const std::int32_t col_chunks = n / 3; in gemm_i32_1_2_1()
24811 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_2_1()
24812 const std::int32_t chunk_size = k * 3; in gemm_i32_1_2_1()
24813 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_2_1()
24814 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_2_1()
24818 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_2_1()
24819 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_2_1()
24820 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_2_1()
24821 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_2_1()
24824 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_2_1()
24826 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_2_1()
24827 std::int32_t* result_chunk = result; in gemm_i32_1_2_1()
24828 std::int32_t* mul_result_chunk = result; in gemm_i32_1_2_1()
24829 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_2_1()
24872 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_1_2_2()
24873 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_1_2_2()
24874 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_1_2_2()
24875 std::int32_t result_stride) { in gemm_i32_1_2_2()
24876 const std::int32_t row_chunks = m / 3; in gemm_i32_1_2_2()
24877 const std::int32_t col_chunks = n / 3; in gemm_i32_1_2_2()
24878 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_2_2()
24879 const std::int32_t chunk_size = k * 3; in gemm_i32_1_2_2()
24880 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_2_2()
24881 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_2_2()
24885 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_2_2()
24886 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_2_2()
24887 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_2_2()
24888 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_2_2()
24891 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_2_2()
24893 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_2_2()
24894 std::int32_t* result_chunk = result; in gemm_i32_1_2_2()
24895 std::int32_t* mul_result_chunk = result; in gemm_i32_1_2_2()
24896 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_2_2()
24939 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_1_2_3()
24940 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_1_2_3()
24941 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_1_2_3()
24942 std::int32_t result_stride) { in gemm_i32_1_2_3()
24943 const std::int32_t row_chunks = m / 3; in gemm_i32_1_2_3()
24944 const std::int32_t col_chunks = n / 3; in gemm_i32_1_2_3()
24945 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_2_3()
24946 const std::int32_t chunk_size = k * 3; in gemm_i32_1_2_3()
24947 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_2_3()
24948 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_2_3()
24952 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_2_3()
24953 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_2_3()
24954 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_2_3()
24955 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_2_3()
24958 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_2_3()
24960 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_2_3()
24961 std::int32_t* result_chunk = result; in gemm_i32_1_2_3()
24962 std::int32_t* mul_result_chunk = result; in gemm_i32_1_2_3()
24963 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_2_3()
25006 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_1_2_4()
25007 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_1_2_4()
25008 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_1_2_4()
25009 std::int32_t result_stride) { in gemm_i32_1_2_4()
25010 const std::int32_t row_chunks = m / 3; in gemm_i32_1_2_4()
25011 const std::int32_t col_chunks = n / 3; in gemm_i32_1_2_4()
25012 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_2_4()
25013 const std::int32_t chunk_size = k * 3; in gemm_i32_1_2_4()
25014 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_2_4()
25015 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_2_4()
25019 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_2_4()
25020 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_2_4()
25021 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_2_4()
25022 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_2_4()
25025 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_2_4()
25027 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_2_4()
25028 std::int32_t* result_chunk = result; in gemm_i32_1_2_4()
25029 std::int32_t* mul_result_chunk = result; in gemm_i32_1_2_4()
25030 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_2_4()
25073 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_1_2_5()
25074 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_1_2_5()
25075 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_1_2_5()
25076 std::int32_t result_stride) { in gemm_i32_1_2_5()
25077 const std::int32_t row_chunks = m / 3; in gemm_i32_1_2_5()
25078 const std::int32_t col_chunks = n / 3; in gemm_i32_1_2_5()
25079 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_2_5()
25080 const std::int32_t chunk_size = k * 3; in gemm_i32_1_2_5()
25081 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_2_5()
25082 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_2_5()
25086 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_2_5()
25087 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_2_5()
25088 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_2_5()
25089 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_2_5()
25092 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_2_5()
25094 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_2_5()
25095 std::int32_t* result_chunk = result; in gemm_i32_1_2_5()
25096 std::int32_t* mul_result_chunk = result; in gemm_i32_1_2_5()
25097 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_2_5()
25140 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_1_2_6()
25141 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_1_2_6()
25142 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_1_2_6()
25143 std::int32_t result_stride) { in gemm_i32_1_2_6()
25144 const std::int32_t row_chunks = m / 3; in gemm_i32_1_2_6()
25145 const std::int32_t col_chunks = n / 3; in gemm_i32_1_2_6()
25146 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_2_6()
25147 const std::int32_t chunk_size = k * 3; in gemm_i32_1_2_6()
25148 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_2_6()
25149 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_2_6()
25153 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_2_6()
25154 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_2_6()
25155 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_2_6()
25156 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_2_6()
25159 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_2_6()
25161 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_2_6()
25162 std::int32_t* result_chunk = result; in gemm_i32_1_2_6()
25163 std::int32_t* mul_result_chunk = result; in gemm_i32_1_2_6()
25164 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_2_6()
25207 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_1_2_7()
25208 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_1_2_7()
25209 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_1_2_7()
25210 std::int32_t result_stride) { in gemm_i32_1_2_7()
25211 const std::int32_t row_chunks = m / 3; in gemm_i32_1_2_7()
25212 const std::int32_t col_chunks = n / 3; in gemm_i32_1_2_7()
25213 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_1_2_7()
25214 const std::int32_t chunk_size = k * 3; in gemm_i32_1_2_7()
25215 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_1_2_7()
25216 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_1_2_7()
25220 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_1_2_7()
25221 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_1_2_7()
25222 std::int32_t* zipped_lhs_1_offsets = in gemm_i32_1_2_7()
25223 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_i32_1_2_7()
25226 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_1_2_7()
25228 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_1_2_7()
25229 std::int32_t* result_chunk = result; in gemm_i32_1_2_7()
25230 std::int32_t* mul_result_chunk = result; in gemm_i32_1_2_7()
25231 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_1_2_7()
25274 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_2_0_0()
25275 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_2_0_0()
25276 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_2_0_0()
25277 std::int32_t result_stride) { in gemm_i32_2_0_0()
25278 const std::int32_t row_chunks = m / 3; in gemm_i32_2_0_0()
25279 const std::int32_t col_chunks = n / 3; in gemm_i32_2_0_0()
25280 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_0_0()
25281 const std::int32_t chunk_size = k * 3; in gemm_i32_2_0_0()
25282 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_0_0()
25283 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_0_0()
25287 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_0_0()
25288 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_0_0()
25289 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_0_0()
25290 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_0_0()
25293 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_0_0()
25295 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_0_0()
25296 std::int32_t* result_chunk = result; in gemm_i32_2_0_0()
25297 std::int32_t* mul_result_chunk = result; in gemm_i32_2_0_0()
25298 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_0_0()
25334 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_2_0_1()
25335 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_2_0_1()
25336 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_2_0_1()
25337 std::int32_t result_stride) { in gemm_i32_2_0_1()
25338 const std::int32_t row_chunks = m / 3; in gemm_i32_2_0_1()
25339 const std::int32_t col_chunks = n / 3; in gemm_i32_2_0_1()
25340 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_0_1()
25341 const std::int32_t chunk_size = k * 3; in gemm_i32_2_0_1()
25342 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_0_1()
25343 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_0_1()
25347 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_0_1()
25348 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_0_1()
25349 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_0_1()
25350 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_0_1()
25353 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_0_1()
25355 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_0_1()
25356 std::int32_t* result_chunk = result; in gemm_i32_2_0_1()
25357 std::int32_t* mul_result_chunk = result; in gemm_i32_2_0_1()
25358 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_0_1()
25394 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_2_0_2()
25395 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_2_0_2()
25396 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_2_0_2()
25397 std::int32_t result_stride) { in gemm_i32_2_0_2()
25398 const std::int32_t row_chunks = m / 3; in gemm_i32_2_0_2()
25399 const std::int32_t col_chunks = n / 3; in gemm_i32_2_0_2()
25400 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_0_2()
25401 const std::int32_t chunk_size = k * 3; in gemm_i32_2_0_2()
25402 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_0_2()
25403 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_0_2()
25407 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_0_2()
25408 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_0_2()
25409 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_0_2()
25410 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_0_2()
25413 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_0_2()
25415 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_0_2()
25416 std::int32_t* result_chunk = result; in gemm_i32_2_0_2()
25417 std::int32_t* mul_result_chunk = result; in gemm_i32_2_0_2()
25418 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_0_2()
25454 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_2_0_3()
25455 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_2_0_3()
25456 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_2_0_3()
25457 std::int32_t result_stride) { in gemm_i32_2_0_3()
25458 const std::int32_t row_chunks = m / 3; in gemm_i32_2_0_3()
25459 const std::int32_t col_chunks = n / 3; in gemm_i32_2_0_3()
25460 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_0_3()
25461 const std::int32_t chunk_size = k * 3; in gemm_i32_2_0_3()
25462 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_0_3()
25463 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_0_3()
25467 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_0_3()
25468 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_0_3()
25469 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_0_3()
25470 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_0_3()
25473 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_0_3()
25475 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_0_3()
25476 std::int32_t* result_chunk = result; in gemm_i32_2_0_3()
25477 std::int32_t* mul_result_chunk = result; in gemm_i32_2_0_3()
25478 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_0_3()
25514 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_2_0_4()
25515 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_2_0_4()
25516 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_2_0_4()
25517 std::int32_t result_stride) { in gemm_i32_2_0_4()
25518 const std::int32_t row_chunks = m / 3; in gemm_i32_2_0_4()
25519 const std::int32_t col_chunks = n / 3; in gemm_i32_2_0_4()
25520 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_0_4()
25521 const std::int32_t chunk_size = k * 3; in gemm_i32_2_0_4()
25522 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_0_4()
25523 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_0_4()
25527 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_0_4()
25528 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_0_4()
25529 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_0_4()
25530 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_0_4()
25533 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_0_4()
25535 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_0_4()
25536 std::int32_t* result_chunk = result; in gemm_i32_2_0_4()
25537 std::int32_t* mul_result_chunk = result; in gemm_i32_2_0_4()
25538 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_0_4()
25574 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_2_0_5()
25575 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_2_0_5()
25576 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_2_0_5()
25577 std::int32_t result_stride) { in gemm_i32_2_0_5()
25578 const std::int32_t row_chunks = m / 3; in gemm_i32_2_0_5()
25579 const std::int32_t col_chunks = n / 3; in gemm_i32_2_0_5()
25580 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_0_5()
25581 const std::int32_t chunk_size = k * 3; in gemm_i32_2_0_5()
25582 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_0_5()
25583 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_0_5()
25587 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_0_5()
25588 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_0_5()
25589 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_0_5()
25590 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_0_5()
25593 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_0_5()
25595 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_0_5()
25596 std::int32_t* result_chunk = result; in gemm_i32_2_0_5()
25597 std::int32_t* mul_result_chunk = result; in gemm_i32_2_0_5()
25598 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_0_5()
25634 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_2_0_6()
25635 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_2_0_6()
25636 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_2_0_6()
25637 std::int32_t result_stride) { in gemm_i32_2_0_6()
25638 const std::int32_t row_chunks = m / 3; in gemm_i32_2_0_6()
25639 const std::int32_t col_chunks = n / 3; in gemm_i32_2_0_6()
25640 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_0_6()
25641 const std::int32_t chunk_size = k * 3; in gemm_i32_2_0_6()
25642 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_0_6()
25643 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_0_6()
25647 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_0_6()
25648 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_0_6()
25649 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_0_6()
25650 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_0_6()
25653 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_0_6()
25655 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_0_6()
25656 std::int32_t* result_chunk = result; in gemm_i32_2_0_6()
25657 std::int32_t* mul_result_chunk = result; in gemm_i32_2_0_6()
25658 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_0_6()
25694 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_2_0_7()
25695 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_2_0_7()
25696 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_2_0_7()
25697 std::int32_t result_stride) { in gemm_i32_2_0_7()
25698 const std::int32_t row_chunks = m / 3; in gemm_i32_2_0_7()
25699 const std::int32_t col_chunks = n / 3; in gemm_i32_2_0_7()
25700 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_0_7()
25701 const std::int32_t chunk_size = k * 3; in gemm_i32_2_0_7()
25702 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_0_7()
25703 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_0_7()
25707 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_0_7()
25708 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_0_7()
25709 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_0_7()
25710 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_0_7()
25713 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_0_7()
25715 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_0_7()
25716 std::int32_t* result_chunk = result; in gemm_i32_2_0_7()
25717 std::int32_t* mul_result_chunk = result; in gemm_i32_2_0_7()
25718 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_0_7()
25754 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_2_1_0()
25755 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_2_1_0()
25756 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_2_1_0()
25757 std::int32_t result_stride) { in gemm_i32_2_1_0()
25758 const std::int32_t row_chunks = m / 3; in gemm_i32_2_1_0()
25759 const std::int32_t col_chunks = n / 3; in gemm_i32_2_1_0()
25760 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_1_0()
25761 const std::int32_t chunk_size = k * 3; in gemm_i32_2_1_0()
25762 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_1_0()
25763 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_1_0()
25767 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_1_0()
25768 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_1_0()
25769 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_1_0()
25770 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_1_0()
25773 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_1_0()
25775 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_1_0()
25776 std::int32_t* result_chunk = result; in gemm_i32_2_1_0()
25777 std::int32_t* mul_result_chunk = result; in gemm_i32_2_1_0()
25778 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_1_0()
25821 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_2_1_1()
25822 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_2_1_1()
25823 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_2_1_1()
25824 std::int32_t result_stride) { in gemm_i32_2_1_1()
25825 const std::int32_t row_chunks = m / 3; in gemm_i32_2_1_1()
25826 const std::int32_t col_chunks = n / 3; in gemm_i32_2_1_1()
25827 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_1_1()
25828 const std::int32_t chunk_size = k * 3; in gemm_i32_2_1_1()
25829 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_1_1()
25830 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_1_1()
25834 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_1_1()
25835 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_1_1()
25836 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_1_1()
25837 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_1_1()
25840 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_1_1()
25842 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_1_1()
25843 std::int32_t* result_chunk = result; in gemm_i32_2_1_1()
25844 std::int32_t* mul_result_chunk = result; in gemm_i32_2_1_1()
25845 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_1_1()
25888 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_2_1_2()
25889 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_2_1_2()
25890 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_2_1_2()
25891 std::int32_t result_stride) { in gemm_i32_2_1_2()
25892 const std::int32_t row_chunks = m / 3; in gemm_i32_2_1_2()
25893 const std::int32_t col_chunks = n / 3; in gemm_i32_2_1_2()
25894 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_1_2()
25895 const std::int32_t chunk_size = k * 3; in gemm_i32_2_1_2()
25896 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_1_2()
25897 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_1_2()
25901 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_1_2()
25902 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_1_2()
25903 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_1_2()
25904 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_1_2()
25907 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_1_2()
25909 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_1_2()
25910 std::int32_t* result_chunk = result; in gemm_i32_2_1_2()
25911 std::int32_t* mul_result_chunk = result; in gemm_i32_2_1_2()
25912 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_1_2()
25955 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_2_1_3()
25956 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_2_1_3()
25957 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_2_1_3()
25958 std::int32_t result_stride) { in gemm_i32_2_1_3()
25959 const std::int32_t row_chunks = m / 3; in gemm_i32_2_1_3()
25960 const std::int32_t col_chunks = n / 3; in gemm_i32_2_1_3()
25961 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_1_3()
25962 const std::int32_t chunk_size = k * 3; in gemm_i32_2_1_3()
25963 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_1_3()
25964 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_1_3()
25968 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_1_3()
25969 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_1_3()
25970 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_1_3()
25971 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_1_3()
25974 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_1_3()
25976 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_1_3()
25977 std::int32_t* result_chunk = result; in gemm_i32_2_1_3()
25978 std::int32_t* mul_result_chunk = result; in gemm_i32_2_1_3()
25979 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_1_3()
26022 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_2_1_4()
26023 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_2_1_4()
26024 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_2_1_4()
26025 std::int32_t result_stride) { in gemm_i32_2_1_4()
26026 const std::int32_t row_chunks = m / 3; in gemm_i32_2_1_4()
26027 const std::int32_t col_chunks = n / 3; in gemm_i32_2_1_4()
26028 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_1_4()
26029 const std::int32_t chunk_size = k * 3; in gemm_i32_2_1_4()
26030 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_1_4()
26031 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_1_4()
26035 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_1_4()
26036 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_1_4()
26037 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_1_4()
26038 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_1_4()
26041 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_1_4()
26043 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_1_4()
26044 std::int32_t* result_chunk = result; in gemm_i32_2_1_4()
26045 std::int32_t* mul_result_chunk = result; in gemm_i32_2_1_4()
26046 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_1_4()
26089 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_2_1_5()
26090 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_2_1_5()
26091 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_2_1_5()
26092 std::int32_t result_stride) { in gemm_i32_2_1_5()
26093 const std::int32_t row_chunks = m / 3; in gemm_i32_2_1_5()
26094 const std::int32_t col_chunks = n / 3; in gemm_i32_2_1_5()
26095 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_1_5()
26096 const std::int32_t chunk_size = k * 3; in gemm_i32_2_1_5()
26097 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_1_5()
26098 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_1_5()
26102 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_1_5()
26103 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_1_5()
26104 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_1_5()
26105 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_1_5()
26108 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_1_5()
26110 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_1_5()
26111 std::int32_t* result_chunk = result; in gemm_i32_2_1_5()
26112 std::int32_t* mul_result_chunk = result; in gemm_i32_2_1_5()
26113 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_1_5()
26156 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_2_1_6()
26157 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_2_1_6()
26158 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_2_1_6()
26159 std::int32_t result_stride) { in gemm_i32_2_1_6()
26160 const std::int32_t row_chunks = m / 3; in gemm_i32_2_1_6()
26161 const std::int32_t col_chunks = n / 3; in gemm_i32_2_1_6()
26162 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_1_6()
26163 const std::int32_t chunk_size = k * 3; in gemm_i32_2_1_6()
26164 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_1_6()
26165 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_1_6()
26169 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_1_6()
26170 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_1_6()
26171 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_1_6()
26172 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_1_6()
26175 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_1_6()
26177 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_1_6()
26178 std::int32_t* result_chunk = result; in gemm_i32_2_1_6()
26179 std::int32_t* mul_result_chunk = result; in gemm_i32_2_1_6()
26180 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_1_6()
26223 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_2_1_7()
26224 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_2_1_7()
26225 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_2_1_7()
26226 std::int32_t result_stride) { in gemm_i32_2_1_7()
26227 const std::int32_t row_chunks = m / 3; in gemm_i32_2_1_7()
26228 const std::int32_t col_chunks = n / 3; in gemm_i32_2_1_7()
26229 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_1_7()
26230 const std::int32_t chunk_size = k * 3; in gemm_i32_2_1_7()
26231 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_1_7()
26232 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_1_7()
26236 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_1_7()
26237 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_1_7()
26238 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_1_7()
26239 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_1_7()
26242 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_1_7()
26244 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_1_7()
26245 std::int32_t* result_chunk = result; in gemm_i32_2_1_7()
26246 std::int32_t* mul_result_chunk = result; in gemm_i32_2_1_7()
26247 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_1_7()
26290 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_2_2_0()
26291 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_2_2_0()
26292 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_2_2_0()
26293 std::int32_t result_stride) { in gemm_i32_2_2_0()
26294 const std::int32_t row_chunks = m / 3; in gemm_i32_2_2_0()
26295 const std::int32_t col_chunks = n / 3; in gemm_i32_2_2_0()
26296 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_2_0()
26297 const std::int32_t chunk_size = k * 3; in gemm_i32_2_2_0()
26298 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_2_0()
26299 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_2_0()
26303 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_2_0()
26304 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_2_0()
26305 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_2_0()
26306 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_2_0()
26309 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_2_0()
26311 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_2_0()
26312 std::int32_t* result_chunk = result; in gemm_i32_2_2_0()
26313 std::int32_t* mul_result_chunk = result; in gemm_i32_2_2_0()
26314 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_2_0()
26357 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_2_2_1()
26358 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_2_2_1()
26359 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_2_2_1()
26360 std::int32_t result_stride) { in gemm_i32_2_2_1()
26361 const std::int32_t row_chunks = m / 3; in gemm_i32_2_2_1()
26362 const std::int32_t col_chunks = n / 3; in gemm_i32_2_2_1()
26363 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_2_1()
26364 const std::int32_t chunk_size = k * 3; in gemm_i32_2_2_1()
26365 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_2_1()
26366 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_2_1()
26370 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_2_1()
26371 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_2_1()
26372 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_2_1()
26373 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_2_1()
26376 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_2_1()
26378 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_2_1()
26379 std::int32_t* result_chunk = result; in gemm_i32_2_2_1()
26380 std::int32_t* mul_result_chunk = result; in gemm_i32_2_2_1()
26381 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_2_1()
26424 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_2_2_2()
26425 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_2_2_2()
26426 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_2_2_2()
26427 std::int32_t result_stride) { in gemm_i32_2_2_2()
26428 const std::int32_t row_chunks = m / 3; in gemm_i32_2_2_2()
26429 const std::int32_t col_chunks = n / 3; in gemm_i32_2_2_2()
26430 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_2_2()
26431 const std::int32_t chunk_size = k * 3; in gemm_i32_2_2_2()
26432 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_2_2()
26433 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_2_2()
26437 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_2_2()
26438 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_2_2()
26439 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_2_2()
26440 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_2_2()
26443 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_2_2()
26445 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_2_2()
26446 std::int32_t* result_chunk = result; in gemm_i32_2_2_2()
26447 std::int32_t* mul_result_chunk = result; in gemm_i32_2_2_2()
26448 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_2_2()
26491 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_2_2_3()
26492 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_2_2_3()
26493 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_2_2_3()
26494 std::int32_t result_stride) { in gemm_i32_2_2_3()
26495 const std::int32_t row_chunks = m / 3; in gemm_i32_2_2_3()
26496 const std::int32_t col_chunks = n / 3; in gemm_i32_2_2_3()
26497 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_2_3()
26498 const std::int32_t chunk_size = k * 3; in gemm_i32_2_2_3()
26499 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_2_3()
26500 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_2_3()
26504 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_2_3()
26505 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_2_3()
26506 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_2_3()
26507 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_2_3()
26510 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_2_3()
26512 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_2_3()
26513 std::int32_t* result_chunk = result; in gemm_i32_2_2_3()
26514 std::int32_t* mul_result_chunk = result; in gemm_i32_2_2_3()
26515 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_2_3()
26558 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_2_2_4()
26559 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_2_2_4()
26560 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_2_2_4()
26561 std::int32_t result_stride) { in gemm_i32_2_2_4()
26562 const std::int32_t row_chunks = m / 3; in gemm_i32_2_2_4()
26563 const std::int32_t col_chunks = n / 3; in gemm_i32_2_2_4()
26564 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_2_4()
26565 const std::int32_t chunk_size = k * 3; in gemm_i32_2_2_4()
26566 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_2_4()
26567 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_2_4()
26571 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_2_4()
26572 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_2_4()
26573 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_2_4()
26574 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_2_4()
26577 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_2_4()
26579 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_2_4()
26580 std::int32_t* result_chunk = result; in gemm_i32_2_2_4()
26581 std::int32_t* mul_result_chunk = result; in gemm_i32_2_2_4()
26582 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_2_4()
26625 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_2_2_5()
26626 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_2_2_5()
26627 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_2_2_5()
26628 std::int32_t result_stride) { in gemm_i32_2_2_5()
26629 const std::int32_t row_chunks = m / 3; in gemm_i32_2_2_5()
26630 const std::int32_t col_chunks = n / 3; in gemm_i32_2_2_5()
26631 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_2_5()
26632 const std::int32_t chunk_size = k * 3; in gemm_i32_2_2_5()
26633 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_2_5()
26634 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_2_5()
26638 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_2_5()
26639 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_2_5()
26640 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_2_5()
26641 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_2_5()
26644 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_2_5()
26646 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_2_5()
26647 std::int32_t* result_chunk = result; in gemm_i32_2_2_5()
26648 std::int32_t* mul_result_chunk = result; in gemm_i32_2_2_5()
26649 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_2_5()
26692 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_2_2_6()
26693 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_2_2_6()
26694 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_2_2_6()
26695 std::int32_t result_stride) { in gemm_i32_2_2_6()
26696 const std::int32_t row_chunks = m / 3; in gemm_i32_2_2_6()
26697 const std::int32_t col_chunks = n / 3; in gemm_i32_2_2_6()
26698 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_2_6()
26699 const std::int32_t chunk_size = k * 3; in gemm_i32_2_2_6()
26700 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_2_6()
26701 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_2_6()
26705 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_2_6()
26706 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_2_6()
26707 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_2_6()
26708 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_2_6()
26711 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_2_6()
26713 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_2_6()
26714 std::int32_t* result_chunk = result; in gemm_i32_2_2_6()
26715 std::int32_t* mul_result_chunk = result; in gemm_i32_2_2_6()
26716 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_2_6()
26759 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_2_2_7()
26760 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_2_2_7()
26761 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_2_2_7()
26762 std::int32_t result_stride) { in gemm_i32_2_2_7()
26763 const std::int32_t row_chunks = m / 3; in gemm_i32_2_2_7()
26764 const std::int32_t col_chunks = n / 3; in gemm_i32_2_2_7()
26765 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_i32_2_2_7()
26766 const std::int32_t chunk_size = k * 3; in gemm_i32_2_2_7()
26767 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_i32_2_2_7()
26768 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_i32_2_2_7()
26772 std::int32_t* zipped_lhs_3_offsets = in gemm_i32_2_2_7()
26773 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_i32_2_2_7()
26774 std::int32_t* zipped_lhs_2_offsets = in gemm_i32_2_2_7()
26775 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_i32_2_2_7()
26778 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_i32_2_2_7()
26780 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_i32_2_2_7()
26781 std::int32_t* result_chunk = result; in gemm_i32_2_2_7()
26782 std::int32_t* mul_result_chunk = result; in gemm_i32_2_2_7()
26783 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_i32_2_2_7()
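The reinterpret_cast lines matched above carve per-row offset pointers out of the packed LHS buffer: the full-chunk pointer sits after padded_k * 3 bytes, while the gemm_i32_1_* and gemm_i32_2_* families also take a pointer after padded_k * 1 or padded_k * 2 bytes for their 1- or 2-row leftover chunk. Below is a minimal sketch of that pointer arithmetic, assuming a packed layout of rows * padded_k zipped bytes followed by rows std::int32_t offsets (the helper name is hypothetical):

#include <cstdint>

// Assumed layout: `rows * padded_k` zipped bytes, then `rows` std::int32_t
// per-row offsets. rows is 3 for full chunks, 1 or 2 for the leftover chunk.
inline std::int32_t* zipped_lhs_row_offsets(std::uint8_t* zipped_lhs,
                                            std::int32_t padded_k,
                                            std::int32_t rows) {
  // padded_k is a multiple of 8, so the cast target stays 4-byte aligned
  // whenever zipped_lhs itself is suitably aligned.
  return reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * rows);
}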
26826 const std::uint8_t* rhs, std::int32_t m, in gemm_f_0_0_0_aligned()
26827 std::int32_t n, std::int32_t k, in gemm_f_0_0_0_aligned()
26828 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_0_0_0_aligned()
26830 std::int32_t result_stride) { in gemm_f_0_0_0_aligned()
26831 const std::int32_t row_chunks = m / 3; in gemm_f_0_0_0_aligned()
26832 const std::int32_t col_chunks = n / 3; in gemm_f_0_0_0_aligned()
26833 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_0_0_aligned()
26834 const std::int32_t chunk_size = k * 3; in gemm_f_0_0_0_aligned()
26835 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_0_0_aligned()
26836 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_0_0_aligned()
26840 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_0_0_aligned()
26841 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_0_0_aligned()
26844 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_0_0_aligned()
26846 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_0_0_aligned()
26849 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_0_0_aligned()
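The gemm_f_*_aligned blocks that follow repeat the same std::int32_t setup as the integer kernels above; in this gemm_f_0_* family only zipped_lhs_3_offsets is carved out, presumably because there is no leftover-row chunk to handle, and mul_result_chunk_stride_bytes is still result_stride * 4, which gives the same byte stride since the floating-point results are 4 bytes wide like std::int32_t. For result_stride = 512, for instance, result_chunk_stride is 512 * 3 = 1536 elements (three rows down) and mul_result_chunk_stride_bytes is 512 * 4 = 2048 bytes (one row, in bytes).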
26874 const std::uint8_t* rhs, std::int32_t m, in gemm_f_0_0_1_aligned()
26875 std::int32_t n, std::int32_t k, in gemm_f_0_0_1_aligned()
26876 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_0_0_1_aligned()
26878 std::int32_t result_stride) { in gemm_f_0_0_1_aligned()
26879 const std::int32_t row_chunks = m / 3; in gemm_f_0_0_1_aligned()
26880 const std::int32_t col_chunks = n / 3; in gemm_f_0_0_1_aligned()
26881 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_0_1_aligned()
26882 const std::int32_t chunk_size = k * 3; in gemm_f_0_0_1_aligned()
26883 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_0_1_aligned()
26884 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_0_1_aligned()
26888 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_0_1_aligned()
26889 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_0_1_aligned()
26892 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_0_1_aligned()
26894 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_0_1_aligned()
26897 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_0_1_aligned()
26922 const std::uint8_t* rhs, std::int32_t m, in gemm_f_0_0_2_aligned()
26923 std::int32_t n, std::int32_t k, in gemm_f_0_0_2_aligned()
26924 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_0_0_2_aligned()
26926 std::int32_t result_stride) { in gemm_f_0_0_2_aligned()
26927 const std::int32_t row_chunks = m / 3; in gemm_f_0_0_2_aligned()
26928 const std::int32_t col_chunks = n / 3; in gemm_f_0_0_2_aligned()
26929 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_0_2_aligned()
26930 const std::int32_t chunk_size = k * 3; in gemm_f_0_0_2_aligned()
26931 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_0_2_aligned()
26932 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_0_2_aligned()
26936 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_0_2_aligned()
26937 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_0_2_aligned()
26940 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_0_2_aligned()
26942 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_0_2_aligned()
26945 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_0_2_aligned()
26970 const std::uint8_t* rhs, std::int32_t m, in gemm_f_0_0_3_aligned()
26971 std::int32_t n, std::int32_t k, in gemm_f_0_0_3_aligned()
26972 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_0_0_3_aligned()
26974 std::int32_t result_stride) { in gemm_f_0_0_3_aligned()
26975 const std::int32_t row_chunks = m / 3; in gemm_f_0_0_3_aligned()
26976 const std::int32_t col_chunks = n / 3; in gemm_f_0_0_3_aligned()
26977 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_0_3_aligned()
26978 const std::int32_t chunk_size = k * 3; in gemm_f_0_0_3_aligned()
26979 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_0_3_aligned()
26980 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_0_3_aligned()
26984 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_0_3_aligned()
26985 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_0_3_aligned()
26988 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_0_3_aligned()
26990 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_0_3_aligned()
26993 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_0_3_aligned()
27018 const std::uint8_t* rhs, std::int32_t m, in gemm_f_0_0_4_aligned()
27019 std::int32_t n, std::int32_t k, in gemm_f_0_0_4_aligned()
27020 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_0_0_4_aligned()
27022 std::int32_t result_stride) { in gemm_f_0_0_4_aligned()
27023 const std::int32_t row_chunks = m / 3; in gemm_f_0_0_4_aligned()
27024 const std::int32_t col_chunks = n / 3; in gemm_f_0_0_4_aligned()
27025 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_0_4_aligned()
27026 const std::int32_t chunk_size = k * 3; in gemm_f_0_0_4_aligned()
27027 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_0_4_aligned()
27028 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_0_4_aligned()
27032 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_0_4_aligned()
27033 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_0_4_aligned()
27036 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_0_4_aligned()
27038 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_0_4_aligned()
27041 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_0_4_aligned()
27066 const std::uint8_t* rhs, std::int32_t m, in gemm_f_0_0_5_aligned()
27067 std::int32_t n, std::int32_t k, in gemm_f_0_0_5_aligned()
27068 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_0_0_5_aligned()
27070 std::int32_t result_stride) { in gemm_f_0_0_5_aligned()
27071 const std::int32_t row_chunks = m / 3; in gemm_f_0_0_5_aligned()
27072 const std::int32_t col_chunks = n / 3; in gemm_f_0_0_5_aligned()
27073 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_0_5_aligned()
27074 const std::int32_t chunk_size = k * 3; in gemm_f_0_0_5_aligned()
27075 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_0_5_aligned()
27076 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_0_5_aligned()
27080 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_0_5_aligned()
27081 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_0_5_aligned()
27084 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_0_5_aligned()
27086 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_0_5_aligned()
27089 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_0_5_aligned()
27114 const std::uint8_t* rhs, std::int32_t m, in gemm_f_0_0_6_aligned()
27115 std::int32_t n, std::int32_t k, in gemm_f_0_0_6_aligned()
27116 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_0_0_6_aligned()
27118 std::int32_t result_stride) { in gemm_f_0_0_6_aligned()
27119 const std::int32_t row_chunks = m / 3; in gemm_f_0_0_6_aligned()
27120 const std::int32_t col_chunks = n / 3; in gemm_f_0_0_6_aligned()
27121 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_0_6_aligned()
27122 const std::int32_t chunk_size = k * 3; in gemm_f_0_0_6_aligned()
27123 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_0_6_aligned()
27124 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_0_6_aligned()
27128 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_0_6_aligned()
27129 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_0_6_aligned()
27132 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_0_6_aligned()
27134 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_0_6_aligned()
27137 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_0_6_aligned()
27162 const std::uint8_t* rhs, std::int32_t m, in gemm_f_0_0_7_aligned()
27163 std::int32_t n, std::int32_t k, in gemm_f_0_0_7_aligned()
27164 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_0_0_7_aligned()
27166 std::int32_t result_stride) { in gemm_f_0_0_7_aligned()
27167 const std::int32_t row_chunks = m / 3; in gemm_f_0_0_7_aligned()
27168 const std::int32_t col_chunks = n / 3; in gemm_f_0_0_7_aligned()
27169 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_0_7_aligned()
27170 const std::int32_t chunk_size = k * 3; in gemm_f_0_0_7_aligned()
27171 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_0_7_aligned()
27172 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_0_7_aligned()
27176 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_0_7_aligned()
27177 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_0_7_aligned()
27180 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_0_7_aligned()
27182 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_0_7_aligned()
27185 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_0_7_aligned()
27210 const std::uint8_t* rhs, std::int32_t m, in gemm_f_0_1_0_aligned()
27211 std::int32_t n, std::int32_t k, in gemm_f_0_1_0_aligned()
27212 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_0_1_0_aligned()
27214 std::int32_t result_stride) { in gemm_f_0_1_0_aligned()
27215 const std::int32_t row_chunks = m / 3; in gemm_f_0_1_0_aligned()
27216 const std::int32_t col_chunks = n / 3; in gemm_f_0_1_0_aligned()
27217 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_1_0_aligned()
27218 const std::int32_t chunk_size = k * 3; in gemm_f_0_1_0_aligned()
27219 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_1_0_aligned()
27220 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_1_0_aligned()
27224 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_1_0_aligned()
27225 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_1_0_aligned()
27228 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_1_0_aligned()
27230 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_1_0_aligned()
27233 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_1_0_aligned()
27262 const std::uint8_t* rhs, std::int32_t m, in gemm_f_0_1_1_aligned()
27263 std::int32_t n, std::int32_t k, in gemm_f_0_1_1_aligned()
27264 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_0_1_1_aligned()
27266 std::int32_t result_stride) { in gemm_f_0_1_1_aligned()
27267 const std::int32_t row_chunks = m / 3; in gemm_f_0_1_1_aligned()
27268 const std::int32_t col_chunks = n / 3; in gemm_f_0_1_1_aligned()
27269 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_1_1_aligned()
27270 const std::int32_t chunk_size = k * 3; in gemm_f_0_1_1_aligned()
27271 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_1_1_aligned()
27272 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_1_1_aligned()
27276 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_1_1_aligned()
27277 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_1_1_aligned()
27280 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_1_1_aligned()
27282 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_1_1_aligned()
27285 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_1_1_aligned()
27314 const std::uint8_t* rhs, std::int32_t m, in gemm_f_0_1_2_aligned()
27315 std::int32_t n, std::int32_t k, in gemm_f_0_1_2_aligned()
27316 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_0_1_2_aligned()
27318 std::int32_t result_stride) { in gemm_f_0_1_2_aligned()
27319 const std::int32_t row_chunks = m / 3; in gemm_f_0_1_2_aligned()
27320 const std::int32_t col_chunks = n / 3; in gemm_f_0_1_2_aligned()
27321 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_1_2_aligned()
27322 const std::int32_t chunk_size = k * 3; in gemm_f_0_1_2_aligned()
27323 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_1_2_aligned()
27324 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_1_2_aligned()
27328 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_1_2_aligned()
27329 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_1_2_aligned()
27332 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_1_2_aligned()
27334 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_1_2_aligned()
27337 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_1_2_aligned()
27366 const std::uint8_t* rhs, std::int32_t m, in gemm_f_0_1_3_aligned()
27367 std::int32_t n, std::int32_t k, in gemm_f_0_1_3_aligned()
27368 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_0_1_3_aligned()
27370 std::int32_t result_stride) { in gemm_f_0_1_3_aligned()
27371 const std::int32_t row_chunks = m / 3; in gemm_f_0_1_3_aligned()
27372 const std::int32_t col_chunks = n / 3; in gemm_f_0_1_3_aligned()
27373 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_1_3_aligned()
27374 const std::int32_t chunk_size = k * 3; in gemm_f_0_1_3_aligned()
27375 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_1_3_aligned()
27376 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_1_3_aligned()
27380 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_1_3_aligned()
27381 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_1_3_aligned()
27384 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_1_3_aligned()
27386 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_1_3_aligned()
27389 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_1_3_aligned()
27418 const std::uint8_t* rhs, std::int32_t m, in gemm_f_0_1_4_aligned()
27419 std::int32_t n, std::int32_t k, in gemm_f_0_1_4_aligned()
27420 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_0_1_4_aligned()
27422 std::int32_t result_stride) { in gemm_f_0_1_4_aligned()
27423 const std::int32_t row_chunks = m / 3; in gemm_f_0_1_4_aligned()
27424 const std::int32_t col_chunks = n / 3; in gemm_f_0_1_4_aligned()
27425 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_1_4_aligned()
27426 const std::int32_t chunk_size = k * 3; in gemm_f_0_1_4_aligned()
27427 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_1_4_aligned()
27428 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_1_4_aligned()
27432 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_1_4_aligned()
27433 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_1_4_aligned()
27436 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_1_4_aligned()
27438 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_1_4_aligned()
27441 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_1_4_aligned()
27470 const std::uint8_t* rhs, std::int32_t m, in gemm_f_0_1_5_aligned()
27471 std::int32_t n, std::int32_t k, in gemm_f_0_1_5_aligned()
27472 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_0_1_5_aligned()
27474 std::int32_t result_stride) { in gemm_f_0_1_5_aligned()
27475 const std::int32_t row_chunks = m / 3; in gemm_f_0_1_5_aligned()
27476 const std::int32_t col_chunks = n / 3; in gemm_f_0_1_5_aligned()
27477 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_1_5_aligned()
27478 const std::int32_t chunk_size = k * 3; in gemm_f_0_1_5_aligned()
27479 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_1_5_aligned()
27480 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_1_5_aligned()
27484 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_1_5_aligned()
27485 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_1_5_aligned()
27488 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_1_5_aligned()
27490 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_1_5_aligned()
27493 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_1_5_aligned()
27522 const std::uint8_t* rhs, std::int32_t m, in gemm_f_0_1_6_aligned()
27523 std::int32_t n, std::int32_t k, in gemm_f_0_1_6_aligned()
27524 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_0_1_6_aligned()
27526 std::int32_t result_stride) { in gemm_f_0_1_6_aligned()
27527 const std::int32_t row_chunks = m / 3; in gemm_f_0_1_6_aligned()
27528 const std::int32_t col_chunks = n / 3; in gemm_f_0_1_6_aligned()
27529 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_1_6_aligned()
27530 const std::int32_t chunk_size = k * 3; in gemm_f_0_1_6_aligned()
27531 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_1_6_aligned()
27532 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_1_6_aligned()
27536 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_1_6_aligned()
27537 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_1_6_aligned()
27540 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_1_6_aligned()
27542 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_1_6_aligned()
27545 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_1_6_aligned()
27574 const std::uint8_t* rhs, std::int32_t m, in gemm_f_0_1_7_aligned()
27575 std::int32_t n, std::int32_t k, in gemm_f_0_1_7_aligned()
27576 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_0_1_7_aligned()
27578 std::int32_t result_stride) { in gemm_f_0_1_7_aligned()
27579 const std::int32_t row_chunks = m / 3; in gemm_f_0_1_7_aligned()
27580 const std::int32_t col_chunks = n / 3; in gemm_f_0_1_7_aligned()
27581 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_1_7_aligned()
27582 const std::int32_t chunk_size = k * 3; in gemm_f_0_1_7_aligned()
27583 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_1_7_aligned()
27584 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_1_7_aligned()
27588 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_1_7_aligned()
27589 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_1_7_aligned()
27592 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_1_7_aligned()
27594 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_1_7_aligned()
27597 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_1_7_aligned()
27626 const std::uint8_t* rhs, std::int32_t m, in gemm_f_0_2_0_aligned()
27627 std::int32_t n, std::int32_t k, in gemm_f_0_2_0_aligned()
27628 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_0_2_0_aligned()
27630 std::int32_t result_stride) { in gemm_f_0_2_0_aligned()
27631 const std::int32_t row_chunks = m / 3; in gemm_f_0_2_0_aligned()
27632 const std::int32_t col_chunks = n / 3; in gemm_f_0_2_0_aligned()
27633 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_2_0_aligned()
27634 const std::int32_t chunk_size = k * 3; in gemm_f_0_2_0_aligned()
27635 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_2_0_aligned()
27636 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_2_0_aligned()
27640 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_2_0_aligned()
27641 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_2_0_aligned()
27644 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_2_0_aligned()
27646 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_2_0_aligned()
27649 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_2_0_aligned()
27678 const std::uint8_t* rhs, std::int32_t m, in gemm_f_0_2_1_aligned()
27679 std::int32_t n, std::int32_t k, in gemm_f_0_2_1_aligned()
27680 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_0_2_1_aligned()
27682 std::int32_t result_stride) { in gemm_f_0_2_1_aligned()
27683 const std::int32_t row_chunks = m / 3; in gemm_f_0_2_1_aligned()
27684 const std::int32_t col_chunks = n / 3; in gemm_f_0_2_1_aligned()
27685 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_2_1_aligned()
27686 const std::int32_t chunk_size = k * 3; in gemm_f_0_2_1_aligned()
27687 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_2_1_aligned()
27688 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_2_1_aligned()
27692 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_2_1_aligned()
27693 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_2_1_aligned()
27696 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_2_1_aligned()
27698 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_2_1_aligned()
27701 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_2_1_aligned()
27730 const std::uint8_t* rhs, std::int32_t m, in gemm_f_0_2_2_aligned()
27731 std::int32_t n, std::int32_t k, in gemm_f_0_2_2_aligned()
27732 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_0_2_2_aligned()
27734 std::int32_t result_stride) { in gemm_f_0_2_2_aligned()
27735 const std::int32_t row_chunks = m / 3; in gemm_f_0_2_2_aligned()
27736 const std::int32_t col_chunks = n / 3; in gemm_f_0_2_2_aligned()
27737 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_2_2_aligned()
27738 const std::int32_t chunk_size = k * 3; in gemm_f_0_2_2_aligned()
27739 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_2_2_aligned()
27740 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_2_2_aligned()
27744 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_2_2_aligned()
27745 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_2_2_aligned()
27748 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_2_2_aligned()
27750 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_2_2_aligned()
27753 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_2_2_aligned()
27782 const std::uint8_t* rhs, std::int32_t m, in gemm_f_0_2_3_aligned()
27783 std::int32_t n, std::int32_t k, in gemm_f_0_2_3_aligned()
27784 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_0_2_3_aligned()
27786 std::int32_t result_stride) { in gemm_f_0_2_3_aligned()
27787 const std::int32_t row_chunks = m / 3; in gemm_f_0_2_3_aligned()
27788 const std::int32_t col_chunks = n / 3; in gemm_f_0_2_3_aligned()
27789 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_2_3_aligned()
27790 const std::int32_t chunk_size = k * 3; in gemm_f_0_2_3_aligned()
27791 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_2_3_aligned()
27792 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_2_3_aligned()
27796 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_2_3_aligned()
27797 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_2_3_aligned()
27800 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_2_3_aligned()
27802 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_2_3_aligned()
27805 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_2_3_aligned()
27834 const std::uint8_t* rhs, std::int32_t m, in gemm_f_0_2_4_aligned()
27835 std::int32_t n, std::int32_t k, in gemm_f_0_2_4_aligned()
27836 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_0_2_4_aligned()
27838 std::int32_t result_stride) { in gemm_f_0_2_4_aligned()
27839 const std::int32_t row_chunks = m / 3; in gemm_f_0_2_4_aligned()
27840 const std::int32_t col_chunks = n / 3; in gemm_f_0_2_4_aligned()
27841 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_2_4_aligned()
27842 const std::int32_t chunk_size = k * 3; in gemm_f_0_2_4_aligned()
27843 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_2_4_aligned()
27844 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_2_4_aligned()
27848 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_2_4_aligned()
27849 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_2_4_aligned()
27852 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_2_4_aligned()
27854 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_2_4_aligned()
27857 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_2_4_aligned()
27886 const std::uint8_t* rhs, std::int32_t m, in gemm_f_0_2_5_aligned()
27887 std::int32_t n, std::int32_t k, in gemm_f_0_2_5_aligned()
27888 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_0_2_5_aligned()
27890 std::int32_t result_stride) { in gemm_f_0_2_5_aligned()
27891 const std::int32_t row_chunks = m / 3; in gemm_f_0_2_5_aligned()
27892 const std::int32_t col_chunks = n / 3; in gemm_f_0_2_5_aligned()
27893 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_2_5_aligned()
27894 const std::int32_t chunk_size = k * 3; in gemm_f_0_2_5_aligned()
27895 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_2_5_aligned()
27896 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_2_5_aligned()
27900 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_2_5_aligned()
27901 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_2_5_aligned()
27904 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_2_5_aligned()
27906 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_2_5_aligned()
27909 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_2_5_aligned()
27938 const std::uint8_t* rhs, std::int32_t m, in gemm_f_0_2_6_aligned()
27939 std::int32_t n, std::int32_t k, in gemm_f_0_2_6_aligned()
27940 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_0_2_6_aligned()
27942 std::int32_t result_stride) { in gemm_f_0_2_6_aligned()
27943 const std::int32_t row_chunks = m / 3; in gemm_f_0_2_6_aligned()
27944 const std::int32_t col_chunks = n / 3; in gemm_f_0_2_6_aligned()
27945 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_2_6_aligned()
27946 const std::int32_t chunk_size = k * 3; in gemm_f_0_2_6_aligned()
27947 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_2_6_aligned()
27948 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_2_6_aligned()
27952 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_2_6_aligned()
27953 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_2_6_aligned()
27956 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_2_6_aligned()
27958 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_2_6_aligned()
27961 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_2_6_aligned()
27990 const std::uint8_t* rhs, std::int32_t m, in gemm_f_0_2_7_aligned()
27991 std::int32_t n, std::int32_t k, in gemm_f_0_2_7_aligned()
27992 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_0_2_7_aligned()
27994 std::int32_t result_stride) { in gemm_f_0_2_7_aligned()
27995 const std::int32_t row_chunks = m / 3; in gemm_f_0_2_7_aligned()
27996 const std::int32_t col_chunks = n / 3; in gemm_f_0_2_7_aligned()
27997 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_2_7_aligned()
27998 const std::int32_t chunk_size = k * 3; in gemm_f_0_2_7_aligned()
27999 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_2_7_aligned()
28000 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_2_7_aligned()
28004 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_2_7_aligned()
28005 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_2_7_aligned()
28008 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_2_7_aligned()
28010 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_2_7_aligned()
28013 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_2_7_aligned()
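
From this point the listing moves from the gemm_f_0_*_* variants to gemm_f_1_*_* and later gemm_f_2_*_*. Judging only from the names and the /3 and /8 arithmetic above, the three suffix digits look like selectors for m % 3 leftover rows, n % 3 leftover columns and k % 8 leftover depth; that reading is an inference from the naming pattern, not something stated in the listing. A purely hypothetical dispatcher illustrating it:

// Hypothetical dispatcher sketch, assuming the suffix digits encode
// m % 3, n % 3 and k % 8. The dispatcher itself is invented here.
#include <cstdint>
#include <cstdio>

int main() {
  const std::int32_t m = 10, n = 11, k = 15;
  std::printf("would call gemm_f_%d_%d_%d_aligned\n",
              static_cast<int>(m % 3), static_cast<int>(n % 3),
              static_cast<int>(k % 8));
  return 0;
}
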
28042 const std::uint8_t* rhs, std::int32_t m, in gemm_f_1_0_0_aligned()
28043 std::int32_t n, std::int32_t k, in gemm_f_1_0_0_aligned()
28044 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_1_0_0_aligned()
28046 std::int32_t result_stride) { in gemm_f_1_0_0_aligned()
28047 const std::int32_t row_chunks = m / 3; in gemm_f_1_0_0_aligned()
28048 const std::int32_t col_chunks = n / 3; in gemm_f_1_0_0_aligned()
28049 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_0_0_aligned()
28050 const std::int32_t chunk_size = k * 3; in gemm_f_1_0_0_aligned()
28051 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_0_0_aligned()
28052 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_0_0_aligned()
28056 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_0_0_aligned()
28057 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_0_0_aligned()
28058 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_0_0_aligned()
28059 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_0_0_aligned()
28062 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_0_0_aligned()
28064 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_0_0_aligned()
28067 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_0_0_aligned()
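
The gemm_f_1_*_* variants add a second aggregate pointer, zipped_lhs_1_offsets, alongside zipped_lhs_3_offsets. The casts suggest the packed-LHS scratch stores per-row std::int32_t aggregates immediately after the packed rows, so a full 3-row chunk finds them at byte offset padded_k * 3 while a trailing 1-row chunk keeps them right after its single packed row at padded_k * 1. A small sketch of that pointer arithmetic, with the buffer size and the placeholder writes invented for illustration:

// Hedged sketch of the pointer arithmetic above; the reinterpret_casts simply
// mirror the listing's own casts. Buffer and writes are illustrative only.
#include <cstdint>
#include <vector>

int main() {
  const std::int32_t k = 13;
  const std::int32_t padded_k = ((k + 7) / 8) * 8;
  std::vector<std::uint8_t> scratch((padded_k + 16) * 3);
  std::uint8_t* zipped_lhs = scratch.data();

  // Aggregates for a full 3-row chunk start after 3 packed rows...
  std::int32_t* zipped_lhs_3_offsets =
      reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3);
  // ...while a final 1-row chunk keeps them after its single packed row.
  std::int32_t* zipped_lhs_1_offsets =
      reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1);

  zipped_lhs_3_offsets[0] = 0;  // placeholder writes, just to exercise the pointers
  zipped_lhs_1_offsets[0] = 0;
  return 0;
}
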
28103 const std::uint8_t* rhs, std::int32_t m, in gemm_f_1_0_1_aligned()
28104 std::int32_t n, std::int32_t k, in gemm_f_1_0_1_aligned()
28105 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_1_0_1_aligned()
28107 std::int32_t result_stride) { in gemm_f_1_0_1_aligned()
28108 const std::int32_t row_chunks = m / 3; in gemm_f_1_0_1_aligned()
28109 const std::int32_t col_chunks = n / 3; in gemm_f_1_0_1_aligned()
28110 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_0_1_aligned()
28111 const std::int32_t chunk_size = k * 3; in gemm_f_1_0_1_aligned()
28112 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_0_1_aligned()
28113 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_0_1_aligned()
28117 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_0_1_aligned()
28118 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_0_1_aligned()
28119 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_0_1_aligned()
28120 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_0_1_aligned()
28123 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_0_1_aligned()
28125 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_0_1_aligned()
28128 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_0_1_aligned()
28164 const std::uint8_t* rhs, std::int32_t m, in gemm_f_1_0_2_aligned()
28165 std::int32_t n, std::int32_t k, in gemm_f_1_0_2_aligned()
28166 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_1_0_2_aligned()
28168 std::int32_t result_stride) { in gemm_f_1_0_2_aligned()
28169 const std::int32_t row_chunks = m / 3; in gemm_f_1_0_2_aligned()
28170 const std::int32_t col_chunks = n / 3; in gemm_f_1_0_2_aligned()
28171 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_0_2_aligned()
28172 const std::int32_t chunk_size = k * 3; in gemm_f_1_0_2_aligned()
28173 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_0_2_aligned()
28174 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_0_2_aligned()
28178 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_0_2_aligned()
28179 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_0_2_aligned()
28180 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_0_2_aligned()
28181 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_0_2_aligned()
28184 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_0_2_aligned()
28186 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_0_2_aligned()
28189 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_0_2_aligned()
28225 const std::uint8_t* rhs, std::int32_t m, in gemm_f_1_0_3_aligned()
28226 std::int32_t n, std::int32_t k, in gemm_f_1_0_3_aligned()
28227 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_1_0_3_aligned()
28229 std::int32_t result_stride) { in gemm_f_1_0_3_aligned()
28230 const std::int32_t row_chunks = m / 3; in gemm_f_1_0_3_aligned()
28231 const std::int32_t col_chunks = n / 3; in gemm_f_1_0_3_aligned()
28232 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_0_3_aligned()
28233 const std::int32_t chunk_size = k * 3; in gemm_f_1_0_3_aligned()
28234 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_0_3_aligned()
28235 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_0_3_aligned()
28239 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_0_3_aligned()
28240 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_0_3_aligned()
28241 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_0_3_aligned()
28242 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_0_3_aligned()
28245 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_0_3_aligned()
28247 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_0_3_aligned()
28250 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_0_3_aligned()
28286 const std::uint8_t* rhs, std::int32_t m, in gemm_f_1_0_4_aligned()
28287 std::int32_t n, std::int32_t k, in gemm_f_1_0_4_aligned()
28288 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_1_0_4_aligned()
28290 std::int32_t result_stride) { in gemm_f_1_0_4_aligned()
28291 const std::int32_t row_chunks = m / 3; in gemm_f_1_0_4_aligned()
28292 const std::int32_t col_chunks = n / 3; in gemm_f_1_0_4_aligned()
28293 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_0_4_aligned()
28294 const std::int32_t chunk_size = k * 3; in gemm_f_1_0_4_aligned()
28295 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_0_4_aligned()
28296 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_0_4_aligned()
28300 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_0_4_aligned()
28301 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_0_4_aligned()
28302 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_0_4_aligned()
28303 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_0_4_aligned()
28306 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_0_4_aligned()
28308 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_0_4_aligned()
28311 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_0_4_aligned()
28347 const std::uint8_t* rhs, std::int32_t m, in gemm_f_1_0_5_aligned()
28348 std::int32_t n, std::int32_t k, in gemm_f_1_0_5_aligned()
28349 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_1_0_5_aligned()
28351 std::int32_t result_stride) { in gemm_f_1_0_5_aligned()
28352 const std::int32_t row_chunks = m / 3; in gemm_f_1_0_5_aligned()
28353 const std::int32_t col_chunks = n / 3; in gemm_f_1_0_5_aligned()
28354 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_0_5_aligned()
28355 const std::int32_t chunk_size = k * 3; in gemm_f_1_0_5_aligned()
28356 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_0_5_aligned()
28357 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_0_5_aligned()
28361 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_0_5_aligned()
28362 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_0_5_aligned()
28363 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_0_5_aligned()
28364 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_0_5_aligned()
28367 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_0_5_aligned()
28369 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_0_5_aligned()
28372 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_0_5_aligned()
28408 const std::uint8_t* rhs, std::int32_t m, in gemm_f_1_0_6_aligned()
28409 std::int32_t n, std::int32_t k, in gemm_f_1_0_6_aligned()
28410 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_1_0_6_aligned()
28412 std::int32_t result_stride) { in gemm_f_1_0_6_aligned()
28413 const std::int32_t row_chunks = m / 3; in gemm_f_1_0_6_aligned()
28414 const std::int32_t col_chunks = n / 3; in gemm_f_1_0_6_aligned()
28415 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_0_6_aligned()
28416 const std::int32_t chunk_size = k * 3; in gemm_f_1_0_6_aligned()
28417 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_0_6_aligned()
28418 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_0_6_aligned()
28422 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_0_6_aligned()
28423 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_0_6_aligned()
28424 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_0_6_aligned()
28425 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_0_6_aligned()
28428 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_0_6_aligned()
28430 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_0_6_aligned()
28433 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_0_6_aligned()
28469 const std::uint8_t* rhs, std::int32_t m, in gemm_f_1_0_7_aligned()
28470 std::int32_t n, std::int32_t k, in gemm_f_1_0_7_aligned()
28471 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_1_0_7_aligned()
28473 std::int32_t result_stride) { in gemm_f_1_0_7_aligned()
28474 const std::int32_t row_chunks = m / 3; in gemm_f_1_0_7_aligned()
28475 const std::int32_t col_chunks = n / 3; in gemm_f_1_0_7_aligned()
28476 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_0_7_aligned()
28477 const std::int32_t chunk_size = k * 3; in gemm_f_1_0_7_aligned()
28478 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_0_7_aligned()
28479 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_0_7_aligned()
28483 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_0_7_aligned()
28484 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_0_7_aligned()
28485 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_0_7_aligned()
28486 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_0_7_aligned()
28489 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_0_7_aligned()
28491 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_0_7_aligned()
28494 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_0_7_aligned()
28530 const std::uint8_t* rhs, std::int32_t m, in gemm_f_1_1_0_aligned()
28531 std::int32_t n, std::int32_t k, in gemm_f_1_1_0_aligned()
28532 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_1_1_0_aligned()
28534 std::int32_t result_stride) { in gemm_f_1_1_0_aligned()
28535 const std::int32_t row_chunks = m / 3; in gemm_f_1_1_0_aligned()
28536 const std::int32_t col_chunks = n / 3; in gemm_f_1_1_0_aligned()
28537 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_1_0_aligned()
28538 const std::int32_t chunk_size = k * 3; in gemm_f_1_1_0_aligned()
28539 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_1_0_aligned()
28540 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_1_0_aligned()
28544 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_1_0_aligned()
28545 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_1_0_aligned()
28546 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_1_0_aligned()
28547 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_1_0_aligned()
28550 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_1_0_aligned()
28552 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_1_0_aligned()
28555 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_1_0_aligned()
28598 const std::uint8_t* rhs, std::int32_t m, in gemm_f_1_1_1_aligned()
28599 std::int32_t n, std::int32_t k, in gemm_f_1_1_1_aligned()
28600 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_1_1_1_aligned()
28602 std::int32_t result_stride) { in gemm_f_1_1_1_aligned()
28603 const std::int32_t row_chunks = m / 3; in gemm_f_1_1_1_aligned()
28604 const std::int32_t col_chunks = n / 3; in gemm_f_1_1_1_aligned()
28605 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_1_1_aligned()
28606 const std::int32_t chunk_size = k * 3; in gemm_f_1_1_1_aligned()
28607 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_1_1_aligned()
28608 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_1_1_aligned()
28612 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_1_1_aligned()
28613 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_1_1_aligned()
28614 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_1_1_aligned()
28615 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_1_1_aligned()
28618 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_1_1_aligned()
28620 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_1_1_aligned()
28623 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_1_1_aligned()
28666 const std::uint8_t* rhs, std::int32_t m, in gemm_f_1_1_2_aligned()
28667 std::int32_t n, std::int32_t k, in gemm_f_1_1_2_aligned()
28668 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_1_1_2_aligned()
28670 std::int32_t result_stride) { in gemm_f_1_1_2_aligned()
28671 const std::int32_t row_chunks = m / 3; in gemm_f_1_1_2_aligned()
28672 const std::int32_t col_chunks = n / 3; in gemm_f_1_1_2_aligned()
28673 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_1_2_aligned()
28674 const std::int32_t chunk_size = k * 3; in gemm_f_1_1_2_aligned()
28675 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_1_2_aligned()
28676 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_1_2_aligned()
28680 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_1_2_aligned()
28681 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_1_2_aligned()
28682 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_1_2_aligned()
28683 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_1_2_aligned()
28686 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_1_2_aligned()
28688 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_1_2_aligned()
28691 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_1_2_aligned()
28734 const std::uint8_t* rhs, std::int32_t m, in gemm_f_1_1_3_aligned()
28735 std::int32_t n, std::int32_t k, in gemm_f_1_1_3_aligned()
28736 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_1_1_3_aligned()
28738 std::int32_t result_stride) { in gemm_f_1_1_3_aligned()
28739 const std::int32_t row_chunks = m / 3; in gemm_f_1_1_3_aligned()
28740 const std::int32_t col_chunks = n / 3; in gemm_f_1_1_3_aligned()
28741 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_1_3_aligned()
28742 const std::int32_t chunk_size = k * 3; in gemm_f_1_1_3_aligned()
28743 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_1_3_aligned()
28744 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_1_3_aligned()
28748 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_1_3_aligned()
28749 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_1_3_aligned()
28750 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_1_3_aligned()
28751 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_1_3_aligned()
28754 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_1_3_aligned()
28756 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_1_3_aligned()
28759 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_1_3_aligned()
28802 const std::uint8_t* rhs, std::int32_t m, in gemm_f_1_1_4_aligned()
28803 std::int32_t n, std::int32_t k, in gemm_f_1_1_4_aligned()
28804 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_1_1_4_aligned()
28806 std::int32_t result_stride) { in gemm_f_1_1_4_aligned()
28807 const std::int32_t row_chunks = m / 3; in gemm_f_1_1_4_aligned()
28808 const std::int32_t col_chunks = n / 3; in gemm_f_1_1_4_aligned()
28809 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_1_4_aligned()
28810 const std::int32_t chunk_size = k * 3; in gemm_f_1_1_4_aligned()
28811 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_1_4_aligned()
28812 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_1_4_aligned()
28816 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_1_4_aligned()
28817 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_1_4_aligned()
28818 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_1_4_aligned()
28819 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_1_4_aligned()
28822 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_1_4_aligned()
28824 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_1_4_aligned()
28827 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_1_4_aligned()
28870 const std::uint8_t* rhs, std::int32_t m, in gemm_f_1_1_5_aligned()
28871 std::int32_t n, std::int32_t k, in gemm_f_1_1_5_aligned()
28872 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_1_1_5_aligned()
28874 std::int32_t result_stride) { in gemm_f_1_1_5_aligned()
28875 const std::int32_t row_chunks = m / 3; in gemm_f_1_1_5_aligned()
28876 const std::int32_t col_chunks = n / 3; in gemm_f_1_1_5_aligned()
28877 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_1_5_aligned()
28878 const std::int32_t chunk_size = k * 3; in gemm_f_1_1_5_aligned()
28879 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_1_5_aligned()
28880 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_1_5_aligned()
28884 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_1_5_aligned()
28885 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_1_5_aligned()
28886 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_1_5_aligned()
28887 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_1_5_aligned()
28890 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_1_5_aligned()
28892 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_1_5_aligned()
28895 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_1_5_aligned()
28938 const std::uint8_t* rhs, std::int32_t m, in gemm_f_1_1_6_aligned()
28939 std::int32_t n, std::int32_t k, in gemm_f_1_1_6_aligned()
28940 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_1_1_6_aligned()
28942 std::int32_t result_stride) { in gemm_f_1_1_6_aligned()
28943 const std::int32_t row_chunks = m / 3; in gemm_f_1_1_6_aligned()
28944 const std::int32_t col_chunks = n / 3; in gemm_f_1_1_6_aligned()
28945 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_1_6_aligned()
28946 const std::int32_t chunk_size = k * 3; in gemm_f_1_1_6_aligned()
28947 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_1_6_aligned()
28948 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_1_6_aligned()
28952 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_1_6_aligned()
28953 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_1_6_aligned()
28954 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_1_6_aligned()
28955 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_1_6_aligned()
28958 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_1_6_aligned()
28960 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_1_6_aligned()
28963 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_1_6_aligned()
29006 const std::uint8_t* rhs, std::int32_t m, in gemm_f_1_1_7_aligned()
29007 std::int32_t n, std::int32_t k, in gemm_f_1_1_7_aligned()
29008 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_1_1_7_aligned()
29010 std::int32_t result_stride) { in gemm_f_1_1_7_aligned()
29011 const std::int32_t row_chunks = m / 3; in gemm_f_1_1_7_aligned()
29012 const std::int32_t col_chunks = n / 3; in gemm_f_1_1_7_aligned()
29013 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_1_7_aligned()
29014 const std::int32_t chunk_size = k * 3; in gemm_f_1_1_7_aligned()
29015 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_1_7_aligned()
29016 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_1_7_aligned()
29020 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_1_7_aligned()
29021 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_1_7_aligned()
29022 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_1_7_aligned()
29023 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_1_7_aligned()
29026 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_1_7_aligned()
29028 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_1_7_aligned()
29031 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_1_7_aligned()
29074 const std::uint8_t* rhs, std::int32_t m, in gemm_f_1_2_0_aligned()
29075 std::int32_t n, std::int32_t k, in gemm_f_1_2_0_aligned()
29076 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_1_2_0_aligned()
29078 std::int32_t result_stride) { in gemm_f_1_2_0_aligned()
29079 const std::int32_t row_chunks = m / 3; in gemm_f_1_2_0_aligned()
29080 const std::int32_t col_chunks = n / 3; in gemm_f_1_2_0_aligned()
29081 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_2_0_aligned()
29082 const std::int32_t chunk_size = k * 3; in gemm_f_1_2_0_aligned()
29083 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_2_0_aligned()
29084 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_2_0_aligned()
29088 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_2_0_aligned()
29089 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_2_0_aligned()
29090 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_2_0_aligned()
29091 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_2_0_aligned()
29094 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_2_0_aligned()
29096 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_2_0_aligned()
29099 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_2_0_aligned()
29142 const std::uint8_t* rhs, std::int32_t m, in gemm_f_1_2_1_aligned()
29143 std::int32_t n, std::int32_t k, in gemm_f_1_2_1_aligned()
29144 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_1_2_1_aligned()
29146 std::int32_t result_stride) { in gemm_f_1_2_1_aligned()
29147 const std::int32_t row_chunks = m / 3; in gemm_f_1_2_1_aligned()
29148 const std::int32_t col_chunks = n / 3; in gemm_f_1_2_1_aligned()
29149 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_2_1_aligned()
29150 const std::int32_t chunk_size = k * 3; in gemm_f_1_2_1_aligned()
29151 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_2_1_aligned()
29152 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_2_1_aligned()
29156 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_2_1_aligned()
29157 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_2_1_aligned()
29158 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_2_1_aligned()
29159 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_2_1_aligned()
29162 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_2_1_aligned()
29164 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_2_1_aligned()
29167 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_2_1_aligned()
29210 const std::uint8_t* rhs, std::int32_t m, in gemm_f_1_2_2_aligned()
29211 std::int32_t n, std::int32_t k, in gemm_f_1_2_2_aligned()
29212 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_1_2_2_aligned()
29214 std::int32_t result_stride) { in gemm_f_1_2_2_aligned()
29215 const std::int32_t row_chunks = m / 3; in gemm_f_1_2_2_aligned()
29216 const std::int32_t col_chunks = n / 3; in gemm_f_1_2_2_aligned()
29217 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_2_2_aligned()
29218 const std::int32_t chunk_size = k * 3; in gemm_f_1_2_2_aligned()
29219 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_2_2_aligned()
29220 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_2_2_aligned()
29224 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_2_2_aligned()
29225 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_2_2_aligned()
29226 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_2_2_aligned()
29227 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_2_2_aligned()
29230 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_2_2_aligned()
29232 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_2_2_aligned()
29235 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_2_2_aligned()
29278 const std::uint8_t* rhs, std::int32_t m, in gemm_f_1_2_3_aligned()
29279 std::int32_t n, std::int32_t k, in gemm_f_1_2_3_aligned()
29280 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_1_2_3_aligned()
29282 std::int32_t result_stride) { in gemm_f_1_2_3_aligned()
29283 const std::int32_t row_chunks = m / 3; in gemm_f_1_2_3_aligned()
29284 const std::int32_t col_chunks = n / 3; in gemm_f_1_2_3_aligned()
29285 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_2_3_aligned()
29286 const std::int32_t chunk_size = k * 3; in gemm_f_1_2_3_aligned()
29287 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_2_3_aligned()
29288 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_2_3_aligned()
29292 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_2_3_aligned()
29293 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_2_3_aligned()
29294 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_2_3_aligned()
29295 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_2_3_aligned()
29298 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_2_3_aligned()
29300 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_2_3_aligned()
29303 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_2_3_aligned()
29346 const std::uint8_t* rhs, std::int32_t m, in gemm_f_1_2_4_aligned()
29347 std::int32_t n, std::int32_t k, in gemm_f_1_2_4_aligned()
29348 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_1_2_4_aligned()
29350 std::int32_t result_stride) { in gemm_f_1_2_4_aligned()
29351 const std::int32_t row_chunks = m / 3; in gemm_f_1_2_4_aligned()
29352 const std::int32_t col_chunks = n / 3; in gemm_f_1_2_4_aligned()
29353 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_2_4_aligned()
29354 const std::int32_t chunk_size = k * 3; in gemm_f_1_2_4_aligned()
29355 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_2_4_aligned()
29356 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_2_4_aligned()
29360 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_2_4_aligned()
29361 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_2_4_aligned()
29362 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_2_4_aligned()
29363 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_2_4_aligned()
29366 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_2_4_aligned()
29368 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_2_4_aligned()
29371 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_2_4_aligned()
29414 const std::uint8_t* rhs, std::int32_t m, in gemm_f_1_2_5_aligned()
29415 std::int32_t n, std::int32_t k, in gemm_f_1_2_5_aligned()
29416 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_1_2_5_aligned()
29418 std::int32_t result_stride) { in gemm_f_1_2_5_aligned()
29419 const std::int32_t row_chunks = m / 3; in gemm_f_1_2_5_aligned()
29420 const std::int32_t col_chunks = n / 3; in gemm_f_1_2_5_aligned()
29421 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_2_5_aligned()
29422 const std::int32_t chunk_size = k * 3; in gemm_f_1_2_5_aligned()
29423 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_2_5_aligned()
29424 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_2_5_aligned()
29428 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_2_5_aligned()
29429 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_2_5_aligned()
29430 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_2_5_aligned()
29431 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_2_5_aligned()
29434 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_2_5_aligned()
29436 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_2_5_aligned()
29439 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_2_5_aligned()
29482 const std::uint8_t* rhs, std::int32_t m, in gemm_f_1_2_6_aligned()
29483 std::int32_t n, std::int32_t k, in gemm_f_1_2_6_aligned()
29484 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_1_2_6_aligned()
29486 std::int32_t result_stride) { in gemm_f_1_2_6_aligned()
29487 const std::int32_t row_chunks = m / 3; in gemm_f_1_2_6_aligned()
29488 const std::int32_t col_chunks = n / 3; in gemm_f_1_2_6_aligned()
29489 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_2_6_aligned()
29490 const std::int32_t chunk_size = k * 3; in gemm_f_1_2_6_aligned()
29491 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_2_6_aligned()
29492 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_2_6_aligned()
29496 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_2_6_aligned()
29497 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_2_6_aligned()
29498 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_2_6_aligned()
29499 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_2_6_aligned()
29502 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_2_6_aligned()
29504 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_2_6_aligned()
29507 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_2_6_aligned()
29550 const std::uint8_t* rhs, std::int32_t m, in gemm_f_1_2_7_aligned()
29551 std::int32_t n, std::int32_t k, in gemm_f_1_2_7_aligned()
29552 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_1_2_7_aligned()
29554 std::int32_t result_stride) { in gemm_f_1_2_7_aligned()
29555 const std::int32_t row_chunks = m / 3; in gemm_f_1_2_7_aligned()
29556 const std::int32_t col_chunks = n / 3; in gemm_f_1_2_7_aligned()
29557 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_2_7_aligned()
29558 const std::int32_t chunk_size = k * 3; in gemm_f_1_2_7_aligned()
29559 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_2_7_aligned()
29560 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_2_7_aligned()
29564 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_2_7_aligned()
29565 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_2_7_aligned()
29566 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_2_7_aligned()
29567 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_2_7_aligned()
29570 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_2_7_aligned()
29572 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_2_7_aligned()
29575 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_2_7_aligned()
29618 const std::uint8_t* rhs, std::int32_t m, in gemm_f_2_0_0_aligned()
29619 std::int32_t n, std::int32_t k, in gemm_f_2_0_0_aligned()
29620 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_2_0_0_aligned()
29622 std::int32_t result_stride) { in gemm_f_2_0_0_aligned()
29623 const std::int32_t row_chunks = m / 3; in gemm_f_2_0_0_aligned()
29624 const std::int32_t col_chunks = n / 3; in gemm_f_2_0_0_aligned()
29625 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_0_0_aligned()
29626 const std::int32_t chunk_size = k * 3; in gemm_f_2_0_0_aligned()
29627 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_0_0_aligned()
29628 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_0_0_aligned()
29632 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_0_0_aligned()
29633 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_0_0_aligned()
29634 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_0_0_aligned()
29635 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_0_0_aligned()
29638 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_0_0_aligned()
29640 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_0_0_aligned()
29643 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_0_0_aligned()
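
The gemm_f_2_*_* variants pair zipped_lhs_3_offsets with zipped_lhs_2_offsets in the same way, placing the aggregates after two packed rows. The two result strides are also worth separating: result_chunk_stride = result_stride * 3 advances the output by three rows per chunk, measured in elements, while mul_result_chunk_stride_bytes = result_stride * 4 reads as the same row stride expressed in bytes, assuming 4-byte std::int32_t results. A sketch under those assumptions, with the result buffer invented for illustration:

// Hedged sketch: the two result strides interpreted as "elements per 3-row
// chunk" and "bytes per result row". Assumes std::int32_t (4-byte) results.
#include <cstdint>
#include <vector>

int main() {
  const std::int32_t n = 11;
  const std::int32_t result_stride = n;                        // elements per result row
  const std::int32_t result_chunk_stride = result_stride * 3;  // elements skipped per 3-row chunk
  const std::int32_t mul_result_chunk_stride_bytes =
      result_stride * 4;                                       // row stride in bytes

  std::vector<std::int32_t> result(result_stride * 6, 0);
  std::int32_t* chunk0 = result.data();
  std::int32_t* chunk1 = chunk0 + result_chunk_stride;         // start of the next 3-row band
  chunk1[0] = 1;                                               // touch it so the pointer is used
  static_cast<void>(mul_result_chunk_stride_bytes);
  return 0;
}
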
29679 const std::uint8_t* rhs, std::int32_t m, in gemm_f_2_0_1_aligned()
29680 std::int32_t n, std::int32_t k, in gemm_f_2_0_1_aligned()
29681 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_2_0_1_aligned()
29683 std::int32_t result_stride) { in gemm_f_2_0_1_aligned()
29684 const std::int32_t row_chunks = m / 3; in gemm_f_2_0_1_aligned()
29685 const std::int32_t col_chunks = n / 3; in gemm_f_2_0_1_aligned()
29686 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_0_1_aligned()
29687 const std::int32_t chunk_size = k * 3; in gemm_f_2_0_1_aligned()
29688 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_0_1_aligned()
29689 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_0_1_aligned()
29693 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_0_1_aligned()
29694 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_0_1_aligned()
29695 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_0_1_aligned()
29696 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_0_1_aligned()
29699 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_0_1_aligned()
29701 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_0_1_aligned()
29704 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_0_1_aligned()
29740 const std::uint8_t* rhs, std::int32_t m, in gemm_f_2_0_2_aligned()
29741 std::int32_t n, std::int32_t k, in gemm_f_2_0_2_aligned()
29742 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_2_0_2_aligned()
29744 std::int32_t result_stride) { in gemm_f_2_0_2_aligned()
29745 const std::int32_t row_chunks = m / 3; in gemm_f_2_0_2_aligned()
29746 const std::int32_t col_chunks = n / 3; in gemm_f_2_0_2_aligned()
29747 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_0_2_aligned()
29748 const std::int32_t chunk_size = k * 3; in gemm_f_2_0_2_aligned()
29749 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_0_2_aligned()
29750 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_0_2_aligned()
29754 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_0_2_aligned()
29755 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_0_2_aligned()
29756 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_0_2_aligned()
29757 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_0_2_aligned()
29760 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_0_2_aligned()
29762 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_0_2_aligned()
29765 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_0_2_aligned()
29801 const std::uint8_t* rhs, std::int32_t m, in gemm_f_2_0_3_aligned()
29802 std::int32_t n, std::int32_t k, in gemm_f_2_0_3_aligned()
29803 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_2_0_3_aligned()
29805 std::int32_t result_stride) { in gemm_f_2_0_3_aligned()
29806 const std::int32_t row_chunks = m / 3; in gemm_f_2_0_3_aligned()
29807 const std::int32_t col_chunks = n / 3; in gemm_f_2_0_3_aligned()
29808 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_0_3_aligned()
29809 const std::int32_t chunk_size = k * 3; in gemm_f_2_0_3_aligned()
29810 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_0_3_aligned()
29811 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_0_3_aligned()
29815 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_0_3_aligned()
29816 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_0_3_aligned()
29817 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_0_3_aligned()
29818 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_0_3_aligned()
29821 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_0_3_aligned()
29823 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_0_3_aligned()
29826 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_0_3_aligned()
29862 const std::uint8_t* rhs, std::int32_t m, in gemm_f_2_0_4_aligned()
29863 std::int32_t n, std::int32_t k, in gemm_f_2_0_4_aligned()
29864 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_2_0_4_aligned()
29866 std::int32_t result_stride) { in gemm_f_2_0_4_aligned()
29867 const std::int32_t row_chunks = m / 3; in gemm_f_2_0_4_aligned()
29868 const std::int32_t col_chunks = n / 3; in gemm_f_2_0_4_aligned()
29869 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_0_4_aligned()
29870 const std::int32_t chunk_size = k * 3; in gemm_f_2_0_4_aligned()
29871 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_0_4_aligned()
29872 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_0_4_aligned()
29876 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_0_4_aligned()
29877 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_0_4_aligned()
29878 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_0_4_aligned()
29879 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_0_4_aligned()
29882 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_0_4_aligned()
29884 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_0_4_aligned()
29887 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_0_4_aligned()
29923 const std::uint8_t* rhs, std::int32_t m, in gemm_f_2_0_5_aligned()
29924 std::int32_t n, std::int32_t k, in gemm_f_2_0_5_aligned()
29925 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_2_0_5_aligned()
29927 std::int32_t result_stride) { in gemm_f_2_0_5_aligned()
29928 const std::int32_t row_chunks = m / 3; in gemm_f_2_0_5_aligned()
29929 const std::int32_t col_chunks = n / 3; in gemm_f_2_0_5_aligned()
29930 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_0_5_aligned()
29931 const std::int32_t chunk_size = k * 3; in gemm_f_2_0_5_aligned()
29932 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_0_5_aligned()
29933 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_0_5_aligned()
29937 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_0_5_aligned()
29938 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_0_5_aligned()
29939 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_0_5_aligned()
29940 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_0_5_aligned()
29943 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_0_5_aligned()
29945 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_0_5_aligned()
29948 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_0_5_aligned()
29984 const std::uint8_t* rhs, std::int32_t m, in gemm_f_2_0_6_aligned()
29985 std::int32_t n, std::int32_t k, in gemm_f_2_0_6_aligned()
29986 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_2_0_6_aligned()
29988 std::int32_t result_stride) { in gemm_f_2_0_6_aligned()
29989 const std::int32_t row_chunks = m / 3; in gemm_f_2_0_6_aligned()
29990 const std::int32_t col_chunks = n / 3; in gemm_f_2_0_6_aligned()
29991 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_0_6_aligned()
29992 const std::int32_t chunk_size = k * 3; in gemm_f_2_0_6_aligned()
29993 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_0_6_aligned()
29994 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_0_6_aligned()
29998 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_0_6_aligned()
29999 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_0_6_aligned()
30000 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_0_6_aligned()
30001 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_0_6_aligned()
30004 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_0_6_aligned()
30006 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_0_6_aligned()
30009 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_0_6_aligned()
30045 const std::uint8_t* rhs, std::int32_t m, in gemm_f_2_0_7_aligned()
30046 std::int32_t n, std::int32_t k, in gemm_f_2_0_7_aligned()
30047 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_2_0_7_aligned()
30049 std::int32_t result_stride) { in gemm_f_2_0_7_aligned()
30050 const std::int32_t row_chunks = m / 3; in gemm_f_2_0_7_aligned()
30051 const std::int32_t col_chunks = n / 3; in gemm_f_2_0_7_aligned()
30052 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_0_7_aligned()
30053 const std::int32_t chunk_size = k * 3; in gemm_f_2_0_7_aligned()
30054 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_0_7_aligned()
30055 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_0_7_aligned()
30059 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_0_7_aligned()
30060 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_0_7_aligned()
30061 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_0_7_aligned()
30062 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_0_7_aligned()
30065 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_0_7_aligned()
30067 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_0_7_aligned()
30070 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_0_7_aligned()
30106 const std::uint8_t* rhs, std::int32_t m, in gemm_f_2_1_0_aligned()
30107 std::int32_t n, std::int32_t k, in gemm_f_2_1_0_aligned()
30108 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_2_1_0_aligned()
30110 std::int32_t result_stride) { in gemm_f_2_1_0_aligned()
30111 const std::int32_t row_chunks = m / 3; in gemm_f_2_1_0_aligned()
30112 const std::int32_t col_chunks = n / 3; in gemm_f_2_1_0_aligned()
30113 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_1_0_aligned()
30114 const std::int32_t chunk_size = k * 3; in gemm_f_2_1_0_aligned()
30115 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_1_0_aligned()
30116 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_1_0_aligned()
30120 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_1_0_aligned()
30121 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_1_0_aligned()
30122 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_1_0_aligned()
30123 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_1_0_aligned()
30126 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_1_0_aligned()
30128 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_1_0_aligned()
30131 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_1_0_aligned()
30174 const std::uint8_t* rhs, std::int32_t m, in gemm_f_2_1_1_aligned()
30175 std::int32_t n, std::int32_t k, in gemm_f_2_1_1_aligned()
30176 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_2_1_1_aligned()
30178 std::int32_t result_stride) { in gemm_f_2_1_1_aligned()
30179 const std::int32_t row_chunks = m / 3; in gemm_f_2_1_1_aligned()
30180 const std::int32_t col_chunks = n / 3; in gemm_f_2_1_1_aligned()
30181 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_1_1_aligned()
30182 const std::int32_t chunk_size = k * 3; in gemm_f_2_1_1_aligned()
30183 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_1_1_aligned()
30184 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_1_1_aligned()
30188 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_1_1_aligned()
30189 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_1_1_aligned()
30190 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_1_1_aligned()
30191 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_1_1_aligned()
30194 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_1_1_aligned()
30196 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_1_1_aligned()
30199 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_1_1_aligned()
30242 const std::uint8_t* rhs, std::int32_t m, in gemm_f_2_1_2_aligned()
30243 std::int32_t n, std::int32_t k, in gemm_f_2_1_2_aligned()
30244 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_2_1_2_aligned()
30246 std::int32_t result_stride) { in gemm_f_2_1_2_aligned()
30247 const std::int32_t row_chunks = m / 3; in gemm_f_2_1_2_aligned()
30248 const std::int32_t col_chunks = n / 3; in gemm_f_2_1_2_aligned()
30249 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_1_2_aligned()
30250 const std::int32_t chunk_size = k * 3; in gemm_f_2_1_2_aligned()
30251 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_1_2_aligned()
30252 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_1_2_aligned()
30256 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_1_2_aligned()
30257 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_1_2_aligned()
30258 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_1_2_aligned()
30259 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_1_2_aligned()
30262 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_1_2_aligned()
30264 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_1_2_aligned()
30267 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_1_2_aligned()
30310 const std::uint8_t* rhs, std::int32_t m, in gemm_f_2_1_3_aligned()
30311 std::int32_t n, std::int32_t k, in gemm_f_2_1_3_aligned()
30312 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_2_1_3_aligned()
30314 std::int32_t result_stride) { in gemm_f_2_1_3_aligned()
30315 const std::int32_t row_chunks = m / 3; in gemm_f_2_1_3_aligned()
30316 const std::int32_t col_chunks = n / 3; in gemm_f_2_1_3_aligned()
30317 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_1_3_aligned()
30318 const std::int32_t chunk_size = k * 3; in gemm_f_2_1_3_aligned()
30319 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_1_3_aligned()
30320 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_1_3_aligned()
30324 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_1_3_aligned()
30325 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_1_3_aligned()
30326 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_1_3_aligned()
30327 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_1_3_aligned()
30330 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_1_3_aligned()
30332 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_1_3_aligned()
30335 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_1_3_aligned()
30378 const std::uint8_t* rhs, std::int32_t m, in gemm_f_2_1_4_aligned()
30379 std::int32_t n, std::int32_t k, in gemm_f_2_1_4_aligned()
30380 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_2_1_4_aligned()
30382 std::int32_t result_stride) { in gemm_f_2_1_4_aligned()
30383 const std::int32_t row_chunks = m / 3; in gemm_f_2_1_4_aligned()
30384 const std::int32_t col_chunks = n / 3; in gemm_f_2_1_4_aligned()
30385 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_1_4_aligned()
30386 const std::int32_t chunk_size = k * 3; in gemm_f_2_1_4_aligned()
30387 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_1_4_aligned()
30388 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_1_4_aligned()
30392 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_1_4_aligned()
30393 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_1_4_aligned()
30394 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_1_4_aligned()
30395 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_1_4_aligned()
30398 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_1_4_aligned()
30400 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_1_4_aligned()
30403 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_1_4_aligned()
30446 const std::uint8_t* rhs, std::int32_t m, in gemm_f_2_1_5_aligned()
30447 std::int32_t n, std::int32_t k, in gemm_f_2_1_5_aligned()
30448 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_2_1_5_aligned()
30450 std::int32_t result_stride) { in gemm_f_2_1_5_aligned()
30451 const std::int32_t row_chunks = m / 3; in gemm_f_2_1_5_aligned()
30452 const std::int32_t col_chunks = n / 3; in gemm_f_2_1_5_aligned()
30453 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_1_5_aligned()
30454 const std::int32_t chunk_size = k * 3; in gemm_f_2_1_5_aligned()
30455 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_1_5_aligned()
30456 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_1_5_aligned()
30460 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_1_5_aligned()
30461 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_1_5_aligned()
30462 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_1_5_aligned()
30463 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_1_5_aligned()
30466 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_1_5_aligned()
30468 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_1_5_aligned()
30471 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_1_5_aligned()
30514 const std::uint8_t* rhs, std::int32_t m, in gemm_f_2_1_6_aligned()
30515 std::int32_t n, std::int32_t k, in gemm_f_2_1_6_aligned()
30516 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_2_1_6_aligned()
30518 std::int32_t result_stride) { in gemm_f_2_1_6_aligned()
30519 const std::int32_t row_chunks = m / 3; in gemm_f_2_1_6_aligned()
30520 const std::int32_t col_chunks = n / 3; in gemm_f_2_1_6_aligned()
30521 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_1_6_aligned()
30522 const std::int32_t chunk_size = k * 3; in gemm_f_2_1_6_aligned()
30523 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_1_6_aligned()
30524 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_1_6_aligned()
30528 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_1_6_aligned()
30529 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_1_6_aligned()
30530 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_1_6_aligned()
30531 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_1_6_aligned()
30534 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_1_6_aligned()
30536 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_1_6_aligned()
30539 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_1_6_aligned()
30582 const std::uint8_t* rhs, std::int32_t m, in gemm_f_2_1_7_aligned()
30583 std::int32_t n, std::int32_t k, in gemm_f_2_1_7_aligned()
30584 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_2_1_7_aligned()
30586 std::int32_t result_stride) { in gemm_f_2_1_7_aligned()
30587 const std::int32_t row_chunks = m / 3; in gemm_f_2_1_7_aligned()
30588 const std::int32_t col_chunks = n / 3; in gemm_f_2_1_7_aligned()
30589 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_1_7_aligned()
30590 const std::int32_t chunk_size = k * 3; in gemm_f_2_1_7_aligned()
30591 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_1_7_aligned()
30592 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_1_7_aligned()
30596 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_1_7_aligned()
30597 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_1_7_aligned()
30598 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_1_7_aligned()
30599 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_1_7_aligned()
30602 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_1_7_aligned()
30604 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_1_7_aligned()
30607 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_1_7_aligned()
30650 const std::uint8_t* rhs, std::int32_t m, in gemm_f_2_2_0_aligned()
30651 std::int32_t n, std::int32_t k, in gemm_f_2_2_0_aligned()
30652 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_2_2_0_aligned()
30654 std::int32_t result_stride) { in gemm_f_2_2_0_aligned()
30655 const std::int32_t row_chunks = m / 3; in gemm_f_2_2_0_aligned()
30656 const std::int32_t col_chunks = n / 3; in gemm_f_2_2_0_aligned()
30657 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_2_0_aligned()
30658 const std::int32_t chunk_size = k * 3; in gemm_f_2_2_0_aligned()
30659 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_2_0_aligned()
30660 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_2_0_aligned()
30664 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_2_0_aligned()
30665 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_2_0_aligned()
30666 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_2_0_aligned()
30667 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_2_0_aligned()
30670 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_2_0_aligned()
30672 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_2_0_aligned()
30675 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_2_0_aligned()
30718 const std::uint8_t* rhs, std::int32_t m, in gemm_f_2_2_1_aligned()
30719 std::int32_t n, std::int32_t k, in gemm_f_2_2_1_aligned()
30720 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_2_2_1_aligned()
30722 std::int32_t result_stride) { in gemm_f_2_2_1_aligned()
30723 const std::int32_t row_chunks = m / 3; in gemm_f_2_2_1_aligned()
30724 const std::int32_t col_chunks = n / 3; in gemm_f_2_2_1_aligned()
30725 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_2_1_aligned()
30726 const std::int32_t chunk_size = k * 3; in gemm_f_2_2_1_aligned()
30727 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_2_1_aligned()
30728 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_2_1_aligned()
30732 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_2_1_aligned()
30733 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_2_1_aligned()
30734 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_2_1_aligned()
30735 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_2_1_aligned()
30738 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_2_1_aligned()
30740 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_2_1_aligned()
30743 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_2_1_aligned()
30786 const std::uint8_t* rhs, std::int32_t m, in gemm_f_2_2_2_aligned()
30787 std::int32_t n, std::int32_t k, in gemm_f_2_2_2_aligned()
30788 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_2_2_2_aligned()
30790 std::int32_t result_stride) { in gemm_f_2_2_2_aligned()
30791 const std::int32_t row_chunks = m / 3; in gemm_f_2_2_2_aligned()
30792 const std::int32_t col_chunks = n / 3; in gemm_f_2_2_2_aligned()
30793 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_2_2_aligned()
30794 const std::int32_t chunk_size = k * 3; in gemm_f_2_2_2_aligned()
30795 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_2_2_aligned()
30796 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_2_2_aligned()
30800 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_2_2_aligned()
30801 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_2_2_aligned()
30802 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_2_2_aligned()
30803 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_2_2_aligned()
30806 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_2_2_aligned()
30808 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_2_2_aligned()
30811 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_2_2_aligned()
30854 const std::uint8_t* rhs, std::int32_t m, in gemm_f_2_2_3_aligned()
30855 std::int32_t n, std::int32_t k, in gemm_f_2_2_3_aligned()
30856 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_2_2_3_aligned()
30858 std::int32_t result_stride) { in gemm_f_2_2_3_aligned()
30859 const std::int32_t row_chunks = m / 3; in gemm_f_2_2_3_aligned()
30860 const std::int32_t col_chunks = n / 3; in gemm_f_2_2_3_aligned()
30861 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_2_3_aligned()
30862 const std::int32_t chunk_size = k * 3; in gemm_f_2_2_3_aligned()
30863 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_2_3_aligned()
30864 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_2_3_aligned()
30868 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_2_3_aligned()
30869 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_2_3_aligned()
30870 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_2_3_aligned()
30871 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_2_3_aligned()
30874 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_2_3_aligned()
30876 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_2_3_aligned()
30879 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_2_3_aligned()
30922 const std::uint8_t* rhs, std::int32_t m, in gemm_f_2_2_4_aligned()
30923 std::int32_t n, std::int32_t k, in gemm_f_2_2_4_aligned()
30924 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_2_2_4_aligned()
30926 std::int32_t result_stride) { in gemm_f_2_2_4_aligned()
30927 const std::int32_t row_chunks = m / 3; in gemm_f_2_2_4_aligned()
30928 const std::int32_t col_chunks = n / 3; in gemm_f_2_2_4_aligned()
30929 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_2_4_aligned()
30930 const std::int32_t chunk_size = k * 3; in gemm_f_2_2_4_aligned()
30931 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_2_4_aligned()
30932 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_2_4_aligned()
30936 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_2_4_aligned()
30937 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_2_4_aligned()
30938 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_2_4_aligned()
30939 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_2_4_aligned()
30942 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_2_4_aligned()
30944 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_2_4_aligned()
30947 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_2_4_aligned()
30990 const std::uint8_t* rhs, std::int32_t m, in gemm_f_2_2_5_aligned()
30991 std::int32_t n, std::int32_t k, in gemm_f_2_2_5_aligned()
30992 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_2_2_5_aligned()
30994 std::int32_t result_stride) { in gemm_f_2_2_5_aligned()
30995 const std::int32_t row_chunks = m / 3; in gemm_f_2_2_5_aligned()
30996 const std::int32_t col_chunks = n / 3; in gemm_f_2_2_5_aligned()
30997 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_2_5_aligned()
30998 const std::int32_t chunk_size = k * 3; in gemm_f_2_2_5_aligned()
30999 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_2_5_aligned()
31000 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_2_5_aligned()
31004 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_2_5_aligned()
31005 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_2_5_aligned()
31006 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_2_5_aligned()
31007 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_2_5_aligned()
31010 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_2_5_aligned()
31012 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_2_5_aligned()
31015 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_2_5_aligned()
31058 const std::uint8_t* rhs, std::int32_t m, in gemm_f_2_2_6_aligned()
31059 std::int32_t n, std::int32_t k, in gemm_f_2_2_6_aligned()
31060 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_2_2_6_aligned()
31062 std::int32_t result_stride) { in gemm_f_2_2_6_aligned()
31063 const std::int32_t row_chunks = m / 3; in gemm_f_2_2_6_aligned()
31064 const std::int32_t col_chunks = n / 3; in gemm_f_2_2_6_aligned()
31065 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_2_6_aligned()
31066 const std::int32_t chunk_size = k * 3; in gemm_f_2_2_6_aligned()
31067 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_2_6_aligned()
31068 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_2_6_aligned()
31072 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_2_6_aligned()
31073 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_2_6_aligned()
31074 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_2_6_aligned()
31075 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_2_6_aligned()
31078 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_2_6_aligned()
31080 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_2_6_aligned()
31083 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_2_6_aligned()
31126 const std::uint8_t* rhs, std::int32_t m, in gemm_f_2_2_7_aligned()
31127 std::int32_t n, std::int32_t k, in gemm_f_2_2_7_aligned()
31128 std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f_2_2_7_aligned()
31130 std::int32_t result_stride) { in gemm_f_2_2_7_aligned()
31131 const std::int32_t row_chunks = m / 3; in gemm_f_2_2_7_aligned()
31132 const std::int32_t col_chunks = n / 3; in gemm_f_2_2_7_aligned()
31133 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_2_7_aligned()
31134 const std::int32_t chunk_size = k * 3; in gemm_f_2_2_7_aligned()
31135 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_2_7_aligned()
31136 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_2_7_aligned()
31140 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_2_7_aligned()
31141 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_2_7_aligned()
31142 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_2_7_aligned()
31143 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_2_7_aligned()
31146 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_2_7_aligned()
31148 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_2_7_aligned()
31151 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_2_7_aligned()
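Every variant defines the same two result strides: result_chunk_stride = result_stride * 3 and mul_result_chunk_stride_bytes = result_stride * 4. The names suggest the first advances the output by a 3-row chunk counted in elements while the second is one output row counted in bytes (4-byte float or int32 elements); that reading is inferred from the names alone, and the check below only demonstrates the unit conversion.

  #include <cassert>
  #include <cstdint>

  int main() {
    // Example value: elements between consecutive output rows.
    const std::int32_t result_stride = 128;
    // Stride to jump a whole 3-row output chunk, still counted in elements.
    const std::int32_t result_chunk_stride = result_stride * 3;
    // The same per-row stride expressed in bytes, assuming 4-byte output
    // elements (float or std::int32_t).
    const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4;
    assert(result_chunk_stride == 384);
    assert(mul_result_chunk_stride_bytes ==
           static_cast<std::int32_t>(result_stride * sizeof(float)));
    return 0;
  }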
31194 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_0_0_0()
31195 std::int32_t k, std::int32_t lhs_offset, in gemm_f_0_0_0()
31196 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_0_0_0()
31197 std::int32_t result_stride) { in gemm_f_0_0_0()
31198 const std::int32_t row_chunks = m / 3; in gemm_f_0_0_0()
31199 const std::int32_t col_chunks = n / 3; in gemm_f_0_0_0()
31200 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_0_0()
31201 const std::int32_t chunk_size = k * 3; in gemm_f_0_0_0()
31202 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_0_0()
31203 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_0_0()
31207 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_0_0()
31208 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_0_0()
31211 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_0_0()
31213 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_0_0()
31216 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_0_0()
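const_offset = lhs_offset * rhs_offset * k is the constant term that falls out of expanding a zero-point-offset 8-bit dot product: sum((l + lhs_offset) * (r + rhs_offset)) = sum(l * r) + lhs_offset * sum(r) + rhs_offset * sum(l) + lhs_offset * rhs_offset * k. Whether these kernels apply it exactly that way is an assumption based on the naming; the standalone check below only verifies the identity itself on synthetic data.

  #include <cassert>
  #include <cstdint>
  #include <vector>

  // Verifies the algebraic identity behind const_offset = lhs_offset * rhs_offset * k.
  int main() {
    const std::int32_t k = 10;
    const std::int32_t lhs_offset = -128;
    const std::int32_t rhs_offset = -128;
    std::vector<std::uint8_t> lhs(k), rhs(k);
    for (std::int32_t i = 0; i < k; ++i) {
      lhs[i] = static_cast<std::uint8_t>(3 * i + 1);
      rhs[i] = static_cast<std::uint8_t>(5 * i + 2);
    }
    std::int32_t direct = 0, raw = 0, lhs_sum = 0, rhs_sum = 0;
    for (std::int32_t i = 0; i < k; ++i) {
      direct += (lhs[i] + lhs_offset) * (rhs[i] + rhs_offset);
      raw += lhs[i] * rhs[i];
      lhs_sum += lhs[i];
      rhs_sum += rhs[i];
    }
    const std::int32_t const_offset = lhs_offset * rhs_offset * k;
    assert(direct == raw + lhs_offset * rhs_sum + rhs_offset * lhs_sum + const_offset);
    return 0;
  }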
31241 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_0_0_1()
31242 std::int32_t k, std::int32_t lhs_offset, in gemm_f_0_0_1()
31243 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_0_0_1()
31244 std::int32_t result_stride) { in gemm_f_0_0_1()
31245 const std::int32_t row_chunks = m / 3; in gemm_f_0_0_1()
31246 const std::int32_t col_chunks = n / 3; in gemm_f_0_0_1()
31247 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_0_1()
31248 const std::int32_t chunk_size = k * 3; in gemm_f_0_0_1()
31249 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_0_1()
31250 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_0_1()
31254 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_0_1()
31255 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_0_1()
31258 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_0_1()
31260 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_0_1()
31263 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_0_1()
31288 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_0_0_2()
31289 std::int32_t k, std::int32_t lhs_offset, in gemm_f_0_0_2()
31290 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_0_0_2()
31291 std::int32_t result_stride) { in gemm_f_0_0_2()
31292 const std::int32_t row_chunks = m / 3; in gemm_f_0_0_2()
31293 const std::int32_t col_chunks = n / 3; in gemm_f_0_0_2()
31294 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_0_2()
31295 const std::int32_t chunk_size = k * 3; in gemm_f_0_0_2()
31296 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_0_2()
31297 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_0_2()
31301 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_0_2()
31302 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_0_2()
31305 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_0_2()
31307 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_0_2()
31310 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_0_2()
31335 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_0_0_3()
31336 std::int32_t k, std::int32_t lhs_offset, in gemm_f_0_0_3()
31337 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_0_0_3()
31338 std::int32_t result_stride) { in gemm_f_0_0_3()
31339 const std::int32_t row_chunks = m / 3; in gemm_f_0_0_3()
31340 const std::int32_t col_chunks = n / 3; in gemm_f_0_0_3()
31341 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_0_3()
31342 const std::int32_t chunk_size = k * 3; in gemm_f_0_0_3()
31343 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_0_3()
31344 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_0_3()
31348 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_0_3()
31349 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_0_3()
31352 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_0_3()
31354 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_0_3()
31357 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_0_3()
31382 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_0_0_4()
31383 std::int32_t k, std::int32_t lhs_offset, in gemm_f_0_0_4()
31384 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_0_0_4()
31385 std::int32_t result_stride) { in gemm_f_0_0_4()
31386 const std::int32_t row_chunks = m / 3; in gemm_f_0_0_4()
31387 const std::int32_t col_chunks = n / 3; in gemm_f_0_0_4()
31388 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_0_4()
31389 const std::int32_t chunk_size = k * 3; in gemm_f_0_0_4()
31390 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_0_4()
31391 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_0_4()
31395 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_0_4()
31396 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_0_4()
31399 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_0_4()
31401 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_0_4()
31404 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_0_4()
31429 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_0_0_5()
31430 std::int32_t k, std::int32_t lhs_offset, in gemm_f_0_0_5()
31431 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_0_0_5()
31432 std::int32_t result_stride) { in gemm_f_0_0_5()
31433 const std::int32_t row_chunks = m / 3; in gemm_f_0_0_5()
31434 const std::int32_t col_chunks = n / 3; in gemm_f_0_0_5()
31435 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_0_5()
31436 const std::int32_t chunk_size = k * 3; in gemm_f_0_0_5()
31437 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_0_5()
31438 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_0_5()
31442 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_0_5()
31443 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_0_5()
31446 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_0_5()
31448 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_0_5()
31451 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_0_5()
31476 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_0_0_6()
31477 std::int32_t k, std::int32_t lhs_offset, in gemm_f_0_0_6()
31478 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_0_0_6()
31479 std::int32_t result_stride) { in gemm_f_0_0_6()
31480 const std::int32_t row_chunks = m / 3; in gemm_f_0_0_6()
31481 const std::int32_t col_chunks = n / 3; in gemm_f_0_0_6()
31482 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_0_6()
31483 const std::int32_t chunk_size = k * 3; in gemm_f_0_0_6()
31484 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_0_6()
31485 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_0_6()
31489 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_0_6()
31490 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_0_6()
31493 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_0_6()
31495 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_0_6()
31498 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_0_6()
31523 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_0_0_7()
31524 std::int32_t k, std::int32_t lhs_offset, in gemm_f_0_0_7()
31525 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_0_0_7()
31526 std::int32_t result_stride) { in gemm_f_0_0_7()
31527 const std::int32_t row_chunks = m / 3; in gemm_f_0_0_7()
31528 const std::int32_t col_chunks = n / 3; in gemm_f_0_0_7()
31529 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_0_7()
31530 const std::int32_t chunk_size = k * 3; in gemm_f_0_0_7()
31531 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_0_7()
31532 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_0_7()
31536 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_0_7()
31537 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_0_7()
31540 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_0_7()
31542 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_0_7()
31545 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_0_7()
31570 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_0_1_0()
31571 std::int32_t k, std::int32_t lhs_offset, in gemm_f_0_1_0()
31572 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_0_1_0()
31573 std::int32_t result_stride) { in gemm_f_0_1_0()
31574 const std::int32_t row_chunks = m / 3; in gemm_f_0_1_0()
31575 const std::int32_t col_chunks = n / 3; in gemm_f_0_1_0()
31576 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_1_0()
31577 const std::int32_t chunk_size = k * 3; in gemm_f_0_1_0()
31578 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_1_0()
31579 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_1_0()
31583 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_1_0()
31584 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_1_0()
31587 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_1_0()
31589 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_1_0()
31592 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_1_0()
31621 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_0_1_1()
31622 std::int32_t k, std::int32_t lhs_offset, in gemm_f_0_1_1()
31623 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_0_1_1()
31624 std::int32_t result_stride) { in gemm_f_0_1_1()
31625 const std::int32_t row_chunks = m / 3; in gemm_f_0_1_1()
31626 const std::int32_t col_chunks = n / 3; in gemm_f_0_1_1()
31627 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_1_1()
31628 const std::int32_t chunk_size = k * 3; in gemm_f_0_1_1()
31629 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_1_1()
31630 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_1_1()
31634 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_1_1()
31635 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_1_1()
31638 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_1_1()
31640 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_1_1()
31643 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_1_1()
31672 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_0_1_2()
31673 std::int32_t k, std::int32_t lhs_offset, in gemm_f_0_1_2()
31674 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_0_1_2()
31675 std::int32_t result_stride) { in gemm_f_0_1_2()
31676 const std::int32_t row_chunks = m / 3; in gemm_f_0_1_2()
31677 const std::int32_t col_chunks = n / 3; in gemm_f_0_1_2()
31678 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_1_2()
31679 const std::int32_t chunk_size = k * 3; in gemm_f_0_1_2()
31680 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_1_2()
31681 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_1_2()
31685 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_1_2()
31686 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_1_2()
31689 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_1_2()
31691 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_1_2()
31694 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_1_2()
31723 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_0_1_3()
31724 std::int32_t k, std::int32_t lhs_offset, in gemm_f_0_1_3()
31725 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_0_1_3()
31726 std::int32_t result_stride) { in gemm_f_0_1_3()
31727 const std::int32_t row_chunks = m / 3; in gemm_f_0_1_3()
31728 const std::int32_t col_chunks = n / 3; in gemm_f_0_1_3()
31729 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_1_3()
31730 const std::int32_t chunk_size = k * 3; in gemm_f_0_1_3()
31731 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_1_3()
31732 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_1_3()
31736 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_1_3()
31737 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_1_3()
31740 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_1_3()
31742 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_1_3()
31745 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_1_3()
31774 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_0_1_4()
31775 std::int32_t k, std::int32_t lhs_offset, in gemm_f_0_1_4()
31776 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_0_1_4()
31777 std::int32_t result_stride) { in gemm_f_0_1_4()
31778 const std::int32_t row_chunks = m / 3; in gemm_f_0_1_4()
31779 const std::int32_t col_chunks = n / 3; in gemm_f_0_1_4()
31780 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_1_4()
31781 const std::int32_t chunk_size = k * 3; in gemm_f_0_1_4()
31782 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_1_4()
31783 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_1_4()
31787 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_1_4()
31788 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_1_4()
31791 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_1_4()
31793 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_1_4()
31796 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_1_4()
31825 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_0_1_5()
31826 std::int32_t k, std::int32_t lhs_offset, in gemm_f_0_1_5()
31827 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_0_1_5()
31828 std::int32_t result_stride) { in gemm_f_0_1_5()
31829 const std::int32_t row_chunks = m / 3; in gemm_f_0_1_5()
31830 const std::int32_t col_chunks = n / 3; in gemm_f_0_1_5()
31831 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_1_5()
31832 const std::int32_t chunk_size = k * 3; in gemm_f_0_1_5()
31833 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_1_5()
31834 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_1_5()
31838 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_1_5()
31839 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_1_5()
31842 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_1_5()
31844 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_1_5()
31847 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_1_5()
31876 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_0_1_6()
31877 std::int32_t k, std::int32_t lhs_offset, in gemm_f_0_1_6()
31878 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_0_1_6()
31879 std::int32_t result_stride) { in gemm_f_0_1_6()
31880 const std::int32_t row_chunks = m / 3; in gemm_f_0_1_6()
31881 const std::int32_t col_chunks = n / 3; in gemm_f_0_1_6()
31882 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_1_6()
31883 const std::int32_t chunk_size = k * 3; in gemm_f_0_1_6()
31884 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_1_6()
31885 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_1_6()
31889 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_1_6()
31890 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_1_6()
31893 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_1_6()
31895 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_1_6()
31898 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_1_6()
31927 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_0_1_7()
31928 std::int32_t k, std::int32_t lhs_offset, in gemm_f_0_1_7()
31929 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_0_1_7()
31930 std::int32_t result_stride) { in gemm_f_0_1_7()
31931 const std::int32_t row_chunks = m / 3; in gemm_f_0_1_7()
31932 const std::int32_t col_chunks = n / 3; in gemm_f_0_1_7()
31933 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_1_7()
31934 const std::int32_t chunk_size = k * 3; in gemm_f_0_1_7()
31935 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_1_7()
31936 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_1_7()
31940 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_1_7()
31941 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_1_7()
31944 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_1_7()
31946 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_1_7()
31949 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_1_7()
31978 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_0_2_0()
31979 std::int32_t k, std::int32_t lhs_offset, in gemm_f_0_2_0()
31980 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_0_2_0()
31981 std::int32_t result_stride) { in gemm_f_0_2_0()
31982 const std::int32_t row_chunks = m / 3; in gemm_f_0_2_0()
31983 const std::int32_t col_chunks = n / 3; in gemm_f_0_2_0()
31984 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_2_0()
31985 const std::int32_t chunk_size = k * 3; in gemm_f_0_2_0()
31986 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_2_0()
31987 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_2_0()
31991 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_2_0()
31992 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_2_0()
31995 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_2_0()
31997 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_2_0()
32000 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_2_0()
32029 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_0_2_1()
32030 std::int32_t k, std::int32_t lhs_offset, in gemm_f_0_2_1()
32031 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_0_2_1()
32032 std::int32_t result_stride) { in gemm_f_0_2_1()
32033 const std::int32_t row_chunks = m / 3; in gemm_f_0_2_1()
32034 const std::int32_t col_chunks = n / 3; in gemm_f_0_2_1()
32035 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_2_1()
32036 const std::int32_t chunk_size = k * 3; in gemm_f_0_2_1()
32037 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_2_1()
32038 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_2_1()
32042 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_2_1()
32043 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_2_1()
32046 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_2_1()
32048 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_2_1()
32051 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_2_1()
32080 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_0_2_2()
32081 std::int32_t k, std::int32_t lhs_offset, in gemm_f_0_2_2()
32082 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_0_2_2()
32083 std::int32_t result_stride) { in gemm_f_0_2_2()
32084 const std::int32_t row_chunks = m / 3; in gemm_f_0_2_2()
32085 const std::int32_t col_chunks = n / 3; in gemm_f_0_2_2()
32086 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_2_2()
32087 const std::int32_t chunk_size = k * 3; in gemm_f_0_2_2()
32088 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_2_2()
32089 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_2_2()
32093 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_2_2()
32094 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_2_2()
32097 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_2_2()
32099 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_2_2()
32102 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_2_2()
32131 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_0_2_3()
32132 std::int32_t k, std::int32_t lhs_offset, in gemm_f_0_2_3()
32133 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_0_2_3()
32134 std::int32_t result_stride) { in gemm_f_0_2_3()
32135 const std::int32_t row_chunks = m / 3; in gemm_f_0_2_3()
32136 const std::int32_t col_chunks = n / 3; in gemm_f_0_2_3()
32137 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_2_3()
32138 const std::int32_t chunk_size = k * 3; in gemm_f_0_2_3()
32139 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_2_3()
32140 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_2_3()
32144 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_2_3()
32145 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_2_3()
32148 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_2_3()
32150 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_2_3()
32153 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_2_3()
32182 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_0_2_4()
32183 std::int32_t k, std::int32_t lhs_offset, in gemm_f_0_2_4()
32184 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_0_2_4()
32185 std::int32_t result_stride) { in gemm_f_0_2_4()
32186 const std::int32_t row_chunks = m / 3; in gemm_f_0_2_4()
32187 const std::int32_t col_chunks = n / 3; in gemm_f_0_2_4()
32188 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_2_4()
32189 const std::int32_t chunk_size = k * 3; in gemm_f_0_2_4()
32190 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_2_4()
32191 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_2_4()
32195 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_2_4()
32196 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_2_4()
32199 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_2_4()
32201 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_2_4()
32204 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_2_4()
32233 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_0_2_5()
32234 std::int32_t k, std::int32_t lhs_offset, in gemm_f_0_2_5()
32235 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_0_2_5()
32236 std::int32_t result_stride) { in gemm_f_0_2_5()
32237 const std::int32_t row_chunks = m / 3; in gemm_f_0_2_5()
32238 const std::int32_t col_chunks = n / 3; in gemm_f_0_2_5()
32239 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_2_5()
32240 const std::int32_t chunk_size = k * 3; in gemm_f_0_2_5()
32241 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_2_5()
32242 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_2_5()
32246 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_2_5()
32247 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_2_5()
32250 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_2_5()
32252 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_2_5()
32255 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_2_5()
32284 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_0_2_6()
32285 std::int32_t k, std::int32_t lhs_offset, in gemm_f_0_2_6()
32286 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_0_2_6()
32287 std::int32_t result_stride) { in gemm_f_0_2_6()
32288 const std::int32_t row_chunks = m / 3; in gemm_f_0_2_6()
32289 const std::int32_t col_chunks = n / 3; in gemm_f_0_2_6()
32290 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_2_6()
32291 const std::int32_t chunk_size = k * 3; in gemm_f_0_2_6()
32292 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_2_6()
32293 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_2_6()
32297 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_2_6()
32298 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_2_6()
32301 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_2_6()
32303 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_2_6()
32306 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_2_6()
32335 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_0_2_7()
32336 std::int32_t k, std::int32_t lhs_offset, in gemm_f_0_2_7()
32337 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_0_2_7()
32338 std::int32_t result_stride) { in gemm_f_0_2_7()
32339 const std::int32_t row_chunks = m / 3; in gemm_f_0_2_7()
32340 const std::int32_t col_chunks = n / 3; in gemm_f_0_2_7()
32341 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_0_2_7()
32342 const std::int32_t chunk_size = k * 3; in gemm_f_0_2_7()
32343 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_0_2_7()
32344 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_0_2_7()
32348 std::int32_t* zipped_lhs_3_offsets = in gemm_f_0_2_7()
32349 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_0_2_7()
32352 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_0_2_7()
32354 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_0_2_7()
32357 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_0_2_7()
32386 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_1_0_0()
32387 std::int32_t k, std::int32_t lhs_offset, in gemm_f_1_0_0()
32388 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_1_0_0()
32389 std::int32_t result_stride) { in gemm_f_1_0_0()
32390 const std::int32_t row_chunks = m / 3; in gemm_f_1_0_0()
32391 const std::int32_t col_chunks = n / 3; in gemm_f_1_0_0()
32392 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_0_0()
32393 const std::int32_t chunk_size = k * 3; in gemm_f_1_0_0()
32394 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_0_0()
32395 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_0_0()
32399 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_0_0()
32400 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_0_0()
32401 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_0_0()
32402 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_0_0()
32405 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_0_0()
32407 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_0_0()
32410 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_0_0()
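Starting with gemm_f_1_0_0 the listings add zipped_lhs_1_offsets at byte offset padded_k * 1 alongside the zipped_lhs_3_offsets seen everywhere. A hedged reading of the gemm_f_<r>_<c>_<d> suffixes is that they encode the leftover counts m % 3, n % 3 and k % 8, so the tail lhs chunk packs only r rows and its offset block follows those r rows. The sketch below only illustrates that interpretation; none of its values, nor the naming reading itself, are confirmed by the indexed source.

  #include <cstdint>
  #include <cstdio>

  // Hedged reading of the variant naming: suffixes as leftover counts, and the
  // tail lhs offset block placed after however many rows the tail chunk packs.
  int main() {
    const std::int32_t m = 7, n = 6, k = 24;
    const std::int32_t row_leftovers = m % 3;    // 1 -> would select a gemm_f_1_*_* variant
    const std::int32_t col_leftovers = n % 3;    // 0
    const std::int32_t depth_leftovers = k % 8;  // 0
    const std::int32_t padded_k = ((k + 7) / 8) * 8;
    const std::int32_t leftover_offsets_byte_pos = padded_k * row_leftovers;
    std::printf("%d %d %d %d\n", row_leftovers, col_leftovers, depth_leftovers,
                leftover_offsets_byte_pos);
    return 0;
  }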
32446 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_1_0_1()
32447 std::int32_t k, std::int32_t lhs_offset, in gemm_f_1_0_1()
32448 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_1_0_1()
32449 std::int32_t result_stride) { in gemm_f_1_0_1()
32450 const std::int32_t row_chunks = m / 3; in gemm_f_1_0_1()
32451 const std::int32_t col_chunks = n / 3; in gemm_f_1_0_1()
32452 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_0_1()
32453 const std::int32_t chunk_size = k * 3; in gemm_f_1_0_1()
32454 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_0_1()
32455 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_0_1()
32459 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_0_1()
32460 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_0_1()
32461 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_0_1()
32462 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_0_1()
32465 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_0_1()
32467 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_0_1()
32470 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_0_1()
32506 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_1_0_2()
32507 std::int32_t k, std::int32_t lhs_offset, in gemm_f_1_0_2()
32508 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_1_0_2()
32509 std::int32_t result_stride) { in gemm_f_1_0_2()
32510 const std::int32_t row_chunks = m / 3; in gemm_f_1_0_2()
32511 const std::int32_t col_chunks = n / 3; in gemm_f_1_0_2()
32512 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_0_2()
32513 const std::int32_t chunk_size = k * 3; in gemm_f_1_0_2()
32514 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_0_2()
32515 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_0_2()
32519 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_0_2()
32520 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_0_2()
32521 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_0_2()
32522 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_0_2()
32525 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_0_2()
32527 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_0_2()
32530 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_0_2()
32566 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_1_0_3()
32567 std::int32_t k, std::int32_t lhs_offset, in gemm_f_1_0_3()
32568 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_1_0_3()
32569 std::int32_t result_stride) { in gemm_f_1_0_3()
32570 const std::int32_t row_chunks = m / 3; in gemm_f_1_0_3()
32571 const std::int32_t col_chunks = n / 3; in gemm_f_1_0_3()
32572 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_0_3()
32573 const std::int32_t chunk_size = k * 3; in gemm_f_1_0_3()
32574 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_0_3()
32575 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_0_3()
32579 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_0_3()
32580 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_0_3()
32581 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_0_3()
32582 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_0_3()
32585 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_0_3()
32587 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_0_3()
32590 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_0_3()
32626 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_1_0_4()
32627 std::int32_t k, std::int32_t lhs_offset, in gemm_f_1_0_4()
32628 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_1_0_4()
32629 std::int32_t result_stride) { in gemm_f_1_0_4()
32630 const std::int32_t row_chunks = m / 3; in gemm_f_1_0_4()
32631 const std::int32_t col_chunks = n / 3; in gemm_f_1_0_4()
32632 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_0_4()
32633 const std::int32_t chunk_size = k * 3; in gemm_f_1_0_4()
32634 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_0_4()
32635 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_0_4()
32639 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_0_4()
32640 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_0_4()
32641 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_0_4()
32642 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_0_4()
32645 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_0_4()
32647 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_0_4()
32650 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_0_4()
32686 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_1_0_5()
32687 std::int32_t k, std::int32_t lhs_offset, in gemm_f_1_0_5()
32688 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_1_0_5()
32689 std::int32_t result_stride) { in gemm_f_1_0_5()
32690 const std::int32_t row_chunks = m / 3; in gemm_f_1_0_5()
32691 const std::int32_t col_chunks = n / 3; in gemm_f_1_0_5()
32692 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_0_5()
32693 const std::int32_t chunk_size = k * 3; in gemm_f_1_0_5()
32694 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_0_5()
32695 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_0_5()
32699 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_0_5()
32700 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_0_5()
32701 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_0_5()
32702 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_0_5()
32705 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_0_5()
32707 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_0_5()
32710 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_0_5()
32746 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_1_0_6()
32747 std::int32_t k, std::int32_t lhs_offset, in gemm_f_1_0_6()
32748 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_1_0_6()
32749 std::int32_t result_stride) { in gemm_f_1_0_6()
32750 const std::int32_t row_chunks = m / 3; in gemm_f_1_0_6()
32751 const std::int32_t col_chunks = n / 3; in gemm_f_1_0_6()
32752 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_0_6()
32753 const std::int32_t chunk_size = k * 3; in gemm_f_1_0_6()
32754 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_0_6()
32755 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_0_6()
32759 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_0_6()
32760 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_0_6()
32761 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_0_6()
32762 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_0_6()
32765 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_0_6()
32767 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_0_6()
32770 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_0_6()
32806 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_1_0_7()
32807 std::int32_t k, std::int32_t lhs_offset, in gemm_f_1_0_7()
32808 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_1_0_7()
32809 std::int32_t result_stride) { in gemm_f_1_0_7()
32810 const std::int32_t row_chunks = m / 3; in gemm_f_1_0_7()
32811 const std::int32_t col_chunks = n / 3; in gemm_f_1_0_7()
32812 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_0_7()
32813 const std::int32_t chunk_size = k * 3; in gemm_f_1_0_7()
32814 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_0_7()
32815 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_0_7()
32819 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_0_7()
32820 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_0_7()
32821 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_0_7()
32822 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_0_7()
32825 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_0_7()
32827 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_0_7()
32830 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_0_7()
32866 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_1_1_0()
32867 std::int32_t k, std::int32_t lhs_offset, in gemm_f_1_1_0()
32868 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_1_1_0()
32869 std::int32_t result_stride) { in gemm_f_1_1_0()
32870 const std::int32_t row_chunks = m / 3; in gemm_f_1_1_0()
32871 const std::int32_t col_chunks = n / 3; in gemm_f_1_1_0()
32872 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_1_0()
32873 const std::int32_t chunk_size = k * 3; in gemm_f_1_1_0()
32874 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_1_0()
32875 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_1_0()
32879 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_1_0()
32880 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_1_0()
32881 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_1_0()
32882 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_1_0()
32885 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_1_0()
32887 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_1_0()
32890 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_1_0()
32933 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_1_1_1()
32934 std::int32_t k, std::int32_t lhs_offset, in gemm_f_1_1_1()
32935 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_1_1_1()
32936 std::int32_t result_stride) { in gemm_f_1_1_1()
32937 const std::int32_t row_chunks = m / 3; in gemm_f_1_1_1()
32938 const std::int32_t col_chunks = n / 3; in gemm_f_1_1_1()
32939 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_1_1()
32940 const std::int32_t chunk_size = k * 3; in gemm_f_1_1_1()
32941 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_1_1()
32942 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_1_1()
32946 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_1_1()
32947 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_1_1()
32948 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_1_1()
32949 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_1_1()
32952 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_1_1()
32954 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_1_1()
32957 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_1_1()
33000 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_1_1_2()
33001 std::int32_t k, std::int32_t lhs_offset, in gemm_f_1_1_2()
33002 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_1_1_2()
33003 std::int32_t result_stride) { in gemm_f_1_1_2()
33004 const std::int32_t row_chunks = m / 3; in gemm_f_1_1_2()
33005 const std::int32_t col_chunks = n / 3; in gemm_f_1_1_2()
33006 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_1_2()
33007 const std::int32_t chunk_size = k * 3; in gemm_f_1_1_2()
33008 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_1_2()
33009 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_1_2()
33013 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_1_2()
33014 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_1_2()
33015 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_1_2()
33016 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_1_2()
33019 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_1_2()
33021 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_1_2()
33024 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_1_2()
33067 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_1_1_3()
33068 std::int32_t k, std::int32_t lhs_offset, in gemm_f_1_1_3()
33069 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_1_1_3()
33070 std::int32_t result_stride) { in gemm_f_1_1_3()
33071 const std::int32_t row_chunks = m / 3; in gemm_f_1_1_3()
33072 const std::int32_t col_chunks = n / 3; in gemm_f_1_1_3()
33073 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_1_3()
33074 const std::int32_t chunk_size = k * 3; in gemm_f_1_1_3()
33075 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_1_3()
33076 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_1_3()
33080 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_1_3()
33081 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_1_3()
33082 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_1_3()
33083 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_1_3()
33086 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_1_3()
33088 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_1_3()
33091 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_1_3()
33134 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_1_1_4()
33135 std::int32_t k, std::int32_t lhs_offset, in gemm_f_1_1_4()
33136 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_1_1_4()
33137 std::int32_t result_stride) { in gemm_f_1_1_4()
33138 const std::int32_t row_chunks = m / 3; in gemm_f_1_1_4()
33139 const std::int32_t col_chunks = n / 3; in gemm_f_1_1_4()
33140 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_1_4()
33141 const std::int32_t chunk_size = k * 3; in gemm_f_1_1_4()
33142 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_1_4()
33143 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_1_4()
33147 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_1_4()
33148 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_1_4()
33149 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_1_4()
33150 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_1_4()
33153 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_1_4()
33155 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_1_4()
33158 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_1_4()
33201 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_1_1_5()
33202 std::int32_t k, std::int32_t lhs_offset, in gemm_f_1_1_5()
33203 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_1_1_5()
33204 std::int32_t result_stride) { in gemm_f_1_1_5()
33205 const std::int32_t row_chunks = m / 3; in gemm_f_1_1_5()
33206 const std::int32_t col_chunks = n / 3; in gemm_f_1_1_5()
33207 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_1_5()
33208 const std::int32_t chunk_size = k * 3; in gemm_f_1_1_5()
33209 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_1_5()
33210 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_1_5()
33214 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_1_5()
33215 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_1_5()
33216 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_1_5()
33217 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_1_5()
33220 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_1_5()
33222 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_1_5()
33225 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_1_5()
33268 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_1_1_6()
33269 std::int32_t k, std::int32_t lhs_offset, in gemm_f_1_1_6()
33270 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_1_1_6()
33271 std::int32_t result_stride) { in gemm_f_1_1_6()
33272 const std::int32_t row_chunks = m / 3; in gemm_f_1_1_6()
33273 const std::int32_t col_chunks = n / 3; in gemm_f_1_1_6()
33274 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_1_6()
33275 const std::int32_t chunk_size = k * 3; in gemm_f_1_1_6()
33276 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_1_6()
33277 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_1_6()
33281 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_1_6()
33282 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_1_6()
33283 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_1_6()
33284 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_1_6()
33287 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_1_6()
33289 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_1_6()
33292 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_1_6()
33335 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_1_1_7()
33336 std::int32_t k, std::int32_t lhs_offset, in gemm_f_1_1_7()
33337 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_1_1_7()
33338 std::int32_t result_stride) { in gemm_f_1_1_7()
33339 const std::int32_t row_chunks = m / 3; in gemm_f_1_1_7()
33340 const std::int32_t col_chunks = n / 3; in gemm_f_1_1_7()
33341 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_1_7()
33342 const std::int32_t chunk_size = k * 3; in gemm_f_1_1_7()
33343 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_1_7()
33344 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_1_7()
33348 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_1_7()
33349 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_1_7()
33350 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_1_7()
33351 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_1_7()
33354 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_1_7()
33356 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_1_7()
33359 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_1_7()
33402 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_1_2_0()
33403 std::int32_t k, std::int32_t lhs_offset, in gemm_f_1_2_0()
33404 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_1_2_0()
33405 std::int32_t result_stride) { in gemm_f_1_2_0()
33406 const std::int32_t row_chunks = m / 3; in gemm_f_1_2_0()
33407 const std::int32_t col_chunks = n / 3; in gemm_f_1_2_0()
33408 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_2_0()
33409 const std::int32_t chunk_size = k * 3; in gemm_f_1_2_0()
33410 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_2_0()
33411 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_2_0()
33415 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_2_0()
33416 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_2_0()
33417 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_2_0()
33418 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_2_0()
33421 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_2_0()
33423 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_2_0()
33426 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_2_0()
33469 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_1_2_1()
33470 std::int32_t k, std::int32_t lhs_offset, in gemm_f_1_2_1()
33471 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_1_2_1()
33472 std::int32_t result_stride) { in gemm_f_1_2_1()
33473 const std::int32_t row_chunks = m / 3; in gemm_f_1_2_1()
33474 const std::int32_t col_chunks = n / 3; in gemm_f_1_2_1()
33475 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_2_1()
33476 const std::int32_t chunk_size = k * 3; in gemm_f_1_2_1()
33477 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_2_1()
33478 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_2_1()
33482 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_2_1()
33483 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_2_1()
33484 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_2_1()
33485 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_2_1()
33488 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_2_1()
33490 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_2_1()
33493 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_2_1()
33536 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_1_2_2()
33537 std::int32_t k, std::int32_t lhs_offset, in gemm_f_1_2_2()
33538 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_1_2_2()
33539 std::int32_t result_stride) { in gemm_f_1_2_2()
33540 const std::int32_t row_chunks = m / 3; in gemm_f_1_2_2()
33541 const std::int32_t col_chunks = n / 3; in gemm_f_1_2_2()
33542 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_2_2()
33543 const std::int32_t chunk_size = k * 3; in gemm_f_1_2_2()
33544 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_2_2()
33545 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_2_2()
33549 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_2_2()
33550 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_2_2()
33551 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_2_2()
33552 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_2_2()
33555 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_2_2()
33557 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_2_2()
33560 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_2_2()
33603 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_1_2_3()
33604 std::int32_t k, std::int32_t lhs_offset, in gemm_f_1_2_3()
33605 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_1_2_3()
33606 std::int32_t result_stride) { in gemm_f_1_2_3()
33607 const std::int32_t row_chunks = m / 3; in gemm_f_1_2_3()
33608 const std::int32_t col_chunks = n / 3; in gemm_f_1_2_3()
33609 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_2_3()
33610 const std::int32_t chunk_size = k * 3; in gemm_f_1_2_3()
33611 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_2_3()
33612 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_2_3()
33616 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_2_3()
33617 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_2_3()
33618 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_2_3()
33619 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_2_3()
33622 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_2_3()
33624 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_2_3()
33627 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_2_3()
33670 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_1_2_4()
33671 std::int32_t k, std::int32_t lhs_offset, in gemm_f_1_2_4()
33672 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_1_2_4()
33673 std::int32_t result_stride) { in gemm_f_1_2_4()
33674 const std::int32_t row_chunks = m / 3; in gemm_f_1_2_4()
33675 const std::int32_t col_chunks = n / 3; in gemm_f_1_2_4()
33676 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_2_4()
33677 const std::int32_t chunk_size = k * 3; in gemm_f_1_2_4()
33678 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_2_4()
33679 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_2_4()
33683 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_2_4()
33684 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_2_4()
33685 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_2_4()
33686 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_2_4()
33689 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_2_4()
33691 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_2_4()
33694 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_2_4()
33737 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_1_2_5()
33738 std::int32_t k, std::int32_t lhs_offset, in gemm_f_1_2_5()
33739 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_1_2_5()
33740 std::int32_t result_stride) { in gemm_f_1_2_5()
33741 const std::int32_t row_chunks = m / 3; in gemm_f_1_2_5()
33742 const std::int32_t col_chunks = n / 3; in gemm_f_1_2_5()
33743 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_2_5()
33744 const std::int32_t chunk_size = k * 3; in gemm_f_1_2_5()
33745 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_2_5()
33746 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_2_5()
33750 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_2_5()
33751 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_2_5()
33752 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_2_5()
33753 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_2_5()
33756 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_2_5()
33758 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_2_5()
33761 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_2_5()
33804 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_1_2_6()
33805 std::int32_t k, std::int32_t lhs_offset, in gemm_f_1_2_6()
33806 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_1_2_6()
33807 std::int32_t result_stride) { in gemm_f_1_2_6()
33808 const std::int32_t row_chunks = m / 3; in gemm_f_1_2_6()
33809 const std::int32_t col_chunks = n / 3; in gemm_f_1_2_6()
33810 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_2_6()
33811 const std::int32_t chunk_size = k * 3; in gemm_f_1_2_6()
33812 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_2_6()
33813 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_2_6()
33817 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_2_6()
33818 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_2_6()
33819 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_2_6()
33820 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_2_6()
33823 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_2_6()
33825 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_2_6()
33828 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_2_6()
33871 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_1_2_7()
33872 std::int32_t k, std::int32_t lhs_offset, in gemm_f_1_2_7()
33873 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_1_2_7()
33874 std::int32_t result_stride) { in gemm_f_1_2_7()
33875 const std::int32_t row_chunks = m / 3; in gemm_f_1_2_7()
33876 const std::int32_t col_chunks = n / 3; in gemm_f_1_2_7()
33877 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_1_2_7()
33878 const std::int32_t chunk_size = k * 3; in gemm_f_1_2_7()
33879 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_1_2_7()
33880 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_1_2_7()
33884 std::int32_t* zipped_lhs_3_offsets = in gemm_f_1_2_7()
33885 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_1_2_7()
33886 std::int32_t* zipped_lhs_1_offsets = in gemm_f_1_2_7()
33887 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1); in gemm_f_1_2_7()
33890 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_1_2_7()
33892 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_1_2_7()
33895 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_1_2_7()
33938 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_2_0_0()
33939 std::int32_t k, std::int32_t lhs_offset, in gemm_f_2_0_0()
33940 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_2_0_0()
33941 std::int32_t result_stride) { in gemm_f_2_0_0()
33942 const std::int32_t row_chunks = m / 3; in gemm_f_2_0_0()
33943 const std::int32_t col_chunks = n / 3; in gemm_f_2_0_0()
33944 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_0_0()
33945 const std::int32_t chunk_size = k * 3; in gemm_f_2_0_0()
33946 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_0_0()
33947 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_0_0()
33951 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_0_0()
33952 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_0_0()
33953 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_0_0()
33954 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_0_0()
33957 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_0_0()
33959 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_0_0()
33962 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_0_0()
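The gemm_f_2_*_* preambles that follow differ from the gemm_f_1_*_* ones above only in forming zipped_lhs_2_offsets at zipped_lhs + padded_k * 2 rather than zipped_lhs_1_offsets at zipped_lhs + padded_k * 1. A small sketch of that pointer arithmetic, reading the 1/2 in the names as the number of leftover LHS rows (a plausible inference from these pointers, not something the listing states outright):

#include <cstdint>

// Sketch only: the packed LHS chunk stores its rows of padded_k bytes back to
// back, and the listing reinterprets the bytes after the last packed row as
// per-row std::int32_t offsets.  zipped_lhs is an illustrative scratch pointer.
void offset_pointers_sketch(std::uint8_t* zipped_lhs, std::int32_t padded_k) {
  // After three full rows: offsets used with a full 3-row chunk.
  std::int32_t* zipped_lhs_3_offsets =
      reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3);
  // After two rows: offsets used by the gemm_f_2_*_* leftover path.
  std::int32_t* zipped_lhs_2_offsets =
      reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2);
  // After one row: offsets used by the gemm_f_1_*_* leftover path.
  std::int32_t* zipped_lhs_1_offsets =
      reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 1);
  (void)zipped_lhs_3_offsets;
  (void)zipped_lhs_2_offsets;
  (void)zipped_lhs_1_offsets;
}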
33998 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_2_0_1()
33999 std::int32_t k, std::int32_t lhs_offset, in gemm_f_2_0_1()
34000 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_2_0_1()
34001 std::int32_t result_stride) { in gemm_f_2_0_1()
34002 const std::int32_t row_chunks = m / 3; in gemm_f_2_0_1()
34003 const std::int32_t col_chunks = n / 3; in gemm_f_2_0_1()
34004 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_0_1()
34005 const std::int32_t chunk_size = k * 3; in gemm_f_2_0_1()
34006 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_0_1()
34007 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_0_1()
34011 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_0_1()
34012 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_0_1()
34013 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_0_1()
34014 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_0_1()
34017 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_0_1()
34019 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_0_1()
34022 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_0_1()
34058 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_2_0_2()
34059 std::int32_t k, std::int32_t lhs_offset, in gemm_f_2_0_2()
34060 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_2_0_2()
34061 std::int32_t result_stride) { in gemm_f_2_0_2()
34062 const std::int32_t row_chunks = m / 3; in gemm_f_2_0_2()
34063 const std::int32_t col_chunks = n / 3; in gemm_f_2_0_2()
34064 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_0_2()
34065 const std::int32_t chunk_size = k * 3; in gemm_f_2_0_2()
34066 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_0_2()
34067 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_0_2()
34071 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_0_2()
34072 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_0_2()
34073 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_0_2()
34074 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_0_2()
34077 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_0_2()
34079 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_0_2()
34082 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_0_2()
34118 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_2_0_3()
34119 std::int32_t k, std::int32_t lhs_offset, in gemm_f_2_0_3()
34120 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_2_0_3()
34121 std::int32_t result_stride) { in gemm_f_2_0_3()
34122 const std::int32_t row_chunks = m / 3; in gemm_f_2_0_3()
34123 const std::int32_t col_chunks = n / 3; in gemm_f_2_0_3()
34124 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_0_3()
34125 const std::int32_t chunk_size = k * 3; in gemm_f_2_0_3()
34126 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_0_3()
34127 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_0_3()
34131 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_0_3()
34132 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_0_3()
34133 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_0_3()
34134 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_0_3()
34137 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_0_3()
34139 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_0_3()
34142 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_0_3()
34178 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_2_0_4()
34179 std::int32_t k, std::int32_t lhs_offset, in gemm_f_2_0_4()
34180 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_2_0_4()
34181 std::int32_t result_stride) { in gemm_f_2_0_4()
34182 const std::int32_t row_chunks = m / 3; in gemm_f_2_0_4()
34183 const std::int32_t col_chunks = n / 3; in gemm_f_2_0_4()
34184 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_0_4()
34185 const std::int32_t chunk_size = k * 3; in gemm_f_2_0_4()
34186 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_0_4()
34187 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_0_4()
34191 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_0_4()
34192 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_0_4()
34193 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_0_4()
34194 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_0_4()
34197 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_0_4()
34199 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_0_4()
34202 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_0_4()
34238 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_2_0_5()
34239 std::int32_t k, std::int32_t lhs_offset, in gemm_f_2_0_5()
34240 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_2_0_5()
34241 std::int32_t result_stride) { in gemm_f_2_0_5()
34242 const std::int32_t row_chunks = m / 3; in gemm_f_2_0_5()
34243 const std::int32_t col_chunks = n / 3; in gemm_f_2_0_5()
34244 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_0_5()
34245 const std::int32_t chunk_size = k * 3; in gemm_f_2_0_5()
34246 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_0_5()
34247 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_0_5()
34251 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_0_5()
34252 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_0_5()
34253 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_0_5()
34254 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_0_5()
34257 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_0_5()
34259 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_0_5()
34262 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_0_5()
34298 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_2_0_6()
34299 std::int32_t k, std::int32_t lhs_offset, in gemm_f_2_0_6()
34300 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_2_0_6()
34301 std::int32_t result_stride) { in gemm_f_2_0_6()
34302 const std::int32_t row_chunks = m / 3; in gemm_f_2_0_6()
34303 const std::int32_t col_chunks = n / 3; in gemm_f_2_0_6()
34304 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_0_6()
34305 const std::int32_t chunk_size = k * 3; in gemm_f_2_0_6()
34306 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_0_6()
34307 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_0_6()
34311 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_0_6()
34312 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_0_6()
34313 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_0_6()
34314 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_0_6()
34317 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_0_6()
34319 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_0_6()
34322 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_0_6()
34358 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_2_0_7()
34359 std::int32_t k, std::int32_t lhs_offset, in gemm_f_2_0_7()
34360 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_2_0_7()
34361 std::int32_t result_stride) { in gemm_f_2_0_7()
34362 const std::int32_t row_chunks = m / 3; in gemm_f_2_0_7()
34363 const std::int32_t col_chunks = n / 3; in gemm_f_2_0_7()
34364 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_0_7()
34365 const std::int32_t chunk_size = k * 3; in gemm_f_2_0_7()
34366 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_0_7()
34367 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_0_7()
34371 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_0_7()
34372 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_0_7()
34373 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_0_7()
34374 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_0_7()
34377 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_0_7()
34379 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_0_7()
34382 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_0_7()
34418 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_2_1_0()
34419 std::int32_t k, std::int32_t lhs_offset, in gemm_f_2_1_0()
34420 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_2_1_0()
34421 std::int32_t result_stride) { in gemm_f_2_1_0()
34422 const std::int32_t row_chunks = m / 3; in gemm_f_2_1_0()
34423 const std::int32_t col_chunks = n / 3; in gemm_f_2_1_0()
34424 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_1_0()
34425 const std::int32_t chunk_size = k * 3; in gemm_f_2_1_0()
34426 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_1_0()
34427 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_1_0()
34431 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_1_0()
34432 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_1_0()
34433 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_1_0()
34434 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_1_0()
34437 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_1_0()
34439 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_1_0()
34442 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_1_0()
34485 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_2_1_1()
34486 std::int32_t k, std::int32_t lhs_offset, in gemm_f_2_1_1()
34487 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_2_1_1()
34488 std::int32_t result_stride) { in gemm_f_2_1_1()
34489 const std::int32_t row_chunks = m / 3; in gemm_f_2_1_1()
34490 const std::int32_t col_chunks = n / 3; in gemm_f_2_1_1()
34491 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_1_1()
34492 const std::int32_t chunk_size = k * 3; in gemm_f_2_1_1()
34493 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_1_1()
34494 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_1_1()
34498 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_1_1()
34499 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_1_1()
34500 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_1_1()
34501 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_1_1()
34504 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_1_1()
34506 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_1_1()
34509 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_1_1()
34552 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_2_1_2()
34553 std::int32_t k, std::int32_t lhs_offset, in gemm_f_2_1_2()
34554 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_2_1_2()
34555 std::int32_t result_stride) { in gemm_f_2_1_2()
34556 const std::int32_t row_chunks = m / 3; in gemm_f_2_1_2()
34557 const std::int32_t col_chunks = n / 3; in gemm_f_2_1_2()
34558 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_1_2()
34559 const std::int32_t chunk_size = k * 3; in gemm_f_2_1_2()
34560 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_1_2()
34561 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_1_2()
34565 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_1_2()
34566 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_1_2()
34567 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_1_2()
34568 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_1_2()
34571 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_1_2()
34573 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_1_2()
34576 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_1_2()
34619 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_2_1_3()
34620 std::int32_t k, std::int32_t lhs_offset, in gemm_f_2_1_3()
34621 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_2_1_3()
34622 std::int32_t result_stride) { in gemm_f_2_1_3()
34623 const std::int32_t row_chunks = m / 3; in gemm_f_2_1_3()
34624 const std::int32_t col_chunks = n / 3; in gemm_f_2_1_3()
34625 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_1_3()
34626 const std::int32_t chunk_size = k * 3; in gemm_f_2_1_3()
34627 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_1_3()
34628 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_1_3()
34632 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_1_3()
34633 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_1_3()
34634 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_1_3()
34635 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_1_3()
34638 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_1_3()
34640 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_1_3()
34643 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_1_3()
34686 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_2_1_4()
34687 std::int32_t k, std::int32_t lhs_offset, in gemm_f_2_1_4()
34688 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_2_1_4()
34689 std::int32_t result_stride) { in gemm_f_2_1_4()
34690 const std::int32_t row_chunks = m / 3; in gemm_f_2_1_4()
34691 const std::int32_t col_chunks = n / 3; in gemm_f_2_1_4()
34692 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_1_4()
34693 const std::int32_t chunk_size = k * 3; in gemm_f_2_1_4()
34694 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_1_4()
34695 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_1_4()
34699 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_1_4()
34700 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_1_4()
34701 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_1_4()
34702 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_1_4()
34705 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_1_4()
34707 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_1_4()
34710 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_1_4()
34753 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_2_1_5()
34754 std::int32_t k, std::int32_t lhs_offset, in gemm_f_2_1_5()
34755 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_2_1_5()
34756 std::int32_t result_stride) { in gemm_f_2_1_5()
34757 const std::int32_t row_chunks = m / 3; in gemm_f_2_1_5()
34758 const std::int32_t col_chunks = n / 3; in gemm_f_2_1_5()
34759 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_1_5()
34760 const std::int32_t chunk_size = k * 3; in gemm_f_2_1_5()
34761 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_1_5()
34762 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_1_5()
34766 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_1_5()
34767 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_1_5()
34768 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_1_5()
34769 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_1_5()
34772 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_1_5()
34774 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_1_5()
34777 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_1_5()
34820 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_2_1_6()
34821 std::int32_t k, std::int32_t lhs_offset, in gemm_f_2_1_6()
34822 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_2_1_6()
34823 std::int32_t result_stride) { in gemm_f_2_1_6()
34824 const std::int32_t row_chunks = m / 3; in gemm_f_2_1_6()
34825 const std::int32_t col_chunks = n / 3; in gemm_f_2_1_6()
34826 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_1_6()
34827 const std::int32_t chunk_size = k * 3; in gemm_f_2_1_6()
34828 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_1_6()
34829 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_1_6()
34833 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_1_6()
34834 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_1_6()
34835 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_1_6()
34836 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_1_6()
34839 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_1_6()
34841 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_1_6()
34844 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_1_6()
34887 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_2_1_7()
34888 std::int32_t k, std::int32_t lhs_offset, in gemm_f_2_1_7()
34889 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_2_1_7()
34890 std::int32_t result_stride) { in gemm_f_2_1_7()
34891 const std::int32_t row_chunks = m / 3; in gemm_f_2_1_7()
34892 const std::int32_t col_chunks = n / 3; in gemm_f_2_1_7()
34893 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_1_7()
34894 const std::int32_t chunk_size = k * 3; in gemm_f_2_1_7()
34895 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_1_7()
34896 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_1_7()
34900 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_1_7()
34901 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_1_7()
34902 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_1_7()
34903 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_1_7()
34906 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_1_7()
34908 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_1_7()
34911 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_1_7()
34954 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_2_2_0()
34955 std::int32_t k, std::int32_t lhs_offset, in gemm_f_2_2_0()
34956 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_2_2_0()
34957 std::int32_t result_stride) { in gemm_f_2_2_0()
34958 const std::int32_t row_chunks = m / 3; in gemm_f_2_2_0()
34959 const std::int32_t col_chunks = n / 3; in gemm_f_2_2_0()
34960 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_2_0()
34961 const std::int32_t chunk_size = k * 3; in gemm_f_2_2_0()
34962 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_2_0()
34963 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_2_0()
34967 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_2_0()
34968 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_2_0()
34969 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_2_0()
34970 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_2_0()
34973 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_2_0()
34975 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_2_0()
34978 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_2_0()
35021 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_2_2_1()
35022 std::int32_t k, std::int32_t lhs_offset, in gemm_f_2_2_1()
35023 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_2_2_1()
35024 std::int32_t result_stride) { in gemm_f_2_2_1()
35025 const std::int32_t row_chunks = m / 3; in gemm_f_2_2_1()
35026 const std::int32_t col_chunks = n / 3; in gemm_f_2_2_1()
35027 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_2_1()
35028 const std::int32_t chunk_size = k * 3; in gemm_f_2_2_1()
35029 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_2_1()
35030 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_2_1()
35034 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_2_1()
35035 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_2_1()
35036 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_2_1()
35037 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_2_1()
35040 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_2_1()
35042 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_2_1()
35045 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_2_1()
35088 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_2_2_2()
35089 std::int32_t k, std::int32_t lhs_offset, in gemm_f_2_2_2()
35090 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_2_2_2()
35091 std::int32_t result_stride) { in gemm_f_2_2_2()
35092 const std::int32_t row_chunks = m / 3; in gemm_f_2_2_2()
35093 const std::int32_t col_chunks = n / 3; in gemm_f_2_2_2()
35094 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_2_2()
35095 const std::int32_t chunk_size = k * 3; in gemm_f_2_2_2()
35096 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_2_2()
35097 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_2_2()
35101 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_2_2()
35102 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_2_2()
35103 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_2_2()
35104 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_2_2()
35107 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_2_2()
35109 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_2_2()
35112 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_2_2()
35155 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_2_2_3()
35156 std::int32_t k, std::int32_t lhs_offset, in gemm_f_2_2_3()
35157 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_2_2_3()
35158 std::int32_t result_stride) { in gemm_f_2_2_3()
35159 const std::int32_t row_chunks = m / 3; in gemm_f_2_2_3()
35160 const std::int32_t col_chunks = n / 3; in gemm_f_2_2_3()
35161 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_2_3()
35162 const std::int32_t chunk_size = k * 3; in gemm_f_2_2_3()
35163 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_2_3()
35164 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_2_3()
35168 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_2_3()
35169 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_2_3()
35170 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_2_3()
35171 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_2_3()
35174 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_2_3()
35176 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_2_3()
35179 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_2_3()
35222 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_2_2_4()
35223 std::int32_t k, std::int32_t lhs_offset, in gemm_f_2_2_4()
35224 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_2_2_4()
35225 std::int32_t result_stride) { in gemm_f_2_2_4()
35226 const std::int32_t row_chunks = m / 3; in gemm_f_2_2_4()
35227 const std::int32_t col_chunks = n / 3; in gemm_f_2_2_4()
35228 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_2_4()
35229 const std::int32_t chunk_size = k * 3; in gemm_f_2_2_4()
35230 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_2_4()
35231 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_2_4()
35235 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_2_4()
35236 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_2_4()
35237 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_2_4()
35238 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_2_4()
35241 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_2_4()
35243 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_2_4()
35246 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_2_4()
35289 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_2_2_5()
35290 std::int32_t k, std::int32_t lhs_offset, in gemm_f_2_2_5()
35291 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_2_2_5()
35292 std::int32_t result_stride) { in gemm_f_2_2_5()
35293 const std::int32_t row_chunks = m / 3; in gemm_f_2_2_5()
35294 const std::int32_t col_chunks = n / 3; in gemm_f_2_2_5()
35295 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_2_5()
35296 const std::int32_t chunk_size = k * 3; in gemm_f_2_2_5()
35297 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_2_5()
35298 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_2_5()
35302 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_2_5()
35303 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_2_5()
35304 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_2_5()
35305 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_2_5()
35308 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_2_5()
35310 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_2_5()
35313 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_2_5()
35356 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_2_2_6()
35357 std::int32_t k, std::int32_t lhs_offset, in gemm_f_2_2_6()
35358 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_2_2_6()
35359 std::int32_t result_stride) { in gemm_f_2_2_6()
35360 const std::int32_t row_chunks = m / 3; in gemm_f_2_2_6()
35361 const std::int32_t col_chunks = n / 3; in gemm_f_2_2_6()
35362 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_2_6()
35363 const std::int32_t chunk_size = k * 3; in gemm_f_2_2_6()
35364 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_2_6()
35365 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_2_6()
35369 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_2_6()
35370 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_2_6()
35371 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_2_6()
35372 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_2_6()
35375 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_2_6()
35377 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_2_6()
35380 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_2_6()
35423 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_2_2_7()
35424 std::int32_t k, std::int32_t lhs_offset, in gemm_f_2_2_7()
35425 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_2_2_7()
35426 std::int32_t result_stride) { in gemm_f_2_2_7()
35427 const std::int32_t row_chunks = m / 3; in gemm_f_2_2_7()
35428 const std::int32_t col_chunks = n / 3; in gemm_f_2_2_7()
35429 const std::int32_t padded_k = ((k + 7) / 8) * 8; in gemm_f_2_2_7()
35430 const std::int32_t chunk_size = k * 3; in gemm_f_2_2_7()
35431 const std::int32_t zipped_chunk_size = (padded_k + 16) * 3; in gemm_f_2_2_7()
35432 const std::int32_t zipped_rhs_size = (padded_k + 16) * n; in gemm_f_2_2_7()
35436 std::int32_t* zipped_lhs_3_offsets = in gemm_f_2_2_7()
35437 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 3); in gemm_f_2_2_7()
35438 std::int32_t* zipped_lhs_2_offsets = in gemm_f_2_2_7()
35439 reinterpret_cast<std::int32_t*>(zipped_lhs + padded_k * 2); in gemm_f_2_2_7()
35442 const std::int32_t result_chunk_stride = result_stride * 3; in gemm_f_2_2_7()
35444 const std::int32_t const_offset = lhs_offset * rhs_offset * k; in gemm_f_2_2_7()
35447 const std::int32_t mul_result_chunk_stride_bytes = result_stride * 4; in gemm_f_2_2_7()
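That is the last of the generated gemm_f variants. Their names appear to enumerate every combination of leftover sizes, R = m % 3 rows, C = n % 3 columns and L = k % 8 depth, which would account for the 3 x 3 x 8 = 72 specialisations. The real selection logic is not visible in this listing, but a dispatcher in that spirit could index them as follows (purely illustrative):

#include <cstdint>

// Hypothetical index into a 3 x 3 x 8 table of specialised kernels, one per
// (m % 3, n % 3, k % 8) combination.
std::int32_t gemm_variant_index(std::int32_t m, std::int32_t n, std::int32_t k) {
  return ((m % 3) * 3 + (n % 3)) * 8 + (k % 8);
}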
35492 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8_strided()
35493 std::int32_t k, std::int32_t lhs_offset, in gemm_q8_strided()
35494 std::int32_t rhs_offset, std::int32_t result_offset, in gemm_q8_strided()
35495 std::int32_t multiplicative_offset, std::int32_t shift, in gemm_q8_strided()
35496 std::uint8_t* result, std::int32_t result_stride) { in gemm_q8_strided()
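gemm_q8_strided adds result_offset, multiplicative_offset and shift parameters and writes std::uint8_t. One plausible reading of those parameters, stated here as an assumption since only the signature is visible, is a fixed-point requantisation of each 32-bit accumulator:

#include <algorithm>
#include <cstdint>

// Assumed output stage for the q8 path; the real kernels' rounding and
// clamping details are not shown in this listing.  Assumes shift > 0.
std::uint8_t requantize(std::int32_t acc, std::int32_t result_offset,
                        std::int32_t multiplicative_offset, std::int32_t shift) {
  const std::int32_t scaled = (acc + result_offset) * multiplicative_offset;
  const std::int32_t rounded = (scaled + (1 << (shift - 1))) >> shift;
  return static_cast<std::uint8_t>(std::min(255, std::max(0, rounded)));
}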
36475 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32_strided()
36476 std::int32_t k, std::int32_t lhs_offset, in gemm_i32_strided()
36477 std::int32_t rhs_offset, std::int32_t* result, in gemm_i32_strided()
36478 std::int32_t result_stride) { in gemm_i32_strided()
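gemm_i32_strided exposes the raw 32-bit accumulators. A scalar reference for what such a kernel computes, under the assumption (suggested by the depth-major packing above, but not stated in the listing) that the LHS is m x k row-major and the RHS is stored transposed as n rows of k values:

#include <cstdint>

// Scalar reference for the int32 output path; operand layout is an assumption,
// and result_stride is counted in elements, as the preambles above imply.
void gemm_i32_reference(const std::uint8_t* lhs, const std::uint8_t* rhs,
                        std::int32_t m, std::int32_t n, std::int32_t k,
                        std::int32_t lhs_offset, std::int32_t rhs_offset,
                        std::int32_t* result, std::int32_t result_stride) {
  for (std::int32_t i = 0; i < m; ++i) {
    for (std::int32_t j = 0; j < n; ++j) {
      std::int32_t acc = 0;
      for (std::int32_t d = 0; d < k; ++d) {
        acc += (static_cast<std::int32_t>(lhs[i * k + d]) + lhs_offset) *
               (static_cast<std::int32_t>(rhs[j * k + d]) + rhs_offset);
      }
      result[i * result_stride + j] = acc;
    }
  }
}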
37237 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f_strided()
37238 std::int32_t k, std::int32_t lhs_offset, in gemm_f_strided()
37239 std::int32_t rhs_offset, float result_scale, float* result, in gemm_f_strided()
37240 std::int32_t result_stride) { in gemm_f_strided()
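gemm_f_strided swaps the integer output parameters for a single result_scale and a float* result, which together with the result_stride * 4 byte stride computed in the preambles suggests (again an assumption, not shown in the listing) that each accumulator is simply scaled into a float:

#include <cstdint>

// Assumed float output stage for gemm_f_strided: scale the int32 accumulator.
inline float scale_accumulator(std::int32_t acc, float result_scale) {
  return static_cast<float>(acc) * result_scale;
}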
38071 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_q8()
38072 std::int32_t k, std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_q8()
38073 std::int32_t result_offset, std::int32_t multiplicative_offset, in gemm_q8()
38074 std::int32_t shift, std::uint8_t* result) { in gemm_q8()
38080 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_i32()
38081 std::int32_t k, std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_i32()
38082 std::int32_t* result) { in gemm_i32()
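Finally, the gemm_q8 and gemm_i32 overloads above and the gemm_f overload just below carry the same parameters as their _strided counterparts minus result_stride; a natural reading, though not confirmed by this listing, is that they forward to the strided versions with a densely packed output, i.e. result_stride = n.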
38088 const std::uint8_t* rhs, std::int32_t m, std::int32_t n, in gemm_f()
38089 std::int32_t k, std::int32_t lhs_offset, std::int32_t rhs_offset, in gemm_f()