1 // Copyright 2016 The Gemmlowp Authors. All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef GEMMLOWP_META_QUANTIZED_MUL_KERNELS_ARM_32_H_ 16 #define GEMMLOWP_META_QUANTIZED_MUL_KERNELS_ARM_32_H_ 17 18 #ifdef GEMMLOWP_NEON_32 19 20 #include <cassert> 21 #include <cstdint> 22 23 namespace gemmlowp { 24 namespace meta { 25 26 template <> 27 inline void 28 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 1, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)29 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 30 const FusedKernelParams<QuantizedStaticPreprocessed, 31 RowMajor>& params, 32 uint8_t* result) { 33 #ifdef DEBUG 34 #ifdef DEBUG_METAGEMM_VERBOSE 35 std::cout << __FILE__ << "(" << __LINE__ 36 << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " 37 "QuantizedStaticPreprocessed, RowMajor, 1, 1, 8>::Multiply()" 38 << std::endl 39 << std::flush; 40 #endif 41 #endif 42 asm volatile( 43 "pld [%[lhs]]\n" 44 "pld [%[rhs]]\n" 45 46 // Clear aggregators. 47 "vmov.i32 q0, #0\n" 48 49 // General NxM lanes loop. 50 "1:" 51 52 // Subtract counter. 53 "subs %[count], %[count], #8\n" 54 55 "vld1.32 {d2}, [%[lhs]:64]!\n" 56 "vld1.32 {d3}, [%[rhs]:64]!\n" 57 "pld [%[lhs], #64]\n" 58 "pld [%[rhs], #64]\n" 59 "vmull.u8 q2, d3, d2\n" 60 "vpadal.u16 q0, q2\n" 61 62 // Loop break. 63 "bgt 1b\n" 64 65 // StaticQuantization::Prepare 66 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" 67 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" 68 "vdup.32 q6, %[multiplicative_offset]\n" 69 "vdup.32 q7, %[rounding_offset]\n" 70 "vdup.32 q8, %[shift]\n" 71 "vdup.32 q4, d8[0]\n" 72 73 // RowMajorOutput::Prepare 74 75 // Reduce aggregators. 76 "vpadd.u32 d0, d0, d1\n" 77 "vpadd.u32 d0, d0, d0\n" 78 79 // StaticQuantization::Transform 80 "vadd.s32 q0, q0, q4\n" 81 "vadd.s32 q0, q0, q5\n" 82 "vmul.i32 q0, q0, q6\n" 83 "vadd.i32 q0, q0, q7\n" 84 "vshl.s32 q0, q0, q8\n" 85 "vqmovn.s32 d0, q0\n" 86 "vqmovun.s16 d0, q0\n" 87 88 // RowMajorOutput::Output 89 "vst1.8 {d0[0]}, [%[result]]!\n" 90 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 91 : [count] "r"(params.kernel.count), 92 [multiplicative_offset] "r"(params.kernel.multiplicative_offset), 93 [shift] "r"(params.kernel.shift), 94 [stride] "r"(params.output_stream.stride), 95 [rounding_offset] "r"(params.kernel.rounding_offset) 96 : "d0", "d1", "d2", "d3", "d4", "d5", "d8", "d9", "d10", "d11", "d12", 97 "d13", "d14", "d15", "d16", "d17", "cc", "memory"); 98 } 99 100 template <> 101 inline void 102 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 2, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)103 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 104 const FusedKernelParams<QuantizedStaticPreprocessed, 105 RowMajor>& params, 106 uint8_t* result) { 107 #ifdef DEBUG 108 #ifdef DEBUG_METAGEMM_VERBOSE 109 std::cout << __FILE__ << "(" << __LINE__ 110 << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " 111 "QuantizedStaticPreprocessed, RowMajor, 1, 2, 8>::Multiply()" 112 << std::endl 113 << std::flush; 114 #endif 115 #endif 116 asm volatile( 117 "pld [%[lhs]]\n" 118 "pld [%[rhs]]\n" 119 120 // Clear aggregators. 121 "vmov.i32 q0, #0\n" 122 "vmov.i32 q1, #0\n" 123 124 // General NxM lanes loop. 125 "1:" 126 127 // Subtract counter. 128 "subs %[count], %[count], #8\n" 129 130 "vld1.32 {d4}, [%[lhs]:64]!\n" 131 "vld1.32 {d5, d6}, [%[rhs]:64]!\n" 132 "pld [%[lhs], #64]\n" 133 "pld [%[rhs], #64]\n" 134 "vmull.u8 q4, d5, d4\n" 135 "vmull.u8 q5, d6, d4\n" 136 "vpadal.u16 q0, q4\n" 137 "vpadal.u16 q1, q5\n" 138 139 // Loop break. 140 "bgt 1b\n" 141 142 // StaticQuantization::Prepare 143 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" 144 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" 145 "vdup.32 q6, %[multiplicative_offset]\n" 146 "vdup.32 q7, %[rounding_offset]\n" 147 "vdup.32 q8, %[shift]\n" 148 "vdup.32 q4, d8[0]\n" 149 150 // RowMajorOutput::Prepare 151 152 // Reduce aggregators. 153 "vpadd.u32 d0, d0, d1\n" 154 "vpadd.u32 d2, d2, d3\n" 155 "vpadd.u32 d0, d0, d2\n" 156 157 // StaticQuantization::Transform 158 "vadd.s32 q0, q0, q4\n" 159 "vadd.s32 q0, q0, q5\n" 160 "vmul.i32 q0, q0, q6\n" 161 "vadd.i32 q0, q0, q7\n" 162 "vshl.s32 q0, q0, q8\n" 163 "vqmovn.s32 d0, q0\n" 164 "vqmovun.s16 d0, q0\n" 165 166 // RowMajorOutput::Output 167 "vst1.16 {d0[0]}, [%[result]]!\n" 168 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 169 : [count] "r"(params.kernel.count), 170 [multiplicative_offset] "r"(params.kernel.multiplicative_offset), 171 [shift] "r"(params.kernel.shift), 172 [stride] "r"(params.output_stream.stride), 173 [rounding_offset] "r"(params.kernel.rounding_offset) 174 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d8", "d9", "d10", "d11", 175 "d12", "d13", "d14", "d15", "d16", "d17", "cc", "memory"); 176 } 177 178 template <> 179 inline void 180 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 3, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)181 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 182 const FusedKernelParams<QuantizedStaticPreprocessed, 183 RowMajor>& params, 184 uint8_t* result) { 185 #ifdef DEBUG 186 #ifdef DEBUG_METAGEMM_VERBOSE 187 std::cout << __FILE__ << "(" << __LINE__ 188 << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " 189 "QuantizedStaticPreprocessed, RowMajor, 1, 3, 8>::Multiply()" 190 << std::endl 191 << std::flush; 192 #endif 193 #endif 194 asm volatile( 195 "pld [%[lhs]]\n" 196 "pld [%[rhs]]\n" 197 198 // Clear aggregators. 199 "vmov.i32 q0, #0\n" 200 "vmov.i32 q1, #0\n" 201 "vmov.i32 q2, #0\n" 202 203 // General NxM lanes loop. 204 "1:" 205 206 // Subtract counter. 207 "subs %[count], %[count], #8\n" 208 209 "vld1.32 {d6}, [%[lhs]:64]!\n" 210 "vld1.32 {d7, d8, d9}, [%[rhs]:64]!\n" 211 "pld [%[lhs], #64]\n" 212 "pld [%[rhs], #64]\n" 213 "vmull.u8 q5, d7, d6\n" 214 "vmull.u8 q6, d8, d6\n" 215 "vmull.u8 q7, d9, d6\n" 216 "vpadal.u16 q0, q5\n" 217 "vpadal.u16 q1, q6\n" 218 "vpadal.u16 q2, q7\n" 219 220 // Loop break. 221 "bgt 1b\n" 222 223 // StaticQuantization::Prepare 224 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" 225 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" 226 "vdup.32 q6, %[multiplicative_offset]\n" 227 "vdup.32 q7, %[rounding_offset]\n" 228 "vdup.32 q8, %[shift]\n" 229 "vdup.32 q4, d8[0]\n" 230 231 // RowMajorOutput::Prepare 232 233 // Reduce aggregators. 234 "vpadd.u32 d0, d0, d1\n" 235 "vpadd.u32 d2, d2, d3\n" 236 "vpadd.u32 d4, d4, d5\n" 237 "vpadd.u32 d0, d0, d2\n" 238 "vpadd.u32 d1, d4, d4\n" 239 240 // StaticQuantization::Transform 241 "vadd.s32 q0, q0, q4\n" 242 "vadd.s32 q0, q0, q5\n" 243 "vmul.i32 q0, q0, q6\n" 244 "vadd.i32 q0, q0, q7\n" 245 "vshl.s32 q0, q0, q8\n" 246 "vqmovn.s32 d0, q0\n" 247 "vqmovun.s16 d0, q0\n" 248 249 // RowMajorOutput::Output 250 "vst1.16 {d0[0]}, [%[result]]!\n" 251 "vst1.8 {d0[2]}, [%[result]]!\n" 252 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 253 : [count] "r"(params.kernel.count), 254 [multiplicative_offset] "r"(params.kernel.multiplicative_offset), 255 [shift] "r"(params.kernel.shift), 256 [stride] "r"(params.output_stream.stride), 257 [rounding_offset] "r"(params.kernel.rounding_offset) 258 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", 259 "d11", "d12", "d13", "d14", "d15", "d16", "d17", "cc", "memory"); 260 } 261 262 template <> 263 inline void 264 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 4, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)265 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 266 const FusedKernelParams<QuantizedStaticPreprocessed, 267 RowMajor>& params, 268 uint8_t* result) { 269 #ifdef DEBUG 270 #ifdef DEBUG_METAGEMM_VERBOSE 271 std::cout << __FILE__ << "(" << __LINE__ 272 << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " 273 "QuantizedStaticPreprocessed, RowMajor, 1, 4, 8>::Multiply()" 274 << std::endl 275 << std::flush; 276 #endif 277 #endif 278 asm volatile( 279 "pld [%[lhs]]\n" 280 "pld [%[rhs]]\n" 281 282 // Clear aggregators. 283 "vmov.i32 q0, #0\n" 284 "vmov.i32 q1, #0\n" 285 "vmov.i32 q2, #0\n" 286 "vmov.i32 q3, q0\n" 287 288 // General NxM lanes loop. 289 "1:" 290 291 // Subtract counter. 292 "subs %[count], %[count], #8\n" 293 294 "vld1.32 {d8}, [%[lhs]:64]!\n" 295 "vld1.32 {d9, d10, d11, d12}, [%[rhs]:64]!\n" 296 "pld [%[lhs], #64]\n" 297 "pld [%[rhs], #64]\n" 298 "vmull.u8 q7, d9, d8\n" 299 "vmull.u8 q8, d10, d8\n" 300 "vmull.u8 q9, d11, d8\n" 301 "vmull.u8 q10, d12, d8\n" 302 "vpadal.u16 q0, q7\n" 303 "vpadal.u16 q1, q8\n" 304 "vpadal.u16 q2, q9\n" 305 "vpadal.u16 q3, q10\n" 306 307 // Loop break. 308 "bgt 1b\n" 309 310 // StaticQuantization::Prepare 311 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" 312 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" 313 "vdup.32 q6, %[multiplicative_offset]\n" 314 "vdup.32 q7, %[rounding_offset]\n" 315 "vdup.32 q8, %[shift]\n" 316 "vdup.32 q4, d8[0]\n" 317 318 // RowMajorOutput::Prepare 319 320 // Reduce aggregators. 321 "vpadd.u32 d0, d0, d1\n" 322 "vpadd.u32 d2, d2, d3\n" 323 "vpadd.u32 d4, d4, d5\n" 324 "vpadd.u32 d6, d6, d7\n" 325 "vpadd.u32 d0, d0, d2\n" 326 "vpadd.u32 d1, d4, d6\n" 327 328 // StaticQuantization::Transform 329 "vadd.s32 q0, q0, q4\n" 330 "vadd.s32 q0, q0, q5\n" 331 "vmul.i32 q0, q0, q6\n" 332 "vadd.i32 q0, q0, q7\n" 333 "vshl.s32 q0, q0, q8\n" 334 "vqmovn.s32 d0, q0\n" 335 "vqmovun.s16 d0, q0\n" 336 337 // RowMajorOutput::Output 338 "vst1.32 {d0[0]}, [%[result]]!\n" 339 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 340 : [count] "r"(params.kernel.count), 341 [multiplicative_offset] "r"(params.kernel.multiplicative_offset), 342 [shift] "r"(params.kernel.shift), 343 [stride] "r"(params.output_stream.stride), 344 [rounding_offset] "r"(params.kernel.rounding_offset) 345 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", 346 "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", 347 "d21", "cc", "memory"); 348 } 349 350 template <> 351 inline void 352 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 5, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)353 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 354 const FusedKernelParams<QuantizedStaticPreprocessed, 355 RowMajor>& params, 356 uint8_t* result) { 357 #ifdef DEBUG 358 #ifdef DEBUG_METAGEMM_VERBOSE 359 std::cout << __FILE__ << "(" << __LINE__ 360 << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " 361 "QuantizedStaticPreprocessed, RowMajor, 1, 5, 8>::Multiply()" 362 << std::endl 363 << std::flush; 364 #endif 365 #endif 366 asm volatile( 367 "pld [%[lhs]]\n" 368 "pld [%[rhs]]\n" 369 370 // Clear aggregators. 371 "vmov.i32 q0, #0\n" 372 "vmov.i32 q1, #0\n" 373 "vmov.i32 q2, #0\n" 374 "vmov.i32 q3, q0\n" 375 "vmov.i32 q4, q1\n" 376 377 // General 1xM lanes loop. 378 "1:" 379 380 // Subtract counter. 381 "subs %[count], %[count], #8\n" 382 383 "vld1.32 {d10, d11, d12, d13}, [%[rhs]:64]!\n" 384 "vld1.32 {d14}, [%[lhs]:64]!\n" 385 "pld [%[lhs], #64]\n" 386 "vmull.u8 q8, d10, d14\n" 387 "vmull.u8 q9, d11, d14\n" 388 "vmull.u8 q10, d12, d14\n" 389 "vmull.u8 q11, d13, d14\n" 390 "vld1.32 {d10}, [%[rhs]:64]!\n" 391 "pld [%[rhs], #128]\n" 392 "vpadal.u16 q0, q8\n" 393 "vpadal.u16 q1, q9\n" 394 "vpadal.u16 q2, q10\n" 395 "vpadal.u16 q3, q11\n" 396 "vmull.u8 q8, d10, d14\n" 397 "vpadal.u16 q4, q8\n" 398 399 // Loop break. 400 "bgt 1b\n" 401 402 // StaticQuantization::Prepare 403 "vld1.32 {d10, d11}, [%[lhs]:64]!\n" 404 "vld1.32 {d12, d13, d14, d15}, [%[rhs]:64]!\n" 405 "vdup.32 q8, %[multiplicative_offset]\n" 406 "vdup.32 q9, %[rounding_offset]\n" 407 "vdup.32 q10, %[shift]\n" 408 "vdup.32 q5, d10[0]\n" 409 410 // RowMajorOutput::Prepare 411 412 // Reduce aggregators. 413 "vpadd.u32 d0, d0, d1\n" 414 "vpadd.u32 d2, d2, d3\n" 415 "vpadd.u32 d4, d4, d5\n" 416 "vpadd.u32 d6, d6, d7\n" 417 "vpadd.u32 d8, d8, d9\n" 418 "vpadd.u32 d0, d0, d2\n" 419 "vpadd.u32 d1, d4, d6\n" 420 "vpadd.u32 d2, d8, d8\n" 421 422 // StaticQuantization::Transform 423 "vadd.s32 q0, q0, q5\n" 424 "vadd.s32 q1, q1, q5\n" 425 "vadd.s32 q0, q0, q6\n" 426 "vadd.s32 q1, q1, q7\n" 427 "vmul.i32 q0, q0, q8\n" 428 "vmul.i32 q1, q1, q8\n" 429 "vadd.i32 q0, q0, q9\n" 430 "vadd.i32 q1, q1, q9\n" 431 "vshl.s32 q0, q0, q10\n" 432 "vshl.s32 q1, q1, q10\n" 433 "vqmovn.s32 d0, q0\n" 434 "vqmovn.s32 d1, q1\n" 435 "vqmovun.s16 d0, q0\n" 436 437 // RowMajorOutput::Output 438 "vst1.32 {d0[0]}, [%[result]]!\n" 439 "vst1.8 {d0[4]}, [%[result]]!\n" 440 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 441 : [count] "r"(params.kernel.count), 442 [multiplicative_offset] "r"(params.kernel.multiplicative_offset), 443 [shift] "r"(params.kernel.shift), 444 [stride] "r"(params.output_stream.stride), 445 [rounding_offset] "r"(params.kernel.rounding_offset) 446 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", 447 "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", 448 "d21", "d22", "d23", "cc", "memory"); 449 } 450 451 template <> 452 inline void 453 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 6, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)454 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 455 const FusedKernelParams<QuantizedStaticPreprocessed, 456 RowMajor>& params, 457 uint8_t* result) { 458 #ifdef DEBUG 459 #ifdef DEBUG_METAGEMM_VERBOSE 460 std::cout << __FILE__ << "(" << __LINE__ 461 << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " 462 "QuantizedStaticPreprocessed, RowMajor, 1, 6, 8>::Multiply()" 463 << std::endl 464 << std::flush; 465 #endif 466 #endif 467 asm volatile( 468 "pld [%[lhs]]\n" 469 "pld [%[rhs]]\n" 470 471 // Clear aggregators. 472 "vmov.i32 q0, #0\n" 473 "vmov.i32 q1, #0\n" 474 "vmov.i32 q2, #0\n" 475 "vmov.i32 q3, q0\n" 476 "vmov.i32 q4, q1\n" 477 "vmov.i32 q5, q2\n" 478 479 // General 1xM lanes loop. 480 "1:" 481 482 // Subtract counter. 483 "subs %[count], %[count], #8\n" 484 485 "vld1.32 {d12, d13, d14, d15}, [%[rhs]:64]!\n" 486 "vld1.32 {d16}, [%[lhs]:64]!\n" 487 "pld [%[lhs], #64]\n" 488 "vmull.u8 q9, d12, d16\n" 489 "vmull.u8 q10, d13, d16\n" 490 "vmull.u8 q11, d14, d16\n" 491 "vmull.u8 q12, d15, d16\n" 492 "vld1.32 {d12, d13}, [%[rhs]:64]!\n" 493 "pld [%[rhs], #128]\n" 494 "vpadal.u16 q0, q9\n" 495 "vpadal.u16 q1, q10\n" 496 "vpadal.u16 q2, q11\n" 497 "vpadal.u16 q3, q12\n" 498 "vmull.u8 q9, d12, d16\n" 499 "vmull.u8 q10, d13, d16\n" 500 "vpadal.u16 q4, q9\n" 501 "vpadal.u16 q5, q10\n" 502 503 // Loop break. 504 "bgt 1b\n" 505 506 // StaticQuantization::Prepare 507 "vld1.32 {d12, d13}, [%[lhs]:64]!\n" 508 "vld1.32 {d14, d15, d16, d17}, [%[rhs]:64]!\n" 509 "vdup.32 q9, %[multiplicative_offset]\n" 510 "vdup.32 q10, %[rounding_offset]\n" 511 "vdup.32 q11, %[shift]\n" 512 "vdup.32 q6, d12[0]\n" 513 514 // RowMajorOutput::Prepare 515 516 // Reduce aggregators. 517 "vpadd.u32 d0, d0, d1\n" 518 "vpadd.u32 d2, d2, d3\n" 519 "vpadd.u32 d4, d4, d5\n" 520 "vpadd.u32 d6, d6, d7\n" 521 "vpadd.u32 d8, d8, d9\n" 522 "vpadd.u32 d10, d10, d11\n" 523 "vpadd.u32 d0, d0, d2\n" 524 "vpadd.u32 d1, d4, d6\n" 525 "vpadd.u32 d2, d8, d10\n" 526 527 // StaticQuantization::Transform 528 "vadd.s32 q0, q0, q6\n" 529 "vadd.s32 q1, q1, q6\n" 530 "vadd.s32 q0, q0, q7\n" 531 "vadd.s32 q1, q1, q8\n" 532 "vmul.i32 q0, q0, q9\n" 533 "vmul.i32 q1, q1, q9\n" 534 "vadd.i32 q0, q0, q10\n" 535 "vadd.i32 q1, q1, q10\n" 536 "vshl.s32 q0, q0, q11\n" 537 "vshl.s32 q1, q1, q11\n" 538 "vqmovn.s32 d0, q0\n" 539 "vqmovn.s32 d1, q1\n" 540 "vqmovun.s16 d0, q0\n" 541 542 // RowMajorOutput::Output 543 "vst1.32 {d0[0]}, [%[result]]!\n" 544 "vst1.16 {d0[2]}, [%[result]]!\n" 545 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 546 : [count] "r"(params.kernel.count), 547 [multiplicative_offset] "r"(params.kernel.multiplicative_offset), 548 [shift] "r"(params.kernel.shift), 549 [stride] "r"(params.output_stream.stride), 550 [rounding_offset] "r"(params.kernel.rounding_offset) 551 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", 552 "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", 553 "d21", "d22", "d23", "d24", "d25", "cc", "memory"); 554 } 555 556 template <> 557 inline void 558 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 7, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)559 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 560 const FusedKernelParams<QuantizedStaticPreprocessed, 561 RowMajor>& params, 562 uint8_t* result) { 563 #ifdef DEBUG 564 #ifdef DEBUG_METAGEMM_VERBOSE 565 std::cout << __FILE__ << "(" << __LINE__ 566 << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " 567 "QuantizedStaticPreprocessed, RowMajor, 1, 7, 8>::Multiply()" 568 << std::endl 569 << std::flush; 570 #endif 571 #endif 572 asm volatile( 573 "pld [%[lhs]]\n" 574 "pld [%[rhs]]\n" 575 576 // Clear aggregators. 577 "vmov.i32 q0, #0\n" 578 "vmov.i32 q1, #0\n" 579 "vmov.i32 q2, #0\n" 580 "vmov.i32 q3, q0\n" 581 "vmov.i32 q4, q1\n" 582 "vmov.i32 q5, q2\n" 583 "vmov.i32 q6, q3\n" 584 585 // General 1xM lanes loop. 586 "1:" 587 588 // Subtract counter. 589 "subs %[count], %[count], #8\n" 590 591 "vld1.32 {d14, d15, d16, d17}, [%[rhs]:64]!\n" 592 "vld1.32 {d18}, [%[lhs]:64]!\n" 593 "pld [%[lhs], #64]\n" 594 "vmull.u8 q10, d14, d18\n" 595 "vmull.u8 q11, d15, d18\n" 596 "vmull.u8 q12, d16, d18\n" 597 "vmull.u8 q13, d17, d18\n" 598 "vld1.32 {d14, d15, d16}, [%[rhs]:64]!\n" 599 "pld [%[rhs], #128]\n" 600 "vpadal.u16 q0, q10\n" 601 "vpadal.u16 q1, q11\n" 602 "vpadal.u16 q2, q12\n" 603 "vpadal.u16 q3, q13\n" 604 "vmull.u8 q10, d14, d18\n" 605 "vmull.u8 q11, d15, d18\n" 606 "vmull.u8 q12, d16, d18\n" 607 "vpadal.u16 q4, q10\n" 608 "vpadal.u16 q5, q11\n" 609 "vpadal.u16 q6, q12\n" 610 611 // Loop break. 612 "bgt 1b\n" 613 614 // StaticQuantization::Prepare 615 "vld1.32 {d14, d15}, [%[lhs]:64]!\n" 616 "vld1.32 {d16, d17, d18, d19}, [%[rhs]:64]!\n" 617 "vdup.32 q10, %[multiplicative_offset]\n" 618 "vdup.32 q11, %[rounding_offset]\n" 619 "vdup.32 q12, %[shift]\n" 620 "vdup.32 q7, d14[0]\n" 621 622 // RowMajorOutput::Prepare 623 624 // Reduce aggregators. 625 "vpadd.u32 d0, d0, d1\n" 626 "vpadd.u32 d2, d2, d3\n" 627 "vpadd.u32 d4, d4, d5\n" 628 "vpadd.u32 d6, d6, d7\n" 629 "vpadd.u32 d8, d8, d9\n" 630 "vpadd.u32 d10, d10, d11\n" 631 "vpadd.u32 d12, d12, d13\n" 632 "vpadd.u32 d0, d0, d2\n" 633 "vpadd.u32 d1, d4, d6\n" 634 "vpadd.u32 d2, d8, d10\n" 635 "vpadd.u32 d3, d12, d12\n" 636 637 // StaticQuantization::Transform 638 "vadd.s32 q0, q0, q7\n" 639 "vadd.s32 q1, q1, q7\n" 640 "vadd.s32 q0, q0, q8\n" 641 "vadd.s32 q1, q1, q9\n" 642 "vmul.i32 q0, q0, q10\n" 643 "vmul.i32 q1, q1, q10\n" 644 "vadd.i32 q0, q0, q11\n" 645 "vadd.i32 q1, q1, q11\n" 646 "vshl.s32 q0, q0, q12\n" 647 "vshl.s32 q1, q1, q12\n" 648 "vqmovn.s32 d0, q0\n" 649 "vqmovn.s32 d1, q1\n" 650 "vqmovun.s16 d0, q0\n" 651 652 // RowMajorOutput::Output 653 "vst1.32 {d0[0]}, [%[result]]!\n" 654 "vst1.16 {d0[2]}, [%[result]]!\n" 655 "vst1.8 {d0[6]}, [%[result]]!\n" 656 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 657 : [count] "r"(params.kernel.count), 658 [multiplicative_offset] "r"(params.kernel.multiplicative_offset), 659 [shift] "r"(params.kernel.shift), 660 [stride] "r"(params.output_stream.stride), 661 [rounding_offset] "r"(params.kernel.rounding_offset) 662 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", 663 "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", 664 "d21", "d22", "d23", "d24", "d25", "d26", "d27", "cc", "memory"); 665 } 666 667 template <> 668 inline void 669 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 8, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)670 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 671 const FusedKernelParams<QuantizedStaticPreprocessed, 672 RowMajor>& params, 673 uint8_t* result) { 674 #ifdef DEBUG 675 #ifdef DEBUG_METAGEMM_VERBOSE 676 std::cout << __FILE__ << "(" << __LINE__ 677 << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " 678 "QuantizedStaticPreprocessed, RowMajor, 1, 8, 8>::Multiply()" 679 << std::endl 680 << std::flush; 681 #endif 682 #endif 683 asm volatile( 684 "pld [%[lhs]]\n" 685 "pld [%[rhs]]\n" 686 687 // Clear aggregators. 688 "vmov.i32 q0, #0\n" 689 "vmov.i32 q1, #0\n" 690 "vmov.i32 q2, #0\n" 691 "vmov.i32 q3, q0\n" 692 "vmov.i32 q4, q1\n" 693 "vmov.i32 q5, q2\n" 694 "vmov.i32 q6, q3\n" 695 "vmov.i32 q7, q4\n" 696 697 // 1x8 lanes loop. 698 "1:" 699 700 "vld1.32 {d17, d18, d19, d20}, [%[rhs]:256]!\n" 701 "vld1.32 {d16}, [%[lhs]:64]!\n" 702 "vmull.u8 q11, d16, d17\n" 703 "vmull.u8 q12, d16, d18\n" 704 "vmull.u8 q13, d16, d19\n" 705 "vmull.u8 q14, d16, d20\n" 706 "vld1.32 {d17, d18, d19, d20}, [%[rhs]:256]!\n" 707 "vpadal.u16 q0, q11\n" 708 "vpadal.u16 q1, q12\n" 709 "vpadal.u16 q2, q13\n" 710 "vpadal.u16 q3, q14\n" 711 "pld [%[rhs], #256]\n" 712 "vmull.u8 q15, d16, d17\n" 713 "vmull.u8 q11, d16, d18\n" 714 "vmull.u8 q12, d16, d19\n" 715 "vmull.u8 q13, d16, d20\n" 716 "pld [%[lhs], #32]\n" 717 718 // Subtract counter. 719 "subs %[count], %[count], #8\n" 720 721 "vpadal.u16 q4, q15\n" 722 "vpadal.u16 q5, q11\n" 723 "vpadal.u16 q6, q12\n" 724 "vpadal.u16 q7, q13\n" 725 726 // Loop break. 727 "bgt 1b\n" 728 729 // StaticQuantization::Prepare 730 "vld1.32 {d16, d17}, [%[lhs]:64]!\n" 731 "vld1.32 {d18, d19, d20, d21}, [%[rhs]:64]!\n" 732 "vdup.32 q11, %[multiplicative_offset]\n" 733 "vdup.32 q12, %[rounding_offset]\n" 734 "vdup.32 q13, %[shift]\n" 735 "vdup.32 q8, d16[0]\n" 736 737 // RowMajorOutput::Prepare 738 739 // Reduce aggregators. 740 "vpadd.u32 d0, d0, d1\n" 741 "vpadd.u32 d2, d2, d3\n" 742 "vpadd.u32 d4, d4, d5\n" 743 "vpadd.u32 d6, d6, d7\n" 744 "vpadd.u32 d8, d8, d9\n" 745 "vpadd.u32 d10, d10, d11\n" 746 "vpadd.u32 d12, d12, d13\n" 747 "vpadd.u32 d14, d14, d15\n" 748 "vpadd.u32 d0, d0, d2\n" 749 "vpadd.u32 d1, d4, d6\n" 750 "vpadd.u32 d2, d8, d10\n" 751 "vpadd.u32 d3, d12, d14\n" 752 753 // StaticQuantization::Transform 754 "vadd.s32 q0, q0, q8\n" 755 "vadd.s32 q1, q1, q8\n" 756 "vadd.s32 q0, q0, q9\n" 757 "vadd.s32 q1, q1, q10\n" 758 "vmul.i32 q0, q0, q11\n" 759 "vmul.i32 q1, q1, q11\n" 760 "vadd.i32 q0, q0, q12\n" 761 "vadd.i32 q1, q1, q12\n" 762 "vshl.s32 q0, q0, q13\n" 763 "vshl.s32 q1, q1, q13\n" 764 "vqmovn.s32 d0, q0\n" 765 "vqmovn.s32 d1, q1\n" 766 "vqmovun.s16 d0, q0\n" 767 768 // RowMajorOutput::Output 769 "vst1.32 {d0}, [%[result]]!\n" 770 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 771 : [count] "r"(params.kernel.count), 772 [multiplicative_offset] "r"(params.kernel.multiplicative_offset), 773 [shift] "r"(params.kernel.shift), 774 [stride] "r"(params.output_stream.stride), 775 [rounding_offset] "r"(params.kernel.rounding_offset) 776 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", 777 "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", 778 "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", 779 "d31", "cc", "memory"); 780 } 781 782 template <> 783 inline void 784 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 2, 1, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)785 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 786 const FusedKernelParams<QuantizedStaticPreprocessed, 787 RowMajor>& params, 788 uint8_t* result) { 789 #ifdef DEBUG 790 #ifdef DEBUG_METAGEMM_VERBOSE 791 std::cout << __FILE__ << "(" << __LINE__ 792 << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " 793 "QuantizedStaticPreprocessed, RowMajor, 2, 1, 8>::Multiply()" 794 << std::endl 795 << std::flush; 796 #endif 797 #endif 798 asm volatile( 799 "pld [%[lhs]]\n" 800 "pld [%[rhs]]\n" 801 802 // Clear aggregators. 803 "vmov.i32 q0, #0\n" 804 "vmov.i32 q1, #0\n" 805 806 // General NxM lanes loop. 807 "1:" 808 809 // Subtract counter. 810 "subs %[count], %[count], #8\n" 811 812 "vld1.32 {d4, d5}, [%[lhs]:64]!\n" 813 "vld1.32 {d6}, [%[rhs]:64]!\n" 814 "pld [%[lhs], #64]\n" 815 "pld [%[rhs], #64]\n" 816 "vmull.u8 q4, d6, d4\n" 817 "vmull.u8 q5, d6, d5\n" 818 "vpadal.u16 q0, q4\n" 819 "vpadal.u16 q1, q5\n" 820 821 // Loop break. 822 "bgt 1b\n" 823 824 // StaticQuantization::Prepare 825 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" 826 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" 827 "vdup.32 q6, %[multiplicative_offset]\n" 828 "vdup.32 q7, %[rounding_offset]\n" 829 "vdup.32 q8, %[shift]\n" 830 "vdup.32 q2, d8[0]\n" 831 "vdup.32 q4, d8[1]\n" 832 833 // RowMajorOutput::Prepare 834 "add r0, %[result], %[stride]\n" 835 836 // Reduce aggregators. 837 "vpadd.u32 d0, d0, d1\n" 838 "vpadd.u32 d0, d0, d0\n" 839 "vpadd.u32 d2, d2, d3\n" 840 "vpadd.u32 d2, d2, d2\n" 841 842 // StaticQuantization::Transform 843 "vadd.s32 q0, q0, q2\n" 844 "vadd.s32 q1, q1, q4\n" 845 "vadd.s32 q0, q0, q5\n" 846 "vadd.s32 q1, q1, q5\n" 847 "vmul.i32 q0, q0, q6\n" 848 "vmul.i32 q1, q1, q6\n" 849 "vadd.i32 q0, q0, q7\n" 850 "vadd.i32 q1, q1, q7\n" 851 "vshl.s32 q0, q0, q8\n" 852 "vshl.s32 q1, q1, q8\n" 853 "vqmovn.s32 d0, q0\n" 854 "vqmovn.s32 d2, q1\n" 855 "vqmovun.s16 d0, q0\n" 856 "vqmovun.s16 d2, q1\n" 857 858 // RowMajorOutput::Output 859 "vst1.8 {d0[0]}, [%[result]]!\n" 860 "vst1.8 {d2[0]}, [r0]!\n" 861 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 862 : [count] "r"(params.kernel.count), 863 [multiplicative_offset] "r"(params.kernel.multiplicative_offset), 864 [shift] "r"(params.kernel.shift), 865 [stride] "r"(params.output_stream.stride), 866 [rounding_offset] "r"(params.kernel.rounding_offset) 867 : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d8", "d9", "d10", 868 "d11", "d12", "d13", "d14", "d15", "d16", "d17", "cc", "memory"); 869 } 870 871 template <> 872 inline void 873 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 2, 2, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)874 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 875 const FusedKernelParams<QuantizedStaticPreprocessed, 876 RowMajor>& params, 877 uint8_t* result) { 878 #ifdef DEBUG 879 #ifdef DEBUG_METAGEMM_VERBOSE 880 std::cout << __FILE__ << "(" << __LINE__ 881 << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " 882 "QuantizedStaticPreprocessed, RowMajor, 2, 2, 8>::Multiply()" 883 << std::endl 884 << std::flush; 885 #endif 886 #endif 887 asm volatile( 888 "pld [%[lhs]]\n" 889 "pld [%[rhs]]\n" 890 891 // Clear aggregators. 892 "vmov.i32 q0, #0\n" 893 "vmov.i32 q1, #0\n" 894 "vmov.i32 q2, #0\n" 895 "vmov.i32 q3, q0\n" 896 897 // General NxM lanes loop. 898 "1:" 899 900 // Subtract counter. 901 "subs %[count], %[count], #8\n" 902 903 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" 904 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" 905 "pld [%[lhs], #64]\n" 906 "pld [%[rhs], #64]\n" 907 "vmull.u8 q6, d10, d8\n" 908 "vmull.u8 q7, d11, d8\n" 909 "vmull.u8 q8, d10, d9\n" 910 "vmull.u8 q9, d11, d9\n" 911 "vpadal.u16 q0, q6\n" 912 "vpadal.u16 q1, q7\n" 913 "vpadal.u16 q2, q8\n" 914 "vpadal.u16 q3, q9\n" 915 916 // Loop break. 917 "bgt 1b\n" 918 919 // StaticQuantization::Prepare 920 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" 921 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" 922 "vdup.32 q6, %[multiplicative_offset]\n" 923 "vdup.32 q7, %[rounding_offset]\n" 924 "vdup.32 q8, %[shift]\n" 925 "vdup.32 q9, d8[0]\n" 926 "vdup.32 q4, d8[1]\n" 927 928 // RowMajorOutput::Prepare 929 "add r0, %[result], %[stride]\n" 930 931 // Reduce aggregators. 932 "vpadd.u32 d0, d0, d1\n" 933 "vpadd.u32 d2, d2, d3\n" 934 "vpadd.u32 d0, d0, d2\n" 935 "vpadd.u32 d4, d4, d5\n" 936 "vpadd.u32 d6, d6, d7\n" 937 "vpadd.u32 d4, d4, d6\n" 938 939 // StaticQuantization::Transform 940 "vadd.s32 q0, q0, q9\n" 941 "vadd.s32 q2, q2, q4\n" 942 "vadd.s32 q0, q0, q5\n" 943 "vadd.s32 q2, q2, q5\n" 944 "vmul.i32 q0, q0, q6\n" 945 "vmul.i32 q2, q2, q6\n" 946 "vadd.i32 q0, q0, q7\n" 947 "vadd.i32 q2, q2, q7\n" 948 "vshl.s32 q0, q0, q8\n" 949 "vshl.s32 q2, q2, q8\n" 950 "vqmovn.s32 d0, q0\n" 951 "vqmovn.s32 d4, q2\n" 952 "vqmovun.s16 d0, q0\n" 953 "vqmovun.s16 d4, q2\n" 954 955 // RowMajorOutput::Output 956 "vst1.16 {d0[0]}, [%[result]]!\n" 957 "vst1.16 {d4[0]}, [r0]!\n" 958 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 959 : [count] "r"(params.kernel.count), 960 [multiplicative_offset] "r"(params.kernel.multiplicative_offset), 961 [shift] "r"(params.kernel.shift), 962 [stride] "r"(params.output_stream.stride), 963 [rounding_offset] "r"(params.kernel.rounding_offset) 964 : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", 965 "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "cc", 966 "memory"); 967 } 968 969 template <> 970 inline void 971 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 2, 3, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)972 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 973 const FusedKernelParams<QuantizedStaticPreprocessed, 974 RowMajor>& params, 975 uint8_t* result) { 976 #ifdef DEBUG 977 #ifdef DEBUG_METAGEMM_VERBOSE 978 std::cout << __FILE__ << "(" << __LINE__ 979 << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " 980 "QuantizedStaticPreprocessed, RowMajor, 2, 3, 8>::Multiply()" 981 << std::endl 982 << std::flush; 983 #endif 984 #endif 985 asm volatile( 986 "pld [%[lhs]]\n" 987 "pld [%[rhs]]\n" 988 989 // Clear aggregators. 990 "vmov.i32 q0, #0\n" 991 "vmov.i32 q1, #0\n" 992 "vmov.i32 q2, #0\n" 993 "vmov.i32 q3, q0\n" 994 "vmov.i32 q4, q1\n" 995 "vmov.i32 q5, q2\n" 996 997 // General NxM lanes loop. 998 "1:" 999 1000 // Subtract counter. 1001 "subs %[count], %[count], #8\n" 1002 1003 "vld1.32 {d12, d13}, [%[lhs]:64]!\n" 1004 "vld1.32 {d14, d15, d16}, [%[rhs]:64]!\n" 1005 "pld [%[lhs], #64]\n" 1006 "pld [%[rhs], #64]\n" 1007 "vmull.u8 q9, d14, d12\n" 1008 "vmull.u8 q10, d15, d12\n" 1009 "vmull.u8 q11, d16, d12\n" 1010 "vmull.u8 q12, d14, d13\n" 1011 "vmull.u8 q13, d15, d13\n" 1012 "vmull.u8 q14, d16, d13\n" 1013 "vpadal.u16 q0, q9\n" 1014 "vpadal.u16 q1, q10\n" 1015 "vpadal.u16 q2, q11\n" 1016 "vpadal.u16 q3, q12\n" 1017 "vpadal.u16 q4, q13\n" 1018 "vpadal.u16 q5, q14\n" 1019 1020 // Loop break. 1021 "bgt 1b\n" 1022 1023 // StaticQuantization::Prepare 1024 "vld1.32 {d12, d13}, [%[lhs]:64]!\n" 1025 "vld1.32 {d14, d15}, [%[rhs]:64]!\n" 1026 "vdup.32 q8, %[multiplicative_offset]\n" 1027 "vdup.32 q9, %[rounding_offset]\n" 1028 "vdup.32 q10, %[shift]\n" 1029 "vdup.32 q11, d12[0]\n" 1030 "vdup.32 q6, d12[1]\n" 1031 1032 // RowMajorOutput::Prepare 1033 "add r0, %[result], %[stride]\n" 1034 1035 // Reduce aggregators. 1036 "vpadd.u32 d0, d0, d1\n" 1037 "vpadd.u32 d2, d2, d3\n" 1038 "vpadd.u32 d4, d4, d5\n" 1039 "vpadd.u32 d0, d0, d2\n" 1040 "vpadd.u32 d1, d4, d4\n" 1041 "vpadd.u32 d6, d6, d7\n" 1042 "vpadd.u32 d8, d8, d9\n" 1043 "vpadd.u32 d10, d10, d11\n" 1044 "vpadd.u32 d6, d6, d8\n" 1045 "vpadd.u32 d7, d10, d10\n" 1046 1047 // StaticQuantization::Transform 1048 "vadd.s32 q0, q0, q11\n" 1049 "vadd.s32 q3, q3, q6\n" 1050 "vadd.s32 q0, q0, q7\n" 1051 "vadd.s32 q3, q3, q7\n" 1052 "vmul.i32 q0, q0, q8\n" 1053 "vmul.i32 q3, q3, q8\n" 1054 "vadd.i32 q0, q0, q9\n" 1055 "vadd.i32 q3, q3, q9\n" 1056 "vshl.s32 q0, q0, q10\n" 1057 "vshl.s32 q3, q3, q10\n" 1058 "vqmovn.s32 d0, q0\n" 1059 "vqmovn.s32 d6, q3\n" 1060 "vqmovun.s16 d0, q0\n" 1061 "vqmovun.s16 d6, q3\n" 1062 1063 // RowMajorOutput::Output 1064 "vst1.16 {d0[0]}, [%[result]]!\n" 1065 "vst1.8 {d0[2]}, [%[result]]!\n" 1066 "vst1.16 {d6[0]}, [r0]!\n" 1067 "vst1.8 {d6[2]}, [r0]!\n" 1068 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 1069 : [count] "r"(params.kernel.count), 1070 [multiplicative_offset] "r"(params.kernel.multiplicative_offset), 1071 [shift] "r"(params.kernel.shift), 1072 [stride] "r"(params.output_stream.stride), 1073 [rounding_offset] "r"(params.kernel.rounding_offset) 1074 : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", 1075 "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", 1076 "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "cc", 1077 "memory"); 1078 } 1079 1080 template <> 1081 inline void 1082 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 2, 4, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)1083 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 1084 const FusedKernelParams<QuantizedStaticPreprocessed, 1085 RowMajor>& params, 1086 uint8_t* result) { 1087 #ifdef DEBUG 1088 #ifdef DEBUG_METAGEMM_VERBOSE 1089 std::cout << __FILE__ << "(" << __LINE__ 1090 << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " 1091 "QuantizedStaticPreprocessed, RowMajor, 2, 4, 8>::Multiply()" 1092 << std::endl 1093 << std::flush; 1094 #endif 1095 #endif 1096 asm volatile( 1097 "pld [%[lhs]]\n" 1098 "pld [%[rhs]]\n" 1099 1100 // Clear aggregators. 1101 "vmov.i32 q0, #0\n" 1102 "vmov.i32 q1, #0\n" 1103 "vmov.i32 q2, #0\n" 1104 "vmov.i32 q3, q0\n" 1105 "vmov.i32 q4, q1\n" 1106 "vmov.i32 q5, q2\n" 1107 "vmov.i32 q6, q3\n" 1108 "vmov.i32 q7, q4\n" 1109 1110 // 2x4 lanes loop. 1111 "1:" 1112 1113 "vld1.8 {d18, d19, d20, d21}, [%[rhs]:256]!\n" 1114 "vld1.8 {d16}, [%[lhs]:64]!\n" 1115 "vmull.u8 q11, d16, d18\n" 1116 "vld1.8 {d17}, [%[lhs]:64]!\n" 1117 "vmull.u8 q12, d16, d19\n" 1118 "pld [%[rhs], #64]\n" 1119 "vmull.u8 q13, d16, d20\n" 1120 "pld [%[lhs], #64]\n" 1121 "vmull.u8 q14, d16, d21\n" 1122 "vmull.u8 q15, d17, d18\n" 1123 "vpadal.u16 q0, q11\n" 1124 "vpadal.u16 q1, q12\n" 1125 "vpadal.u16 q2, q13\n" 1126 "vmull.u8 q11, d17, d19\n" 1127 "vmull.u8 q12, d17, d20\n" 1128 "vmull.u8 q13, d17, d21\n" 1129 1130 // Subtract counter. 1131 "subs %[count], %[count], #8\n" 1132 1133 "vpadal.u16 q3, q14\n" 1134 "vpadal.u16 q4, q15\n" 1135 "vpadal.u16 q5, q11\n" 1136 "vpadal.u16 q6, q12\n" 1137 "vpadal.u16 q7, q13\n" 1138 1139 // Loop break. 1140 "bgt 1b\n" 1141 1142 // StaticQuantization::Prepare 1143 "vld1.32 {d16, d17}, [%[lhs]:64]!\n" 1144 "vld1.32 {d18, d19}, [%[rhs]:64]!\n" 1145 "vdup.32 q10, %[multiplicative_offset]\n" 1146 "vdup.32 q11, %[rounding_offset]\n" 1147 "vdup.32 q12, %[shift]\n" 1148 "vdup.32 q13, d16[0]\n" 1149 "vdup.32 q8, d16[1]\n" 1150 1151 // RowMajorOutput::Prepare 1152 "add r0, %[result], %[stride]\n" 1153 1154 // Reduce aggregators. 1155 "vpadd.u32 d0, d0, d1\n" 1156 "vpadd.u32 d2, d2, d3\n" 1157 "vpadd.u32 d4, d4, d5\n" 1158 "vpadd.u32 d6, d6, d7\n" 1159 "vpadd.u32 d0, d0, d2\n" 1160 "vpadd.u32 d1, d4, d6\n" 1161 "vpadd.u32 d8, d8, d9\n" 1162 "vpadd.u32 d10, d10, d11\n" 1163 "vpadd.u32 d12, d12, d13\n" 1164 "vpadd.u32 d14, d14, d15\n" 1165 "vpadd.u32 d8, d8, d10\n" 1166 "vpadd.u32 d9, d12, d14\n" 1167 1168 // StaticQuantization::Transform 1169 "vadd.s32 q0, q0, q13\n" 1170 "vadd.s32 q4, q4, q8\n" 1171 "vadd.s32 q0, q0, q9\n" 1172 "vadd.s32 q4, q4, q9\n" 1173 "vmul.i32 q0, q0, q10\n" 1174 "vmul.i32 q4, q4, q10\n" 1175 "vadd.i32 q0, q0, q11\n" 1176 "vadd.i32 q4, q4, q11\n" 1177 "vshl.s32 q0, q0, q12\n" 1178 "vshl.s32 q4, q4, q12\n" 1179 "vqmovn.s32 d0, q0\n" 1180 "vqmovn.s32 d8, q4\n" 1181 "vqmovun.s16 d0, q0\n" 1182 "vqmovun.s16 d8, q4\n" 1183 1184 // RowMajorOutput::Output 1185 "vst1.32 {d0[0]}, [%[result]]!\n" 1186 "vst1.32 {d8[0]}, [r0]!\n" 1187 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 1188 : [count] "r"(params.kernel.count), 1189 [multiplicative_offset] "r"(params.kernel.multiplicative_offset), 1190 [shift] "r"(params.kernel.shift), 1191 [stride] "r"(params.output_stream.stride), 1192 [rounding_offset] "r"(params.kernel.rounding_offset) 1193 : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", 1194 "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", 1195 "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", 1196 "d31", "cc", "memory"); 1197 } 1198 1199 template <> 1200 inline void 1201 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 3, 1, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)1202 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 1203 const FusedKernelParams<QuantizedStaticPreprocessed, 1204 RowMajor>& params, 1205 uint8_t* result) { 1206 #ifdef DEBUG 1207 #ifdef DEBUG_METAGEMM_VERBOSE 1208 std::cout << __FILE__ << "(" << __LINE__ 1209 << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " 1210 "QuantizedStaticPreprocessed, RowMajor, 3, 1, 8>::Multiply()" 1211 << std::endl 1212 << std::flush; 1213 #endif 1214 #endif 1215 asm volatile( 1216 "pld [%[lhs]]\n" 1217 "pld [%[rhs]]\n" 1218 1219 // Clear aggregators. 1220 "vmov.i32 q0, #0\n" 1221 "vmov.i32 q1, #0\n" 1222 "vmov.i32 q2, #0\n" 1223 1224 // General NxM lanes loop. 1225 "1:" 1226 1227 // Subtract counter. 1228 "subs %[count], %[count], #8\n" 1229 1230 "vld1.32 {d6, d7, d8}, [%[lhs]:64]!\n" 1231 "vld1.32 {d9}, [%[rhs]:64]!\n" 1232 "pld [%[lhs], #64]\n" 1233 "pld [%[rhs], #64]\n" 1234 "vmull.u8 q5, d9, d6\n" 1235 "vmull.u8 q6, d9, d7\n" 1236 "vmull.u8 q7, d9, d8\n" 1237 "vpadal.u16 q0, q5\n" 1238 "vpadal.u16 q1, q6\n" 1239 "vpadal.u16 q2, q7\n" 1240 1241 // Loop break. 1242 "bgt 1b\n" 1243 1244 // StaticQuantization::Prepare 1245 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" 1246 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" 1247 "vdup.32 q6, %[multiplicative_offset]\n" 1248 "vdup.32 q7, %[rounding_offset]\n" 1249 "vdup.32 q8, %[shift]\n" 1250 "vdup.32 q3, d8[0]\n" 1251 "vdup.32 q9, d8[1]\n" 1252 "vdup.32 q4, d9[0]\n" 1253 1254 // RowMajorOutput::Prepare 1255 "add r0, %[result], %[stride]\n" 1256 "add r1, r0, %[stride]\n" 1257 1258 // Reduce aggregators. 1259 "vpadd.u32 d0, d0, d1\n" 1260 "vpadd.u32 d0, d0, d0\n" 1261 "vpadd.u32 d2, d2, d3\n" 1262 "vpadd.u32 d2, d2, d2\n" 1263 "vpadd.u32 d4, d4, d5\n" 1264 "vpadd.u32 d4, d4, d4\n" 1265 1266 // StaticQuantization::Transform 1267 "vadd.s32 q0, q0, q3\n" 1268 "vadd.s32 q1, q1, q9\n" 1269 "vadd.s32 q2, q2, q4\n" 1270 "vadd.s32 q0, q0, q5\n" 1271 "vadd.s32 q1, q1, q5\n" 1272 "vadd.s32 q2, q2, q5\n" 1273 "vmul.i32 q0, q0, q6\n" 1274 "vmul.i32 q1, q1, q6\n" 1275 "vmul.i32 q2, q2, q6\n" 1276 "vadd.i32 q0, q0, q7\n" 1277 "vadd.i32 q1, q1, q7\n" 1278 "vadd.i32 q2, q2, q7\n" 1279 "vshl.s32 q0, q0, q8\n" 1280 "vshl.s32 q1, q1, q8\n" 1281 "vshl.s32 q2, q2, q8\n" 1282 "vqmovn.s32 d0, q0\n" 1283 "vqmovn.s32 d2, q1\n" 1284 "vqmovn.s32 d4, q2\n" 1285 "vqmovun.s16 d0, q0\n" 1286 "vqmovun.s16 d2, q1\n" 1287 "vqmovun.s16 d4, q2\n" 1288 1289 // RowMajorOutput::Output 1290 "vst1.8 {d0[0]}, [%[result]]!\n" 1291 "vst1.8 {d2[0]}, [r0]!\n" 1292 "vst1.8 {d4[0]}, [r1]!\n" 1293 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 1294 : [count] "r"(params.kernel.count), 1295 [multiplicative_offset] "r"(params.kernel.multiplicative_offset), 1296 [shift] "r"(params.kernel.shift), 1297 [stride] "r"(params.output_stream.stride), 1298 [rounding_offset] "r"(params.kernel.rounding_offset) 1299 : "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", 1300 "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", 1301 "cc", "memory"); 1302 } 1303 1304 template <> 1305 inline void 1306 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 3, 2, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)1307 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 1308 const FusedKernelParams<QuantizedStaticPreprocessed, 1309 RowMajor>& params, 1310 uint8_t* result) { 1311 #ifdef DEBUG 1312 #ifdef DEBUG_METAGEMM_VERBOSE 1313 std::cout << __FILE__ << "(" << __LINE__ 1314 << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " 1315 "QuantizedStaticPreprocessed, RowMajor, 3, 2, 8>::Multiply()" 1316 << std::endl 1317 << std::flush; 1318 #endif 1319 #endif 1320 asm volatile( 1321 "pld [%[lhs]]\n" 1322 "pld [%[rhs]]\n" 1323 1324 // Clear aggregators. 1325 "vmov.i32 q0, #0\n" 1326 "vmov.i32 q1, #0\n" 1327 "vmov.i32 q2, #0\n" 1328 "vmov.i32 q3, q0\n" 1329 "vmov.i32 q4, q1\n" 1330 "vmov.i32 q5, q2\n" 1331 1332 // General NxM lanes loop. 1333 "1:" 1334 1335 // Subtract counter. 1336 "subs %[count], %[count], #8\n" 1337 1338 "vld1.32 {d12, d13, d14}, [%[lhs]:64]!\n" 1339 "vld1.32 {d15, d16}, [%[rhs]:64]!\n" 1340 "pld [%[lhs], #64]\n" 1341 "pld [%[rhs], #64]\n" 1342 "vmull.u8 q9, d15, d12\n" 1343 "vmull.u8 q10, d16, d12\n" 1344 "vmull.u8 q11, d15, d13\n" 1345 "vmull.u8 q12, d16, d13\n" 1346 "vmull.u8 q13, d15, d14\n" 1347 "vmull.u8 q14, d16, d14\n" 1348 "vpadal.u16 q0, q9\n" 1349 "vpadal.u16 q1, q10\n" 1350 "vpadal.u16 q2, q11\n" 1351 "vpadal.u16 q3, q12\n" 1352 "vpadal.u16 q4, q13\n" 1353 "vpadal.u16 q5, q14\n" 1354 1355 // Loop break. 1356 "bgt 1b\n" 1357 1358 // StaticQuantization::Prepare 1359 "vld1.32 {d12, d13}, [%[lhs]:64]!\n" 1360 "vld1.32 {d14, d15}, [%[rhs]:64]!\n" 1361 "vdup.32 q8, %[multiplicative_offset]\n" 1362 "vdup.32 q9, %[rounding_offset]\n" 1363 "vdup.32 q10, %[shift]\n" 1364 "vdup.32 q11, d12[0]\n" 1365 "vdup.32 q12, d12[1]\n" 1366 "vdup.32 q6, d13[0]\n" 1367 1368 // RowMajorOutput::Prepare 1369 "add r0, %[result], %[stride]\n" 1370 "add r1, r0, %[stride]\n" 1371 1372 // Reduce aggregators. 1373 "vpadd.u32 d0, d0, d1\n" 1374 "vpadd.u32 d2, d2, d3\n" 1375 "vpadd.u32 d0, d0, d2\n" 1376 "vpadd.u32 d4, d4, d5\n" 1377 "vpadd.u32 d6, d6, d7\n" 1378 "vpadd.u32 d4, d4, d6\n" 1379 "vpadd.u32 d8, d8, d9\n" 1380 "vpadd.u32 d10, d10, d11\n" 1381 "vpadd.u32 d8, d8, d10\n" 1382 1383 // StaticQuantization::Transform 1384 "vadd.s32 q0, q0, q11\n" 1385 "vadd.s32 q2, q2, q12\n" 1386 "vadd.s32 q4, q4, q6\n" 1387 "vadd.s32 q0, q0, q7\n" 1388 "vadd.s32 q2, q2, q7\n" 1389 "vadd.s32 q4, q4, q7\n" 1390 "vmul.i32 q0, q0, q8\n" 1391 "vmul.i32 q2, q2, q8\n" 1392 "vmul.i32 q4, q4, q8\n" 1393 "vadd.i32 q0, q0, q9\n" 1394 "vadd.i32 q2, q2, q9\n" 1395 "vadd.i32 q4, q4, q9\n" 1396 "vshl.s32 q0, q0, q10\n" 1397 "vshl.s32 q2, q2, q10\n" 1398 "vshl.s32 q4, q4, q10\n" 1399 "vqmovn.s32 d0, q0\n" 1400 "vqmovn.s32 d4, q2\n" 1401 "vqmovn.s32 d8, q4\n" 1402 "vqmovun.s16 d0, q0\n" 1403 "vqmovun.s16 d4, q2\n" 1404 "vqmovun.s16 d8, q4\n" 1405 1406 // RowMajorOutput::Output 1407 "vst1.16 {d0[0]}, [%[result]]!\n" 1408 "vst1.16 {d4[0]}, [r0]!\n" 1409 "vst1.16 {d8[0]}, [r1]!\n" 1410 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 1411 : [count] "r"(params.kernel.count), 1412 [multiplicative_offset] "r"(params.kernel.multiplicative_offset), 1413 [shift] "r"(params.kernel.shift), 1414 [stride] "r"(params.output_stream.stride), 1415 [rounding_offset] "r"(params.kernel.rounding_offset) 1416 : "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", 1417 "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", 1418 "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", 1419 "cc", "memory"); 1420 } 1421 1422 template <> 1423 inline void 1424 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 3, 3, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)1425 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 1426 const FusedKernelParams<QuantizedStaticPreprocessed, 1427 RowMajor>& params, 1428 uint8_t* result) { 1429 #ifdef DEBUG 1430 #ifdef DEBUG_METAGEMM_VERBOSE 1431 std::cout << __FILE__ << "(" << __LINE__ 1432 << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " 1433 "QuantizedStaticPreprocessed, RowMajor, 3, 3, 8>::Multiply()" 1434 << std::endl 1435 << std::flush; 1436 #endif 1437 #endif 1438 asm volatile( 1439 "pld [%[lhs]]\n" 1440 "pld [%[rhs]]\n" 1441 1442 // Clear aggregators. 1443 "vmov.i32 q0, #0\n" 1444 "vmov.i32 q1, #0\n" 1445 "vmov.i32 q2, #0\n" 1446 "vmov.i32 q3, q0\n" 1447 "vmov.i32 q4, q1\n" 1448 "vmov.i32 q5, q2\n" 1449 "vmov.i32 q6, q3\n" 1450 "vmov.i32 q7, q4\n" 1451 "vmov.i32 q8, q5\n" 1452 1453 // 3x3 lanes loop. 1454 "1:" 1455 1456 "vld1.8 {d21, d22, d23}, [%[rhs]:64]!\n" 1457 "vld1.8 {d18}, [%[lhs]:64]!\n" 1458 "vmull.u8 q12, d18, d21\n" 1459 "vld1.8 {d19}, [%[lhs]:64]!\n" 1460 "vmull.u8 q13, d18, d22\n" 1461 "vld1.8 {d20}, [%[lhs]:64]!\n" 1462 "vmull.u8 q14, d18, d23\n" 1463 "pld [%[lhs], #64]\n" 1464 "vmull.u8 q15, d19, d21\n" 1465 "pld [%[rhs], #64]\n" 1466 "vpadal.u16 q0, q12\n" 1467 "vpadal.u16 q1, q13\n" 1468 "vpadal.u16 q2, q14\n" 1469 "vpadal.u16 q3, q15\n" 1470 "vmull.u8 q12, d19, d22\n" 1471 "vmull.u8 q13, d19, d23\n" 1472 "vmull.u8 q14, d20, d21\n" 1473 "vmull.u8 q15, d20, d22\n" 1474 1475 // Subtract counter. 1476 "subs %[count], %[count], #8\n" 1477 1478 "vmull.u8 q9, d20, d23\n" 1479 "vpadal.u16 q4, q12\n" 1480 "vpadal.u16 q5, q13\n" 1481 "vpadal.u16 q6, q14\n" 1482 "vpadal.u16 q7, q15\n" 1483 "vpadal.u16 q8, q9\n" 1484 1485 // Loop break. 1486 "bgt 1b\n" 1487 1488 // StaticQuantization::Prepare 1489 "vld1.32 {d18, d19}, [%[lhs]:64]!\n" 1490 "vld1.32 {d20, d21}, [%[rhs]:64]!\n" 1491 "vdup.32 q11, %[multiplicative_offset]\n" 1492 "vdup.32 q12, %[rounding_offset]\n" 1493 "vdup.32 q13, %[shift]\n" 1494 "vdup.32 q14, d18[0]\n" 1495 "vdup.32 q15, d18[1]\n" 1496 "vdup.32 q9, d19[0]\n" 1497 1498 // RowMajorOutput::Prepare 1499 "add r0, %[result], %[stride]\n" 1500 "add r1, r0, %[stride]\n" 1501 1502 // Reduce aggregators. 1503 "vpadd.u32 d0, d0, d1\n" 1504 "vpadd.u32 d2, d2, d3\n" 1505 "vpadd.u32 d4, d4, d5\n" 1506 "vpadd.u32 d0, d0, d2\n" 1507 "vpadd.u32 d1, d4, d4\n" 1508 "vpadd.u32 d6, d6, d7\n" 1509 "vpadd.u32 d8, d8, d9\n" 1510 "vpadd.u32 d10, d10, d11\n" 1511 "vpadd.u32 d6, d6, d8\n" 1512 "vpadd.u32 d7, d10, d10\n" 1513 "vpadd.u32 d12, d12, d13\n" 1514 "vpadd.u32 d14, d14, d15\n" 1515 "vpadd.u32 d16, d16, d17\n" 1516 "vpadd.u32 d12, d12, d14\n" 1517 "vpadd.u32 d13, d16, d16\n" 1518 1519 // StaticQuantization::Transform 1520 "vadd.s32 q0, q0, q14\n" 1521 "vadd.s32 q3, q3, q15\n" 1522 "vadd.s32 q6, q6, q9\n" 1523 "vadd.s32 q0, q0, q10\n" 1524 "vadd.s32 q3, q3, q10\n" 1525 "vadd.s32 q6, q6, q10\n" 1526 "vmul.i32 q0, q0, q11\n" 1527 "vmul.i32 q3, q3, q11\n" 1528 "vmul.i32 q6, q6, q11\n" 1529 "vadd.i32 q0, q0, q12\n" 1530 "vadd.i32 q3, q3, q12\n" 1531 "vadd.i32 q6, q6, q12\n" 1532 "vshl.s32 q0, q0, q13\n" 1533 "vshl.s32 q3, q3, q13\n" 1534 "vshl.s32 q6, q6, q13\n" 1535 "vqmovn.s32 d0, q0\n" 1536 "vqmovn.s32 d6, q3\n" 1537 "vqmovn.s32 d12, q6\n" 1538 "vqmovun.s16 d0, q0\n" 1539 "vqmovun.s16 d6, q3\n" 1540 "vqmovun.s16 d12, q6\n" 1541 1542 // RowMajorOutput::Output 1543 "vst1.16 {d0[0]}, [%[result]]!\n" 1544 "vst1.8 {d0[2]}, [%[result]]!\n" 1545 "vst1.16 {d6[0]}, [r0]!\n" 1546 "vst1.8 {d6[2]}, [r0]!\n" 1547 "vst1.16 {d12[0]}, [r1]!\n" 1548 "vst1.8 {d12[2]}, [r1]!\n" 1549 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 1550 : [count] "r"(params.kernel.count), 1551 [multiplicative_offset] "r"(params.kernel.multiplicative_offset), 1552 [shift] "r"(params.kernel.shift), 1553 [stride] "r"(params.output_stream.stride), 1554 [rounding_offset] "r"(params.kernel.rounding_offset) 1555 : "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", 1556 "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", 1557 "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", 1558 "d30", "d31", "cc", "memory"); 1559 } 1560 1561 template <> 1562 inline void MulKernel< 1563 uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 1, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)1564 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 1565 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, 1566 RowMajor>& params, 1567 int32_t* result) { 1568 #ifdef DEBUG 1569 #ifdef DEBUG_METAGEMM_VERBOSE 1570 std::cout << __FILE__ << "(" << __LINE__ 1571 << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " 1572 "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 1, " 1573 "8>::Multiply()" 1574 << std::endl 1575 << std::flush; 1576 #endif 1577 #endif 1578 asm volatile( 1579 "pld [%[lhs]]\n" 1580 "pld [%[rhs]]\n" 1581 1582 // Clear aggregators. 1583 "vmov.i32 q0, #0\n" 1584 1585 // General NxM lanes loop. 1586 "1:" 1587 1588 // Subtract counter. 1589 "subs %[count], %[count], #8\n" 1590 1591 "vld1.32 {d2}, [%[lhs]:64]!\n" 1592 "vld1.32 {d3}, [%[rhs]:64]!\n" 1593 "pld [%[lhs], #64]\n" 1594 "pld [%[rhs], #64]\n" 1595 "vmull.u8 q2, d3, d2\n" 1596 "vpadal.u16 q0, q2\n" 1597 1598 // Loop break. 1599 "bgt 1b\n" 1600 1601 // StaticQuantizationInt32::Prepare 1602 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" 1603 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" 1604 "vdup.32 q4, d8[0]\n" 1605 1606 // RowMajorOutput::Prepare 1607 1608 // Reduce aggregators. 1609 "vpadd.u32 d0, d0, d1\n" 1610 "vpadd.u32 d0, d0, d0\n" 1611 1612 // StaticQuantizationInt32::Transform 1613 "vadd.s32 q0, q0, q4\n" 1614 "vadd.s32 q0, q0, q5\n" 1615 1616 // RowMajorOutput::Output 1617 "vst1.32 {d0[0]}, [%[result]]!\n" 1618 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 1619 : [count] "r"(params.kernel.count), 1620 [stride] "r"(params.output_stream.stride) 1621 : "d0", "d1", "d2", "d3", "d4", "d5", "d8", "d9", "d10", "d11", "cc", 1622 "memory"); 1623 } 1624 1625 template <> 1626 inline void MulKernel< 1627 uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 2, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)1628 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 1629 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, 1630 RowMajor>& params, 1631 int32_t* result) { 1632 #ifdef DEBUG 1633 #ifdef DEBUG_METAGEMM_VERBOSE 1634 std::cout << __FILE__ << "(" << __LINE__ 1635 << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " 1636 "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 2, " 1637 "8>::Multiply()" 1638 << std::endl 1639 << std::flush; 1640 #endif 1641 #endif 1642 asm volatile( 1643 "pld [%[lhs]]\n" 1644 "pld [%[rhs]]\n" 1645 1646 // Clear aggregators. 1647 "vmov.i32 q0, #0\n" 1648 "vmov.i32 q1, #0\n" 1649 1650 // General NxM lanes loop. 1651 "1:" 1652 1653 // Subtract counter. 1654 "subs %[count], %[count], #8\n" 1655 1656 "vld1.32 {d4}, [%[lhs]:64]!\n" 1657 "vld1.32 {d5, d6}, [%[rhs]:64]!\n" 1658 "pld [%[lhs], #64]\n" 1659 "pld [%[rhs], #64]\n" 1660 "vmull.u8 q4, d5, d4\n" 1661 "vmull.u8 q5, d6, d4\n" 1662 "vpadal.u16 q0, q4\n" 1663 "vpadal.u16 q1, q5\n" 1664 1665 // Loop break. 1666 "bgt 1b\n" 1667 1668 // StaticQuantizationInt32::Prepare 1669 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" 1670 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" 1671 "vdup.32 q4, d8[0]\n" 1672 1673 // RowMajorOutput::Prepare 1674 1675 // Reduce aggregators. 1676 "vpadd.u32 d0, d0, d1\n" 1677 "vpadd.u32 d2, d2, d3\n" 1678 "vpadd.u32 d0, d0, d2\n" 1679 1680 // StaticQuantizationInt32::Transform 1681 "vadd.s32 q0, q0, q4\n" 1682 "vadd.s32 q0, q0, q5\n" 1683 1684 // RowMajorOutput::Output 1685 "vst1.32 {d0}, [%[result]]!\n" 1686 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 1687 : [count] "r"(params.kernel.count), 1688 [stride] "r"(params.output_stream.stride) 1689 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d8", "d9", "d10", "d11", 1690 "cc", "memory"); 1691 } 1692 1693 template <> 1694 inline void MulKernel< 1695 uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 3, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)1696 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 1697 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, 1698 RowMajor>& params, 1699 int32_t* result) { 1700 #ifdef DEBUG 1701 #ifdef DEBUG_METAGEMM_VERBOSE 1702 std::cout << __FILE__ << "(" << __LINE__ 1703 << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " 1704 "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 3, " 1705 "8>::Multiply()" 1706 << std::endl 1707 << std::flush; 1708 #endif 1709 #endif 1710 asm volatile( 1711 "pld [%[lhs]]\n" 1712 "pld [%[rhs]]\n" 1713 1714 // Clear aggregators. 1715 "vmov.i32 q0, #0\n" 1716 "vmov.i32 q1, #0\n" 1717 "vmov.i32 q2, #0\n" 1718 1719 // General NxM lanes loop. 1720 "1:" 1721 1722 // Subtract counter. 1723 "subs %[count], %[count], #8\n" 1724 1725 "vld1.32 {d6}, [%[lhs]:64]!\n" 1726 "vld1.32 {d7, d8, d9}, [%[rhs]:64]!\n" 1727 "pld [%[lhs], #64]\n" 1728 "pld [%[rhs], #64]\n" 1729 "vmull.u8 q5, d7, d6\n" 1730 "vmull.u8 q6, d8, d6\n" 1731 "vmull.u8 q7, d9, d6\n" 1732 "vpadal.u16 q0, q5\n" 1733 "vpadal.u16 q1, q6\n" 1734 "vpadal.u16 q2, q7\n" 1735 1736 // Loop break. 1737 "bgt 1b\n" 1738 1739 // StaticQuantizationInt32::Prepare 1740 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" 1741 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" 1742 "vdup.32 q4, d8[0]\n" 1743 1744 // RowMajorOutput::Prepare 1745 1746 // Reduce aggregators. 1747 "vpadd.u32 d0, d0, d1\n" 1748 "vpadd.u32 d2, d2, d3\n" 1749 "vpadd.u32 d4, d4, d5\n" 1750 "vpadd.u32 d0, d0, d2\n" 1751 "vpadd.u32 d1, d4, d4\n" 1752 1753 // StaticQuantizationInt32::Transform 1754 "vadd.s32 q0, q0, q4\n" 1755 "vadd.s32 q0, q0, q5\n" 1756 1757 // RowMajorOutput::Output 1758 "vst1.32 {d0}, [%[result]]!\n" 1759 "vst1.32 {d1[0]}, [%[result]]!\n" 1760 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 1761 : [count] "r"(params.kernel.count), 1762 [stride] "r"(params.output_stream.stride) 1763 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", 1764 "d11", "d12", "d13", "d14", "d15", "cc", "memory"); 1765 } 1766 1767 template <> 1768 inline void MulKernel< 1769 uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 4, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)1770 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 1771 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, 1772 RowMajor>& params, 1773 int32_t* result) { 1774 #ifdef DEBUG 1775 #ifdef DEBUG_METAGEMM_VERBOSE 1776 std::cout << __FILE__ << "(" << __LINE__ 1777 << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " 1778 "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 4, " 1779 "8>::Multiply()" 1780 << std::endl 1781 << std::flush; 1782 #endif 1783 #endif 1784 asm volatile( 1785 "pld [%[lhs]]\n" 1786 "pld [%[rhs]]\n" 1787 1788 // Clear aggregators. 1789 "vmov.i32 q0, #0\n" 1790 "vmov.i32 q1, #0\n" 1791 "vmov.i32 q2, #0\n" 1792 "vmov.i32 q3, q0\n" 1793 1794 // General NxM lanes loop. 1795 "1:" 1796 1797 // Subtract counter. 1798 "subs %[count], %[count], #8\n" 1799 1800 "vld1.32 {d8}, [%[lhs]:64]!\n" 1801 "vld1.32 {d9, d10, d11, d12}, [%[rhs]:64]!\n" 1802 "pld [%[lhs], #64]\n" 1803 "pld [%[rhs], #64]\n" 1804 "vmull.u8 q7, d9, d8\n" 1805 "vmull.u8 q8, d10, d8\n" 1806 "vmull.u8 q9, d11, d8\n" 1807 "vmull.u8 q10, d12, d8\n" 1808 "vpadal.u16 q0, q7\n" 1809 "vpadal.u16 q1, q8\n" 1810 "vpadal.u16 q2, q9\n" 1811 "vpadal.u16 q3, q10\n" 1812 1813 // Loop break. 1814 "bgt 1b\n" 1815 1816 // StaticQuantizationInt32::Prepare 1817 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" 1818 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" 1819 "vdup.32 q4, d8[0]\n" 1820 1821 // RowMajorOutput::Prepare 1822 1823 // Reduce aggregators. 1824 "vpadd.u32 d0, d0, d1\n" 1825 "vpadd.u32 d2, d2, d3\n" 1826 "vpadd.u32 d4, d4, d5\n" 1827 "vpadd.u32 d6, d6, d7\n" 1828 "vpadd.u32 d0, d0, d2\n" 1829 "vpadd.u32 d1, d4, d6\n" 1830 1831 // StaticQuantizationInt32::Transform 1832 "vadd.s32 q0, q0, q4\n" 1833 "vadd.s32 q0, q0, q5\n" 1834 1835 // RowMajorOutput::Output 1836 "vst1.32 {d0, d1}, [%[result]]!\n" 1837 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 1838 : [count] "r"(params.kernel.count), 1839 [stride] "r"(params.output_stream.stride) 1840 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", 1841 "d11", "d12", "d14", "d15", "d16", "d17", "d18", "d19", "d20", "d21", 1842 "cc", "memory"); 1843 } 1844 1845 template <> 1846 inline void MulKernel< 1847 uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 5, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)1848 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 1849 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, 1850 RowMajor>& params, 1851 int32_t* result) { 1852 #ifdef DEBUG 1853 #ifdef DEBUG_METAGEMM_VERBOSE 1854 std::cout << __FILE__ << "(" << __LINE__ 1855 << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " 1856 "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 5, " 1857 "8>::Multiply()" 1858 << std::endl 1859 << std::flush; 1860 #endif 1861 #endif 1862 asm volatile( 1863 "pld [%[lhs]]\n" 1864 "pld [%[rhs]]\n" 1865 1866 // Clear aggregators. 1867 "vmov.i32 q0, #0\n" 1868 "vmov.i32 q1, #0\n" 1869 "vmov.i32 q2, #0\n" 1870 "vmov.i32 q3, q0\n" 1871 "vmov.i32 q4, q1\n" 1872 1873 // General 1xM lanes loop. 1874 "1:" 1875 1876 // Subtract counter. 1877 "subs %[count], %[count], #8\n" 1878 1879 "vld1.32 {d10, d11, d12, d13}, [%[rhs]:64]!\n" 1880 "vld1.32 {d14}, [%[lhs]:64]!\n" 1881 "pld [%[lhs], #64]\n" 1882 "vmull.u8 q8, d10, d14\n" 1883 "vmull.u8 q9, d11, d14\n" 1884 "vmull.u8 q10, d12, d14\n" 1885 "vmull.u8 q11, d13, d14\n" 1886 "vld1.32 {d10}, [%[rhs]:64]!\n" 1887 "pld [%[rhs], #128]\n" 1888 "vpadal.u16 q0, q8\n" 1889 "vpadal.u16 q1, q9\n" 1890 "vpadal.u16 q2, q10\n" 1891 "vpadal.u16 q3, q11\n" 1892 "vmull.u8 q8, d10, d14\n" 1893 "vpadal.u16 q4, q8\n" 1894 1895 // Loop break. 1896 "bgt 1b\n" 1897 1898 // StaticQuantizationInt32::Prepare 1899 "vld1.32 {d10, d11}, [%[lhs]:64]!\n" 1900 "vld1.32 {d12, d13, d14, d15}, [%[rhs]:64]!\n" 1901 "vdup.32 q5, d10[0]\n" 1902 1903 // RowMajorOutput::Prepare 1904 1905 // Reduce aggregators. 1906 "vpadd.u32 d0, d0, d1\n" 1907 "vpadd.u32 d2, d2, d3\n" 1908 "vpadd.u32 d4, d4, d5\n" 1909 "vpadd.u32 d6, d6, d7\n" 1910 "vpadd.u32 d8, d8, d9\n" 1911 "vpadd.u32 d0, d0, d2\n" 1912 "vpadd.u32 d1, d4, d6\n" 1913 "vpadd.u32 d2, d8, d8\n" 1914 1915 // StaticQuantizationInt32::Transform 1916 "vadd.s32 q0, q0, q5\n" 1917 "vadd.s32 q1, q1, q5\n" 1918 "vadd.s32 q0, q0, q6\n" 1919 "vadd.s32 q1, q1, q7\n" 1920 1921 // RowMajorOutput::Output 1922 "vst1.32 {d0, d1}, [%[result]]!\n" 1923 "vst1.32 {d2[0]}, [%[result]]!\n" 1924 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 1925 : [count] "r"(params.kernel.count), 1926 [stride] "r"(params.output_stream.stride) 1927 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", 1928 "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", 1929 "d21", "d22", "d23", "cc", "memory"); 1930 } 1931 1932 template <> 1933 inline void MulKernel< 1934 uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 6, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)1935 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 1936 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, 1937 RowMajor>& params, 1938 int32_t* result) { 1939 #ifdef DEBUG 1940 #ifdef DEBUG_METAGEMM_VERBOSE 1941 std::cout << __FILE__ << "(" << __LINE__ 1942 << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " 1943 "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 6, " 1944 "8>::Multiply()" 1945 << std::endl 1946 << std::flush; 1947 #endif 1948 #endif 1949 asm volatile( 1950 "pld [%[lhs]]\n" 1951 "pld [%[rhs]]\n" 1952 1953 // Clear aggregators. 1954 "vmov.i32 q0, #0\n" 1955 "vmov.i32 q1, #0\n" 1956 "vmov.i32 q2, #0\n" 1957 "vmov.i32 q3, q0\n" 1958 "vmov.i32 q4, q1\n" 1959 "vmov.i32 q5, q2\n" 1960 1961 // General 1xM lanes loop. 1962 "1:" 1963 1964 // Subtract counter. 1965 "subs %[count], %[count], #8\n" 1966 1967 "vld1.32 {d12, d13, d14, d15}, [%[rhs]:64]!\n" 1968 "vld1.32 {d16}, [%[lhs]:64]!\n" 1969 "pld [%[lhs], #64]\n" 1970 "vmull.u8 q9, d12, d16\n" 1971 "vmull.u8 q10, d13, d16\n" 1972 "vmull.u8 q11, d14, d16\n" 1973 "vmull.u8 q12, d15, d16\n" 1974 "vld1.32 {d12, d13}, [%[rhs]:64]!\n" 1975 "pld [%[rhs], #128]\n" 1976 "vpadal.u16 q0, q9\n" 1977 "vpadal.u16 q1, q10\n" 1978 "vpadal.u16 q2, q11\n" 1979 "vpadal.u16 q3, q12\n" 1980 "vmull.u8 q9, d12, d16\n" 1981 "vmull.u8 q10, d13, d16\n" 1982 "vpadal.u16 q4, q9\n" 1983 "vpadal.u16 q5, q10\n" 1984 1985 // Loop break. 1986 "bgt 1b\n" 1987 1988 // StaticQuantizationInt32::Prepare 1989 "vld1.32 {d12, d13}, [%[lhs]:64]!\n" 1990 "vld1.32 {d14, d15, d16, d17}, [%[rhs]:64]!\n" 1991 "vdup.32 q6, d12[0]\n" 1992 1993 // RowMajorOutput::Prepare 1994 1995 // Reduce aggregators. 1996 "vpadd.u32 d0, d0, d1\n" 1997 "vpadd.u32 d2, d2, d3\n" 1998 "vpadd.u32 d4, d4, d5\n" 1999 "vpadd.u32 d6, d6, d7\n" 2000 "vpadd.u32 d8, d8, d9\n" 2001 "vpadd.u32 d10, d10, d11\n" 2002 "vpadd.u32 d0, d0, d2\n" 2003 "vpadd.u32 d1, d4, d6\n" 2004 "vpadd.u32 d2, d8, d10\n" 2005 2006 // StaticQuantizationInt32::Transform 2007 "vadd.s32 q0, q0, q6\n" 2008 "vadd.s32 q1, q1, q6\n" 2009 "vadd.s32 q0, q0, q7\n" 2010 "vadd.s32 q1, q1, q8\n" 2011 2012 // RowMajorOutput::Output 2013 "vst1.32 {d0, d1, d2}, [%[result]]!\n" 2014 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 2015 : [count] "r"(params.kernel.count), 2016 [stride] "r"(params.output_stream.stride) 2017 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", 2018 "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", 2019 "d21", "d22", "d23", "d24", "d25", "cc", "memory"); 2020 } 2021 2022 template <> 2023 inline void MulKernel< 2024 uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 7, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)2025 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 2026 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, 2027 RowMajor>& params, 2028 int32_t* result) { 2029 #ifdef DEBUG 2030 #ifdef DEBUG_METAGEMM_VERBOSE 2031 std::cout << __FILE__ << "(" << __LINE__ 2032 << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " 2033 "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 7, " 2034 "8>::Multiply()" 2035 << std::endl 2036 << std::flush; 2037 #endif 2038 #endif 2039 asm volatile( 2040 "pld [%[lhs]]\n" 2041 "pld [%[rhs]]\n" 2042 2043 // Clear aggregators. 2044 "vmov.i32 q0, #0\n" 2045 "vmov.i32 q1, #0\n" 2046 "vmov.i32 q2, #0\n" 2047 "vmov.i32 q3, q0\n" 2048 "vmov.i32 q4, q1\n" 2049 "vmov.i32 q5, q2\n" 2050 "vmov.i32 q6, q3\n" 2051 2052 // General 1xM lanes loop. 2053 "1:" 2054 2055 // Subtract counter. 2056 "subs %[count], %[count], #8\n" 2057 2058 "vld1.32 {d14, d15, d16, d17}, [%[rhs]:64]!\n" 2059 "vld1.32 {d18}, [%[lhs]:64]!\n" 2060 "pld [%[lhs], #64]\n" 2061 "vmull.u8 q10, d14, d18\n" 2062 "vmull.u8 q11, d15, d18\n" 2063 "vmull.u8 q12, d16, d18\n" 2064 "vmull.u8 q13, d17, d18\n" 2065 "vld1.32 {d14, d15, d16}, [%[rhs]:64]!\n" 2066 "pld [%[rhs], #128]\n" 2067 "vpadal.u16 q0, q10\n" 2068 "vpadal.u16 q1, q11\n" 2069 "vpadal.u16 q2, q12\n" 2070 "vpadal.u16 q3, q13\n" 2071 "vmull.u8 q10, d14, d18\n" 2072 "vmull.u8 q11, d15, d18\n" 2073 "vmull.u8 q12, d16, d18\n" 2074 "vpadal.u16 q4, q10\n" 2075 "vpadal.u16 q5, q11\n" 2076 "vpadal.u16 q6, q12\n" 2077 2078 // Loop break. 2079 "bgt 1b\n" 2080 2081 // StaticQuantizationInt32::Prepare 2082 "vld1.32 {d14, d15}, [%[lhs]:64]!\n" 2083 "vld1.32 {d16, d17, d18, d19}, [%[rhs]:64]!\n" 2084 "vdup.32 q7, d14[0]\n" 2085 2086 // RowMajorOutput::Prepare 2087 2088 // Reduce aggregators. 2089 "vpadd.u32 d0, d0, d1\n" 2090 "vpadd.u32 d2, d2, d3\n" 2091 "vpadd.u32 d4, d4, d5\n" 2092 "vpadd.u32 d6, d6, d7\n" 2093 "vpadd.u32 d8, d8, d9\n" 2094 "vpadd.u32 d10, d10, d11\n" 2095 "vpadd.u32 d12, d12, d13\n" 2096 "vpadd.u32 d0, d0, d2\n" 2097 "vpadd.u32 d1, d4, d6\n" 2098 "vpadd.u32 d2, d8, d10\n" 2099 "vpadd.u32 d3, d12, d12\n" 2100 2101 // StaticQuantizationInt32::Transform 2102 "vadd.s32 q0, q0, q7\n" 2103 "vadd.s32 q1, q1, q7\n" 2104 "vadd.s32 q0, q0, q8\n" 2105 "vadd.s32 q1, q1, q9\n" 2106 2107 // RowMajorOutput::Output 2108 "vst1.32 {d0, d1, d2}, [%[result]]!\n" 2109 "vst1.32 {d3[0]}, [%[result]]!\n" 2110 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 2111 : [count] "r"(params.kernel.count), 2112 [stride] "r"(params.output_stream.stride) 2113 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", 2114 "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", 2115 "d21", "d22", "d23", "d24", "d25", "d26", "d27", "cc", "memory"); 2116 } 2117 2118 template <> 2119 inline void MulKernel< 2120 uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 8, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)2121 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 2122 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, 2123 RowMajor>& params, 2124 int32_t* result) { 2125 #ifdef DEBUG 2126 #ifdef DEBUG_METAGEMM_VERBOSE 2127 std::cout << __FILE__ << "(" << __LINE__ 2128 << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " 2129 "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 8, " 2130 "8>::Multiply()" 2131 << std::endl 2132 << std::flush; 2133 #endif 2134 #endif 2135 asm volatile( 2136 "pld [%[lhs]]\n" 2137 "pld [%[rhs]]\n" 2138 2139 // Clear aggregators. 2140 "vmov.i32 q0, #0\n" 2141 "vmov.i32 q1, #0\n" 2142 "vmov.i32 q2, #0\n" 2143 "vmov.i32 q3, q0\n" 2144 "vmov.i32 q4, q1\n" 2145 "vmov.i32 q5, q2\n" 2146 "vmov.i32 q6, q3\n" 2147 "vmov.i32 q7, q4\n" 2148 2149 // 1x8 lanes loop. 2150 "1:" 2151 2152 "vld1.32 {d17, d18, d19, d20}, [%[rhs]:256]!\n" 2153 "vld1.32 {d16}, [%[lhs]:64]!\n" 2154 "vmull.u8 q11, d16, d17\n" 2155 "vmull.u8 q12, d16, d18\n" 2156 "vmull.u8 q13, d16, d19\n" 2157 "vmull.u8 q14, d16, d20\n" 2158 "vld1.32 {d17, d18, d19, d20}, [%[rhs]:256]!\n" 2159 "vpadal.u16 q0, q11\n" 2160 "vpadal.u16 q1, q12\n" 2161 "vpadal.u16 q2, q13\n" 2162 "vpadal.u16 q3, q14\n" 2163 "pld [%[rhs], #256]\n" 2164 "vmull.u8 q15, d16, d17\n" 2165 "vmull.u8 q11, d16, d18\n" 2166 "vmull.u8 q12, d16, d19\n" 2167 "vmull.u8 q13, d16, d20\n" 2168 "pld [%[lhs], #32]\n" 2169 2170 // Subtract counter. 2171 "subs %[count], %[count], #8\n" 2172 2173 "vpadal.u16 q4, q15\n" 2174 "vpadal.u16 q5, q11\n" 2175 "vpadal.u16 q6, q12\n" 2176 "vpadal.u16 q7, q13\n" 2177 2178 // Loop break. 2179 "bgt 1b\n" 2180 2181 // StaticQuantizationInt32::Prepare 2182 "vld1.32 {d16, d17}, [%[lhs]:64]!\n" 2183 "vld1.32 {d18, d19, d20, d21}, [%[rhs]:64]!\n" 2184 "vdup.32 q8, d16[0]\n" 2185 2186 // RowMajorOutput::Prepare 2187 2188 // Reduce aggregators. 2189 "vpadd.u32 d0, d0, d1\n" 2190 "vpadd.u32 d2, d2, d3\n" 2191 "vpadd.u32 d4, d4, d5\n" 2192 "vpadd.u32 d6, d6, d7\n" 2193 "vpadd.u32 d8, d8, d9\n" 2194 "vpadd.u32 d10, d10, d11\n" 2195 "vpadd.u32 d12, d12, d13\n" 2196 "vpadd.u32 d14, d14, d15\n" 2197 "vpadd.u32 d0, d0, d2\n" 2198 "vpadd.u32 d1, d4, d6\n" 2199 "vpadd.u32 d2, d8, d10\n" 2200 "vpadd.u32 d3, d12, d14\n" 2201 2202 // StaticQuantizationInt32::Transform 2203 "vadd.s32 q0, q0, q8\n" 2204 "vadd.s32 q1, q1, q8\n" 2205 "vadd.s32 q0, q0, q9\n" 2206 "vadd.s32 q1, q1, q10\n" 2207 2208 // RowMajorOutput::Output 2209 "vst1.32 {d0, d1, d2, d3}, [%[result]]!\n" 2210 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 2211 : [count] "r"(params.kernel.count), 2212 [stride] "r"(params.output_stream.stride) 2213 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", 2214 "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", 2215 "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", 2216 "d31", "cc", "memory"); 2217 } 2218 2219 template <> 2220 inline void MulKernel< 2221 uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 1, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)2222 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 2223 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, 2224 RowMajor>& params, 2225 int32_t* result) { 2226 #ifdef DEBUG 2227 #ifdef DEBUG_METAGEMM_VERBOSE 2228 std::cout << __FILE__ << "(" << __LINE__ 2229 << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " 2230 "QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 1, " 2231 "8>::Multiply()" 2232 << std::endl 2233 << std::flush; 2234 #endif 2235 #endif 2236 asm volatile( 2237 "pld [%[lhs]]\n" 2238 "pld [%[rhs]]\n" 2239 2240 // Clear aggregators. 2241 "vmov.i32 q0, #0\n" 2242 "vmov.i32 q1, #0\n" 2243 2244 // General NxM lanes loop. 2245 "1:" 2246 2247 // Subtract counter. 2248 "subs %[count], %[count], #8\n" 2249 2250 "vld1.32 {d4, d5}, [%[lhs]:64]!\n" 2251 "vld1.32 {d6}, [%[rhs]:64]!\n" 2252 "pld [%[lhs], #64]\n" 2253 "pld [%[rhs], #64]\n" 2254 "vmull.u8 q4, d6, d4\n" 2255 "vmull.u8 q5, d6, d5\n" 2256 "vpadal.u16 q0, q4\n" 2257 "vpadal.u16 q1, q5\n" 2258 2259 // Loop break. 2260 "bgt 1b\n" 2261 2262 // StaticQuantizationInt32::Prepare 2263 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" 2264 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" 2265 "vdup.32 q2, d8[0]\n" 2266 "vdup.32 q4, d8[1]\n" 2267 2268 // RowMajorOutput::Prepare 2269 "add r0, %[result], %[stride]\n" 2270 2271 // Reduce aggregators. 2272 "vpadd.u32 d0, d0, d1\n" 2273 "vpadd.u32 d0, d0, d0\n" 2274 "vpadd.u32 d2, d2, d3\n" 2275 "vpadd.u32 d2, d2, d2\n" 2276 2277 // StaticQuantizationInt32::Transform 2278 "vadd.s32 q0, q0, q2\n" 2279 "vadd.s32 q1, q1, q4\n" 2280 "vadd.s32 q0, q0, q5\n" 2281 "vadd.s32 q1, q1, q5\n" 2282 2283 // RowMajorOutput::Output 2284 "vst1.32 {d0[0]}, [%[result]]!\n" 2285 "vst1.32 {d2[0]}, [r0]!\n" 2286 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 2287 : [count] "r"(params.kernel.count), 2288 [stride] "r"(params.output_stream.stride) 2289 : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d8", "d9", "d10", 2290 "d11", "cc", "memory"); 2291 } 2292 2293 template <> 2294 inline void MulKernel< 2295 uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 2, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)2296 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 2297 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, 2298 RowMajor>& params, 2299 int32_t* result) { 2300 #ifdef DEBUG 2301 #ifdef DEBUG_METAGEMM_VERBOSE 2302 std::cout << __FILE__ << "(" << __LINE__ 2303 << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " 2304 "QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 2, " 2305 "8>::Multiply()" 2306 << std::endl 2307 << std::flush; 2308 #endif 2309 #endif 2310 asm volatile( 2311 "pld [%[lhs]]\n" 2312 "pld [%[rhs]]\n" 2313 2314 // Clear aggregators. 2315 "vmov.i32 q0, #0\n" 2316 "vmov.i32 q1, #0\n" 2317 "vmov.i32 q2, #0\n" 2318 "vmov.i32 q3, q0\n" 2319 2320 // General NxM lanes loop. 2321 "1:" 2322 2323 // Subtract counter. 2324 "subs %[count], %[count], #8\n" 2325 2326 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" 2327 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" 2328 "pld [%[lhs], #64]\n" 2329 "pld [%[rhs], #64]\n" 2330 "vmull.u8 q6, d10, d8\n" 2331 "vmull.u8 q7, d11, d8\n" 2332 "vmull.u8 q8, d10, d9\n" 2333 "vmull.u8 q9, d11, d9\n" 2334 "vpadal.u16 q0, q6\n" 2335 "vpadal.u16 q1, q7\n" 2336 "vpadal.u16 q2, q8\n" 2337 "vpadal.u16 q3, q9\n" 2338 2339 // Loop break. 2340 "bgt 1b\n" 2341 2342 // StaticQuantizationInt32::Prepare 2343 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" 2344 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" 2345 "vdup.32 q6, d8[0]\n" 2346 "vdup.32 q4, d8[1]\n" 2347 2348 // RowMajorOutput::Prepare 2349 "add r0, %[result], %[stride]\n" 2350 2351 // Reduce aggregators. 2352 "vpadd.u32 d0, d0, d1\n" 2353 "vpadd.u32 d2, d2, d3\n" 2354 "vpadd.u32 d0, d0, d2\n" 2355 "vpadd.u32 d4, d4, d5\n" 2356 "vpadd.u32 d6, d6, d7\n" 2357 "vpadd.u32 d4, d4, d6\n" 2358 2359 // StaticQuantizationInt32::Transform 2360 "vadd.s32 q0, q0, q6\n" 2361 "vadd.s32 q2, q2, q4\n" 2362 "vadd.s32 q0, q0, q5\n" 2363 "vadd.s32 q2, q2, q5\n" 2364 2365 // RowMajorOutput::Output 2366 "vst1.32 {d0}, [%[result]]!\n" 2367 "vst1.32 {d4}, [r0]!\n" 2368 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 2369 : [count] "r"(params.kernel.count), 2370 [stride] "r"(params.output_stream.stride) 2371 : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", 2372 "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "cc", 2373 "memory"); 2374 } 2375 2376 template <> 2377 inline void MulKernel< 2378 uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 3, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)2379 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 2380 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, 2381 RowMajor>& params, 2382 int32_t* result) { 2383 #ifdef DEBUG 2384 #ifdef DEBUG_METAGEMM_VERBOSE 2385 std::cout << __FILE__ << "(" << __LINE__ 2386 << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " 2387 "QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 3, " 2388 "8>::Multiply()" 2389 << std::endl 2390 << std::flush; 2391 #endif 2392 #endif 2393 asm volatile( 2394 "pld [%[lhs]]\n" 2395 "pld [%[rhs]]\n" 2396 2397 // Clear aggregators. 2398 "vmov.i32 q0, #0\n" 2399 "vmov.i32 q1, #0\n" 2400 "vmov.i32 q2, #0\n" 2401 "vmov.i32 q3, q0\n" 2402 "vmov.i32 q4, q1\n" 2403 "vmov.i32 q5, q2\n" 2404 2405 // General NxM lanes loop. 2406 "1:" 2407 2408 // Subtract counter. 2409 "subs %[count], %[count], #8\n" 2410 2411 "vld1.32 {d12, d13}, [%[lhs]:64]!\n" 2412 "vld1.32 {d14, d15, d16}, [%[rhs]:64]!\n" 2413 "pld [%[lhs], #64]\n" 2414 "pld [%[rhs], #64]\n" 2415 "vmull.u8 q9, d14, d12\n" 2416 "vmull.u8 q10, d15, d12\n" 2417 "vmull.u8 q11, d16, d12\n" 2418 "vmull.u8 q12, d14, d13\n" 2419 "vmull.u8 q13, d15, d13\n" 2420 "vmull.u8 q14, d16, d13\n" 2421 "vpadal.u16 q0, q9\n" 2422 "vpadal.u16 q1, q10\n" 2423 "vpadal.u16 q2, q11\n" 2424 "vpadal.u16 q3, q12\n" 2425 "vpadal.u16 q4, q13\n" 2426 "vpadal.u16 q5, q14\n" 2427 2428 // Loop break. 2429 "bgt 1b\n" 2430 2431 // StaticQuantizationInt32::Prepare 2432 "vld1.32 {d12, d13}, [%[lhs]:64]!\n" 2433 "vld1.32 {d14, d15}, [%[rhs]:64]!\n" 2434 "vdup.32 q8, d12[0]\n" 2435 "vdup.32 q6, d12[1]\n" 2436 2437 // RowMajorOutput::Prepare 2438 "add r0, %[result], %[stride]\n" 2439 2440 // Reduce aggregators. 2441 "vpadd.u32 d0, d0, d1\n" 2442 "vpadd.u32 d2, d2, d3\n" 2443 "vpadd.u32 d4, d4, d5\n" 2444 "vpadd.u32 d0, d0, d2\n" 2445 "vpadd.u32 d1, d4, d4\n" 2446 "vpadd.u32 d6, d6, d7\n" 2447 "vpadd.u32 d8, d8, d9\n" 2448 "vpadd.u32 d10, d10, d11\n" 2449 "vpadd.u32 d6, d6, d8\n" 2450 "vpadd.u32 d7, d10, d10\n" 2451 2452 // StaticQuantizationInt32::Transform 2453 "vadd.s32 q0, q0, q8\n" 2454 "vadd.s32 q3, q3, q6\n" 2455 "vadd.s32 q0, q0, q7\n" 2456 "vadd.s32 q3, q3, q7\n" 2457 2458 // RowMajorOutput::Output 2459 "vst1.32 {d0}, [%[result]]!\n" 2460 "vst1.32 {d1[0]}, [%[result]]!\n" 2461 "vst1.32 {d6}, [r0]!\n" 2462 "vst1.32 {d7[0]}, [r0]!\n" 2463 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 2464 : [count] "r"(params.kernel.count), 2465 [stride] "r"(params.output_stream.stride) 2466 : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", 2467 "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", 2468 "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "cc", 2469 "memory"); 2470 } 2471 2472 template <> 2473 inline void MulKernel< 2474 uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 4, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)2475 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 2476 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, 2477 RowMajor>& params, 2478 int32_t* result) { 2479 #ifdef DEBUG 2480 #ifdef DEBUG_METAGEMM_VERBOSE 2481 std::cout << __FILE__ << "(" << __LINE__ 2482 << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " 2483 "QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 4, " 2484 "8>::Multiply()" 2485 << std::endl 2486 << std::flush; 2487 #endif 2488 #endif 2489 asm volatile( 2490 "pld [%[lhs]]\n" 2491 "pld [%[rhs]]\n" 2492 2493 // Clear aggregators. 2494 "vmov.i32 q0, #0\n" 2495 "vmov.i32 q1, #0\n" 2496 "vmov.i32 q2, #0\n" 2497 "vmov.i32 q3, q0\n" 2498 "vmov.i32 q4, q1\n" 2499 "vmov.i32 q5, q2\n" 2500 "vmov.i32 q6, q3\n" 2501 "vmov.i32 q7, q4\n" 2502 2503 // 2x4 lanes loop. 2504 "1:" 2505 2506 "vld1.8 {d18, d19, d20, d21}, [%[rhs]:256]!\n" 2507 "vld1.8 {d16}, [%[lhs]:64]!\n" 2508 "vmull.u8 q11, d16, d18\n" 2509 "vld1.8 {d17}, [%[lhs]:64]!\n" 2510 "vmull.u8 q12, d16, d19\n" 2511 "pld [%[rhs], #64]\n" 2512 "vmull.u8 q13, d16, d20\n" 2513 "pld [%[lhs], #64]\n" 2514 "vmull.u8 q14, d16, d21\n" 2515 "vmull.u8 q15, d17, d18\n" 2516 "vpadal.u16 q0, q11\n" 2517 "vpadal.u16 q1, q12\n" 2518 "vpadal.u16 q2, q13\n" 2519 "vmull.u8 q11, d17, d19\n" 2520 "vmull.u8 q12, d17, d20\n" 2521 "vmull.u8 q13, d17, d21\n" 2522 2523 // Subtract counter. 2524 "subs %[count], %[count], #8\n" 2525 2526 "vpadal.u16 q3, q14\n" 2527 "vpadal.u16 q4, q15\n" 2528 "vpadal.u16 q5, q11\n" 2529 "vpadal.u16 q6, q12\n" 2530 "vpadal.u16 q7, q13\n" 2531 2532 // Loop break. 2533 "bgt 1b\n" 2534 2535 // StaticQuantizationInt32::Prepare 2536 "vld1.32 {d16, d17}, [%[lhs]:64]!\n" 2537 "vld1.32 {d18, d19}, [%[rhs]:64]!\n" 2538 "vdup.32 q10, d16[0]\n" 2539 "vdup.32 q8, d16[1]\n" 2540 2541 // RowMajorOutput::Prepare 2542 "add r0, %[result], %[stride]\n" 2543 2544 // Reduce aggregators. 2545 "vpadd.u32 d0, d0, d1\n" 2546 "vpadd.u32 d2, d2, d3\n" 2547 "vpadd.u32 d4, d4, d5\n" 2548 "vpadd.u32 d6, d6, d7\n" 2549 "vpadd.u32 d0, d0, d2\n" 2550 "vpadd.u32 d1, d4, d6\n" 2551 "vpadd.u32 d8, d8, d9\n" 2552 "vpadd.u32 d10, d10, d11\n" 2553 "vpadd.u32 d12, d12, d13\n" 2554 "vpadd.u32 d14, d14, d15\n" 2555 "vpadd.u32 d8, d8, d10\n" 2556 "vpadd.u32 d9, d12, d14\n" 2557 2558 // StaticQuantizationInt32::Transform 2559 "vadd.s32 q0, q0, q10\n" 2560 "vadd.s32 q4, q4, q8\n" 2561 "vadd.s32 q0, q0, q9\n" 2562 "vadd.s32 q4, q4, q9\n" 2563 2564 // RowMajorOutput::Output 2565 "vst1.32 {d0, d1}, [%[result]]!\n" 2566 "vst1.32 {d8, d9}, [r0]!\n" 2567 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 2568 : [count] "r"(params.kernel.count), 2569 [stride] "r"(params.output_stream.stride) 2570 : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", 2571 "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", 2572 "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", 2573 "d31", "cc", "memory"); 2574 } 2575 2576 template <> 2577 inline void MulKernel< 2578 uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 1, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)2579 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 2580 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, 2581 RowMajor>& params, 2582 int32_t* result) { 2583 #ifdef DEBUG 2584 #ifdef DEBUG_METAGEMM_VERBOSE 2585 std::cout << __FILE__ << "(" << __LINE__ 2586 << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " 2587 "QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 1, " 2588 "8>::Multiply()" 2589 << std::endl 2590 << std::flush; 2591 #endif 2592 #endif 2593 asm volatile( 2594 "pld [%[lhs]]\n" 2595 "pld [%[rhs]]\n" 2596 2597 // Clear aggregators. 2598 "vmov.i32 q0, #0\n" 2599 "vmov.i32 q1, #0\n" 2600 "vmov.i32 q2, #0\n" 2601 2602 // General NxM lanes loop. 2603 "1:" 2604 2605 // Subtract counter. 2606 "subs %[count], %[count], #8\n" 2607 2608 "vld1.32 {d6, d7, d8}, [%[lhs]:64]!\n" 2609 "vld1.32 {d9}, [%[rhs]:64]!\n" 2610 "pld [%[lhs], #64]\n" 2611 "pld [%[rhs], #64]\n" 2612 "vmull.u8 q5, d9, d6\n" 2613 "vmull.u8 q6, d9, d7\n" 2614 "vmull.u8 q7, d9, d8\n" 2615 "vpadal.u16 q0, q5\n" 2616 "vpadal.u16 q1, q6\n" 2617 "vpadal.u16 q2, q7\n" 2618 2619 // Loop break. 2620 "bgt 1b\n" 2621 2622 // StaticQuantizationInt32::Prepare 2623 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" 2624 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" 2625 "vdup.32 q3, d8[0]\n" 2626 "vdup.32 q6, d8[1]\n" 2627 "vdup.32 q4, d9[0]\n" 2628 2629 // RowMajorOutput::Prepare 2630 "add r0, %[result], %[stride]\n" 2631 "add r1, r0, %[stride]\n" 2632 2633 // Reduce aggregators. 2634 "vpadd.u32 d0, d0, d1\n" 2635 "vpadd.u32 d0, d0, d0\n" 2636 "vpadd.u32 d2, d2, d3\n" 2637 "vpadd.u32 d2, d2, d2\n" 2638 "vpadd.u32 d4, d4, d5\n" 2639 "vpadd.u32 d4, d4, d4\n" 2640 2641 // StaticQuantizationInt32::Transform 2642 "vadd.s32 q0, q0, q3\n" 2643 "vadd.s32 q1, q1, q6\n" 2644 "vadd.s32 q2, q2, q4\n" 2645 "vadd.s32 q0, q0, q5\n" 2646 "vadd.s32 q1, q1, q5\n" 2647 "vadd.s32 q2, q2, q5\n" 2648 2649 // RowMajorOutput::Output 2650 "vst1.32 {d0[0]}, [%[result]]!\n" 2651 "vst1.32 {d2[0]}, [r0]!\n" 2652 "vst1.32 {d4[0]}, [r1]!\n" 2653 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 2654 : [count] "r"(params.kernel.count), 2655 [stride] "r"(params.output_stream.stride) 2656 : "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", 2657 "d10", "d11", "d12", "d13", "d14", "d15", "cc", "memory"); 2658 } 2659 2660 template <> 2661 inline void MulKernel< 2662 uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 2, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)2663 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 2664 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, 2665 RowMajor>& params, 2666 int32_t* result) { 2667 #ifdef DEBUG 2668 #ifdef DEBUG_METAGEMM_VERBOSE 2669 std::cout << __FILE__ << "(" << __LINE__ 2670 << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " 2671 "QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 2, " 2672 "8>::Multiply()" 2673 << std::endl 2674 << std::flush; 2675 #endif 2676 #endif 2677 asm volatile( 2678 "pld [%[lhs]]\n" 2679 "pld [%[rhs]]\n" 2680 2681 // Clear aggregators. 2682 "vmov.i32 q0, #0\n" 2683 "vmov.i32 q1, #0\n" 2684 "vmov.i32 q2, #0\n" 2685 "vmov.i32 q3, q0\n" 2686 "vmov.i32 q4, q1\n" 2687 "vmov.i32 q5, q2\n" 2688 2689 // General NxM lanes loop. 2690 "1:" 2691 2692 // Subtract counter. 2693 "subs %[count], %[count], #8\n" 2694 2695 "vld1.32 {d12, d13, d14}, [%[lhs]:64]!\n" 2696 "vld1.32 {d15, d16}, [%[rhs]:64]!\n" 2697 "pld [%[lhs], #64]\n" 2698 "pld [%[rhs], #64]\n" 2699 "vmull.u8 q9, d15, d12\n" 2700 "vmull.u8 q10, d16, d12\n" 2701 "vmull.u8 q11, d15, d13\n" 2702 "vmull.u8 q12, d16, d13\n" 2703 "vmull.u8 q13, d15, d14\n" 2704 "vmull.u8 q14, d16, d14\n" 2705 "vpadal.u16 q0, q9\n" 2706 "vpadal.u16 q1, q10\n" 2707 "vpadal.u16 q2, q11\n" 2708 "vpadal.u16 q3, q12\n" 2709 "vpadal.u16 q4, q13\n" 2710 "vpadal.u16 q5, q14\n" 2711 2712 // Loop break. 2713 "bgt 1b\n" 2714 2715 // StaticQuantizationInt32::Prepare 2716 "vld1.32 {d12, d13}, [%[lhs]:64]!\n" 2717 "vld1.32 {d14, d15}, [%[rhs]:64]!\n" 2718 "vdup.32 q8, d12[0]\n" 2719 "vdup.32 q9, d12[1]\n" 2720 "vdup.32 q6, d13[0]\n" 2721 2722 // RowMajorOutput::Prepare 2723 "add r0, %[result], %[stride]\n" 2724 "add r1, r0, %[stride]\n" 2725 2726 // Reduce aggregators. 2727 "vpadd.u32 d0, d0, d1\n" 2728 "vpadd.u32 d2, d2, d3\n" 2729 "vpadd.u32 d0, d0, d2\n" 2730 "vpadd.u32 d4, d4, d5\n" 2731 "vpadd.u32 d6, d6, d7\n" 2732 "vpadd.u32 d4, d4, d6\n" 2733 "vpadd.u32 d8, d8, d9\n" 2734 "vpadd.u32 d10, d10, d11\n" 2735 "vpadd.u32 d8, d8, d10\n" 2736 2737 // StaticQuantizationInt32::Transform 2738 "vadd.s32 q0, q0, q8\n" 2739 "vadd.s32 q2, q2, q9\n" 2740 "vadd.s32 q4, q4, q6\n" 2741 "vadd.s32 q0, q0, q7\n" 2742 "vadd.s32 q2, q2, q7\n" 2743 "vadd.s32 q4, q4, q7\n" 2744 2745 // RowMajorOutput::Output 2746 "vst1.32 {d0}, [%[result]]!\n" 2747 "vst1.32 {d4}, [r0]!\n" 2748 "vst1.32 {d8}, [r1]!\n" 2749 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 2750 : [count] "r"(params.kernel.count), 2751 [stride] "r"(params.output_stream.stride) 2752 : "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", 2753 "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", 2754 "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", 2755 "cc", "memory"); 2756 } 2757 2758 template <> 2759 inline void MulKernel< 2760 uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 3, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)2761 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 2762 const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, 2763 RowMajor>& params, 2764 int32_t* result) { 2765 #ifdef DEBUG 2766 #ifdef DEBUG_METAGEMM_VERBOSE 2767 std::cout << __FILE__ << "(" << __LINE__ 2768 << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " 2769 "QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 3, " 2770 "8>::Multiply()" 2771 << std::endl 2772 << std::flush; 2773 #endif 2774 #endif 2775 asm volatile( 2776 "pld [%[lhs]]\n" 2777 "pld [%[rhs]]\n" 2778 2779 // Clear aggregators. 2780 "vmov.i32 q0, #0\n" 2781 "vmov.i32 q1, #0\n" 2782 "vmov.i32 q2, #0\n" 2783 "vmov.i32 q3, q0\n" 2784 "vmov.i32 q4, q1\n" 2785 "vmov.i32 q5, q2\n" 2786 "vmov.i32 q6, q3\n" 2787 "vmov.i32 q7, q4\n" 2788 "vmov.i32 q8, q5\n" 2789 2790 // 3x3 lanes loop. 2791 "1:" 2792 2793 "vld1.8 {d21, d22, d23}, [%[rhs]:64]!\n" 2794 "vld1.8 {d18}, [%[lhs]:64]!\n" 2795 "vmull.u8 q12, d18, d21\n" 2796 "vld1.8 {d19}, [%[lhs]:64]!\n" 2797 "vmull.u8 q13, d18, d22\n" 2798 "vld1.8 {d20}, [%[lhs]:64]!\n" 2799 "vmull.u8 q14, d18, d23\n" 2800 "pld [%[lhs], #64]\n" 2801 "vmull.u8 q15, d19, d21\n" 2802 "pld [%[rhs], #64]\n" 2803 "vpadal.u16 q0, q12\n" 2804 "vpadal.u16 q1, q13\n" 2805 "vpadal.u16 q2, q14\n" 2806 "vpadal.u16 q3, q15\n" 2807 "vmull.u8 q12, d19, d22\n" 2808 "vmull.u8 q13, d19, d23\n" 2809 "vmull.u8 q14, d20, d21\n" 2810 "vmull.u8 q15, d20, d22\n" 2811 2812 // Subtract counter. 2813 "subs %[count], %[count], #8\n" 2814 2815 "vmull.u8 q9, d20, d23\n" 2816 "vpadal.u16 q4, q12\n" 2817 "vpadal.u16 q5, q13\n" 2818 "vpadal.u16 q6, q14\n" 2819 "vpadal.u16 q7, q15\n" 2820 "vpadal.u16 q8, q9\n" 2821 2822 // Loop break. 2823 "bgt 1b\n" 2824 2825 // StaticQuantizationInt32::Prepare 2826 "vld1.32 {d18, d19}, [%[lhs]:64]!\n" 2827 "vld1.32 {d20, d21}, [%[rhs]:64]!\n" 2828 "vdup.32 q11, d18[0]\n" 2829 "vdup.32 q12, d18[1]\n" 2830 "vdup.32 q9, d19[0]\n" 2831 2832 // RowMajorOutput::Prepare 2833 "add r0, %[result], %[stride]\n" 2834 "add r1, r0, %[stride]\n" 2835 2836 // Reduce aggregators. 2837 "vpadd.u32 d0, d0, d1\n" 2838 "vpadd.u32 d2, d2, d3\n" 2839 "vpadd.u32 d4, d4, d5\n" 2840 "vpadd.u32 d0, d0, d2\n" 2841 "vpadd.u32 d1, d4, d4\n" 2842 "vpadd.u32 d6, d6, d7\n" 2843 "vpadd.u32 d8, d8, d9\n" 2844 "vpadd.u32 d10, d10, d11\n" 2845 "vpadd.u32 d6, d6, d8\n" 2846 "vpadd.u32 d7, d10, d10\n" 2847 "vpadd.u32 d12, d12, d13\n" 2848 "vpadd.u32 d14, d14, d15\n" 2849 "vpadd.u32 d16, d16, d17\n" 2850 "vpadd.u32 d12, d12, d14\n" 2851 "vpadd.u32 d13, d16, d16\n" 2852 2853 // StaticQuantizationInt32::Transform 2854 "vadd.s32 q0, q0, q11\n" 2855 "vadd.s32 q3, q3, q12\n" 2856 "vadd.s32 q6, q6, q9\n" 2857 "vadd.s32 q0, q0, q10\n" 2858 "vadd.s32 q3, q3, q10\n" 2859 "vadd.s32 q6, q6, q10\n" 2860 2861 // RowMajorOutput::Output 2862 "vst1.32 {d0}, [%[result]]!\n" 2863 "vst1.32 {d1[0]}, [%[result]]!\n" 2864 "vst1.32 {d6}, [r0]!\n" 2865 "vst1.32 {d7[0]}, [r0]!\n" 2866 "vst1.32 {d12}, [r1]!\n" 2867 "vst1.32 {d13[0]}, [r1]!\n" 2868 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 2869 : [count] "r"(params.kernel.count), 2870 [stride] "r"(params.output_stream.stride) 2871 : "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", 2872 "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", 2873 "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", 2874 "d30", "d31", "cc", "memory"); 2875 } 2876 2877 template <> 2878 inline void MulKernel< 2879 uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 1, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)2880 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 2881 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, 2882 RowMajor>& params, 2883 float* result) { 2884 #ifdef DEBUG 2885 #ifdef DEBUG_METAGEMM_VERBOSE 2886 std::cout << __FILE__ << "(" << __LINE__ 2887 << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " 2888 "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 1, " 2889 "8>::Multiply()" 2890 << std::endl 2891 << std::flush; 2892 #endif 2893 #endif 2894 asm volatile( 2895 "pld [%[lhs]]\n" 2896 "pld [%[rhs]]\n" 2897 2898 // Clear aggregators. 2899 "vmov.i32 q0, #0\n" 2900 2901 // General NxM lanes loop. 2902 "1:" 2903 2904 // Subtract counter. 2905 "subs %[count], %[count], #8\n" 2906 2907 "vld1.32 {d2}, [%[lhs]:64]!\n" 2908 "vld1.32 {d3}, [%[rhs]:64]!\n" 2909 "pld [%[lhs], #64]\n" 2910 "pld [%[rhs], #64]\n" 2911 "vmull.u8 q2, d3, d2\n" 2912 "vpadal.u16 q0, q2\n" 2913 2914 // Loop break. 2915 "bgt 1b\n" 2916 2917 // StaticQuantizationFloat::Prepare 2918 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" 2919 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" 2920 "vdup.32 q6, %[scale]\n" 2921 "vdup.32 q4, d8[0]\n" 2922 2923 // RowMajorOutput::Prepare 2924 2925 // Reduce aggregators. 2926 "vpadd.u32 d0, d0, d1\n" 2927 "vpadd.u32 d0, d0, d0\n" 2928 2929 // StaticQuantizationFloat::Transform 2930 "vadd.s32 q0, q0, q4\n" 2931 "vadd.s32 q0, q0, q5\n" 2932 "vcvt.f32.s32 q0, q0\n" 2933 "vmul.f32 q0, q0, q6\n" 2934 2935 // RowMajorOutput::Output 2936 "vst1.32 {d0[0]}, [%[result]]!\n" 2937 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 2938 : [count] "r"(params.kernel.count), 2939 [stride] "r"(params.output_stream.stride), 2940 [scale] "r"(params.kernel.scale) 2941 : "d0", "d1", "d2", "d3", "d4", "d5", "d8", "d9", "d10", "d11", "d12", 2942 "d13", "cc", "memory"); 2943 } 2944 2945 template <> 2946 inline void MulKernel< 2947 uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 2, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)2948 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 2949 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, 2950 RowMajor>& params, 2951 float* result) { 2952 #ifdef DEBUG 2953 #ifdef DEBUG_METAGEMM_VERBOSE 2954 std::cout << __FILE__ << "(" << __LINE__ 2955 << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " 2956 "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 2, " 2957 "8>::Multiply()" 2958 << std::endl 2959 << std::flush; 2960 #endif 2961 #endif 2962 asm volatile( 2963 "pld [%[lhs]]\n" 2964 "pld [%[rhs]]\n" 2965 2966 // Clear aggregators. 2967 "vmov.i32 q0, #0\n" 2968 "vmov.i32 q1, #0\n" 2969 2970 // General NxM lanes loop. 2971 "1:" 2972 2973 // Subtract counter. 2974 "subs %[count], %[count], #8\n" 2975 2976 "vld1.32 {d4}, [%[lhs]:64]!\n" 2977 "vld1.32 {d5, d6}, [%[rhs]:64]!\n" 2978 "pld [%[lhs], #64]\n" 2979 "pld [%[rhs], #64]\n" 2980 "vmull.u8 q4, d5, d4\n" 2981 "vmull.u8 q5, d6, d4\n" 2982 "vpadal.u16 q0, q4\n" 2983 "vpadal.u16 q1, q5\n" 2984 2985 // Loop break. 2986 "bgt 1b\n" 2987 2988 // StaticQuantizationFloat::Prepare 2989 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" 2990 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" 2991 "vdup.32 q6, %[scale]\n" 2992 "vdup.32 q4, d8[0]\n" 2993 2994 // RowMajorOutput::Prepare 2995 2996 // Reduce aggregators. 2997 "vpadd.u32 d0, d0, d1\n" 2998 "vpadd.u32 d2, d2, d3\n" 2999 "vpadd.u32 d0, d0, d2\n" 3000 3001 // StaticQuantizationFloat::Transform 3002 "vadd.s32 q0, q0, q4\n" 3003 "vadd.s32 q0, q0, q5\n" 3004 "vcvt.f32.s32 q0, q0\n" 3005 "vmul.f32 q0, q0, q6\n" 3006 3007 // RowMajorOutput::Output 3008 "vst1.32 {d0}, [%[result]]!\n" 3009 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 3010 : [count] "r"(params.kernel.count), 3011 [stride] "r"(params.output_stream.stride), 3012 [scale] "r"(params.kernel.scale) 3013 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d8", "d9", "d10", "d11", 3014 "d12", "d13", "cc", "memory"); 3015 } 3016 3017 template <> 3018 inline void MulKernel< 3019 uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 3, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)3020 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 3021 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, 3022 RowMajor>& params, 3023 float* result) { 3024 #ifdef DEBUG 3025 #ifdef DEBUG_METAGEMM_VERBOSE 3026 std::cout << __FILE__ << "(" << __LINE__ 3027 << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " 3028 "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 3, " 3029 "8>::Multiply()" 3030 << std::endl 3031 << std::flush; 3032 #endif 3033 #endif 3034 asm volatile( 3035 "pld [%[lhs]]\n" 3036 "pld [%[rhs]]\n" 3037 3038 // Clear aggregators. 3039 "vmov.i32 q0, #0\n" 3040 "vmov.i32 q1, #0\n" 3041 "vmov.i32 q2, #0\n" 3042 3043 // General NxM lanes loop. 3044 "1:" 3045 3046 // Subtract counter. 3047 "subs %[count], %[count], #8\n" 3048 3049 "vld1.32 {d6}, [%[lhs]:64]!\n" 3050 "vld1.32 {d7, d8, d9}, [%[rhs]:64]!\n" 3051 "pld [%[lhs], #64]\n" 3052 "pld [%[rhs], #64]\n" 3053 "vmull.u8 q5, d7, d6\n" 3054 "vmull.u8 q6, d8, d6\n" 3055 "vmull.u8 q7, d9, d6\n" 3056 "vpadal.u16 q0, q5\n" 3057 "vpadal.u16 q1, q6\n" 3058 "vpadal.u16 q2, q7\n" 3059 3060 // Loop break. 3061 "bgt 1b\n" 3062 3063 // StaticQuantizationFloat::Prepare 3064 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" 3065 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" 3066 "vdup.32 q6, %[scale]\n" 3067 "vdup.32 q4, d8[0]\n" 3068 3069 // RowMajorOutput::Prepare 3070 3071 // Reduce aggregators. 3072 "vpadd.u32 d0, d0, d1\n" 3073 "vpadd.u32 d2, d2, d3\n" 3074 "vpadd.u32 d4, d4, d5\n" 3075 "vpadd.u32 d0, d0, d2\n" 3076 "vpadd.u32 d1, d4, d4\n" 3077 3078 // StaticQuantizationFloat::Transform 3079 "vadd.s32 q0, q0, q4\n" 3080 "vadd.s32 q0, q0, q5\n" 3081 "vcvt.f32.s32 q0, q0\n" 3082 "vmul.f32 q0, q0, q6\n" 3083 3084 // RowMajorOutput::Output 3085 "vst1.32 {d0}, [%[result]]!\n" 3086 "vst1.32 {d1[0]}, [%[result]]!\n" 3087 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 3088 : [count] "r"(params.kernel.count), 3089 [stride] "r"(params.output_stream.stride), 3090 [scale] "r"(params.kernel.scale) 3091 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", 3092 "d11", "d12", "d13", "d14", "d15", "cc", "memory"); 3093 } 3094 3095 template <> 3096 inline void MulKernel< 3097 uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 4, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)3098 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 3099 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, 3100 RowMajor>& params, 3101 float* result) { 3102 #ifdef DEBUG 3103 #ifdef DEBUG_METAGEMM_VERBOSE 3104 std::cout << __FILE__ << "(" << __LINE__ 3105 << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " 3106 "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 4, " 3107 "8>::Multiply()" 3108 << std::endl 3109 << std::flush; 3110 #endif 3111 #endif 3112 asm volatile( 3113 "pld [%[lhs]]\n" 3114 "pld [%[rhs]]\n" 3115 3116 // Clear aggregators. 3117 "vmov.i32 q0, #0\n" 3118 "vmov.i32 q1, #0\n" 3119 "vmov.i32 q2, #0\n" 3120 "vmov.i32 q3, q0\n" 3121 3122 // General NxM lanes loop. 3123 "1:" 3124 3125 // Subtract counter. 3126 "subs %[count], %[count], #8\n" 3127 3128 "vld1.32 {d8}, [%[lhs]:64]!\n" 3129 "vld1.32 {d9, d10, d11, d12}, [%[rhs]:64]!\n" 3130 "pld [%[lhs], #64]\n" 3131 "pld [%[rhs], #64]\n" 3132 "vmull.u8 q7, d9, d8\n" 3133 "vmull.u8 q8, d10, d8\n" 3134 "vmull.u8 q9, d11, d8\n" 3135 "vmull.u8 q10, d12, d8\n" 3136 "vpadal.u16 q0, q7\n" 3137 "vpadal.u16 q1, q8\n" 3138 "vpadal.u16 q2, q9\n" 3139 "vpadal.u16 q3, q10\n" 3140 3141 // Loop break. 3142 "bgt 1b\n" 3143 3144 // StaticQuantizationFloat::Prepare 3145 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" 3146 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" 3147 "vdup.32 q6, %[scale]\n" 3148 "vdup.32 q4, d8[0]\n" 3149 3150 // RowMajorOutput::Prepare 3151 3152 // Reduce aggregators. 3153 "vpadd.u32 d0, d0, d1\n" 3154 "vpadd.u32 d2, d2, d3\n" 3155 "vpadd.u32 d4, d4, d5\n" 3156 "vpadd.u32 d6, d6, d7\n" 3157 "vpadd.u32 d0, d0, d2\n" 3158 "vpadd.u32 d1, d4, d6\n" 3159 3160 // StaticQuantizationFloat::Transform 3161 "vadd.s32 q0, q0, q4\n" 3162 "vadd.s32 q0, q0, q5\n" 3163 "vcvt.f32.s32 q0, q0\n" 3164 "vmul.f32 q0, q0, q6\n" 3165 3166 // RowMajorOutput::Output 3167 "vst1.32 {d0, d1}, [%[result]]!\n" 3168 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 3169 : [count] "r"(params.kernel.count), 3170 [stride] "r"(params.output_stream.stride), 3171 [scale] "r"(params.kernel.scale) 3172 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", 3173 "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", 3174 "d21", "cc", "memory"); 3175 } 3176 3177 template <> 3178 inline void MulKernel< 3179 uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 5, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)3180 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 3181 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, 3182 RowMajor>& params, 3183 float* result) { 3184 #ifdef DEBUG 3185 #ifdef DEBUG_METAGEMM_VERBOSE 3186 std::cout << __FILE__ << "(" << __LINE__ 3187 << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " 3188 "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 5, " 3189 "8>::Multiply()" 3190 << std::endl 3191 << std::flush; 3192 #endif 3193 #endif 3194 asm volatile( 3195 "pld [%[lhs]]\n" 3196 "pld [%[rhs]]\n" 3197 3198 // Clear aggregators. 3199 "vmov.i32 q0, #0\n" 3200 "vmov.i32 q1, #0\n" 3201 "vmov.i32 q2, #0\n" 3202 "vmov.i32 q3, q0\n" 3203 "vmov.i32 q4, q1\n" 3204 3205 // General 1xM lanes loop. 3206 "1:" 3207 3208 // Subtract counter. 3209 "subs %[count], %[count], #8\n" 3210 3211 "vld1.32 {d10, d11, d12, d13}, [%[rhs]:64]!\n" 3212 "vld1.32 {d14}, [%[lhs]:64]!\n" 3213 "pld [%[lhs], #64]\n" 3214 "vmull.u8 q8, d10, d14\n" 3215 "vmull.u8 q9, d11, d14\n" 3216 "vmull.u8 q10, d12, d14\n" 3217 "vmull.u8 q11, d13, d14\n" 3218 "vld1.32 {d10}, [%[rhs]:64]!\n" 3219 "pld [%[rhs], #128]\n" 3220 "vpadal.u16 q0, q8\n" 3221 "vpadal.u16 q1, q9\n" 3222 "vpadal.u16 q2, q10\n" 3223 "vpadal.u16 q3, q11\n" 3224 "vmull.u8 q8, d10, d14\n" 3225 "vpadal.u16 q4, q8\n" 3226 3227 // Loop break. 3228 "bgt 1b\n" 3229 3230 // StaticQuantizationFloat::Prepare 3231 "vld1.32 {d10, d11}, [%[lhs]:64]!\n" 3232 "vld1.32 {d12, d13, d14, d15}, [%[rhs]:64]!\n" 3233 "vdup.32 q8, %[scale]\n" 3234 "vdup.32 q5, d10[0]\n" 3235 3236 // RowMajorOutput::Prepare 3237 3238 // Reduce aggregators. 3239 "vpadd.u32 d0, d0, d1\n" 3240 "vpadd.u32 d2, d2, d3\n" 3241 "vpadd.u32 d4, d4, d5\n" 3242 "vpadd.u32 d6, d6, d7\n" 3243 "vpadd.u32 d8, d8, d9\n" 3244 "vpadd.u32 d0, d0, d2\n" 3245 "vpadd.u32 d1, d4, d6\n" 3246 "vpadd.u32 d2, d8, d8\n" 3247 3248 // StaticQuantizationFloat::Transform 3249 "vadd.s32 q0, q0, q5\n" 3250 "vadd.s32 q1, q1, q5\n" 3251 "vadd.s32 q0, q0, q6\n" 3252 "vadd.s32 q1, q1, q7\n" 3253 "vcvt.f32.s32 q0, q0\n" 3254 "vcvt.f32.s32 q1, q1\n" 3255 "vmul.f32 q0, q0, q8\n" 3256 "vmul.f32 q1, q1, q8\n" 3257 3258 // RowMajorOutput::Output 3259 "vst1.32 {d0, d1}, [%[result]]!\n" 3260 "vst1.32 {d2[0]}, [%[result]]!\n" 3261 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 3262 : [count] "r"(params.kernel.count), 3263 [stride] "r"(params.output_stream.stride), 3264 [scale] "r"(params.kernel.scale) 3265 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", 3266 "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", 3267 "d21", "d22", "d23", "cc", "memory"); 3268 } 3269 3270 template <> 3271 inline void MulKernel< 3272 uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 6, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)3273 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 3274 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, 3275 RowMajor>& params, 3276 float* result) { 3277 #ifdef DEBUG 3278 #ifdef DEBUG_METAGEMM_VERBOSE 3279 std::cout << __FILE__ << "(" << __LINE__ 3280 << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " 3281 "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 6, " 3282 "8>::Multiply()" 3283 << std::endl 3284 << std::flush; 3285 #endif 3286 #endif 3287 asm volatile( 3288 "pld [%[lhs]]\n" 3289 "pld [%[rhs]]\n" 3290 3291 // Clear aggregators. 3292 "vmov.i32 q0, #0\n" 3293 "vmov.i32 q1, #0\n" 3294 "vmov.i32 q2, #0\n" 3295 "vmov.i32 q3, q0\n" 3296 "vmov.i32 q4, q1\n" 3297 "vmov.i32 q5, q2\n" 3298 3299 // General 1xM lanes loop. 3300 "1:" 3301 3302 // Subtract counter. 3303 "subs %[count], %[count], #8\n" 3304 3305 "vld1.32 {d12, d13, d14, d15}, [%[rhs]:64]!\n" 3306 "vld1.32 {d16}, [%[lhs]:64]!\n" 3307 "pld [%[lhs], #64]\n" 3308 "vmull.u8 q9, d12, d16\n" 3309 "vmull.u8 q10, d13, d16\n" 3310 "vmull.u8 q11, d14, d16\n" 3311 "vmull.u8 q12, d15, d16\n" 3312 "vld1.32 {d12, d13}, [%[rhs]:64]!\n" 3313 "pld [%[rhs], #128]\n" 3314 "vpadal.u16 q0, q9\n" 3315 "vpadal.u16 q1, q10\n" 3316 "vpadal.u16 q2, q11\n" 3317 "vpadal.u16 q3, q12\n" 3318 "vmull.u8 q9, d12, d16\n" 3319 "vmull.u8 q10, d13, d16\n" 3320 "vpadal.u16 q4, q9\n" 3321 "vpadal.u16 q5, q10\n" 3322 3323 // Loop break. 3324 "bgt 1b\n" 3325 3326 // StaticQuantizationFloat::Prepare 3327 "vld1.32 {d12, d13}, [%[lhs]:64]!\n" 3328 "vld1.32 {d14, d15, d16, d17}, [%[rhs]:64]!\n" 3329 "vdup.32 q9, %[scale]\n" 3330 "vdup.32 q6, d12[0]\n" 3331 3332 // RowMajorOutput::Prepare 3333 3334 // Reduce aggregators. 3335 "vpadd.u32 d0, d0, d1\n" 3336 "vpadd.u32 d2, d2, d3\n" 3337 "vpadd.u32 d4, d4, d5\n" 3338 "vpadd.u32 d6, d6, d7\n" 3339 "vpadd.u32 d8, d8, d9\n" 3340 "vpadd.u32 d10, d10, d11\n" 3341 "vpadd.u32 d0, d0, d2\n" 3342 "vpadd.u32 d1, d4, d6\n" 3343 "vpadd.u32 d2, d8, d10\n" 3344 3345 // StaticQuantizationFloat::Transform 3346 "vadd.s32 q0, q0, q6\n" 3347 "vadd.s32 q1, q1, q6\n" 3348 "vadd.s32 q0, q0, q7\n" 3349 "vadd.s32 q1, q1, q8\n" 3350 "vcvt.f32.s32 q0, q0\n" 3351 "vcvt.f32.s32 q1, q1\n" 3352 "vmul.f32 q0, q0, q9\n" 3353 "vmul.f32 q1, q1, q9\n" 3354 3355 // RowMajorOutput::Output 3356 "vst1.32 {d0, d1, d2}, [%[result]]!\n" 3357 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 3358 : [count] "r"(params.kernel.count), 3359 [stride] "r"(params.output_stream.stride), 3360 [scale] "r"(params.kernel.scale) 3361 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", 3362 "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", 3363 "d21", "d22", "d23", "d24", "d25", "cc", "memory"); 3364 } 3365 3366 template <> 3367 inline void MulKernel< 3368 uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 7, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)3369 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 3370 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, 3371 RowMajor>& params, 3372 float* result) { 3373 #ifdef DEBUG 3374 #ifdef DEBUG_METAGEMM_VERBOSE 3375 std::cout << __FILE__ << "(" << __LINE__ 3376 << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " 3377 "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 7, " 3378 "8>::Multiply()" 3379 << std::endl 3380 << std::flush; 3381 #endif 3382 #endif 3383 asm volatile( 3384 "pld [%[lhs]]\n" 3385 "pld [%[rhs]]\n" 3386 3387 // Clear aggregators. 3388 "vmov.i32 q0, #0\n" 3389 "vmov.i32 q1, #0\n" 3390 "vmov.i32 q2, #0\n" 3391 "vmov.i32 q3, q0\n" 3392 "vmov.i32 q4, q1\n" 3393 "vmov.i32 q5, q2\n" 3394 "vmov.i32 q6, q3\n" 3395 3396 // General 1xM lanes loop. 3397 "1:" 3398 3399 // Subtract counter. 3400 "subs %[count], %[count], #8\n" 3401 3402 "vld1.32 {d14, d15, d16, d17}, [%[rhs]:64]!\n" 3403 "vld1.32 {d18}, [%[lhs]:64]!\n" 3404 "pld [%[lhs], #64]\n" 3405 "vmull.u8 q10, d14, d18\n" 3406 "vmull.u8 q11, d15, d18\n" 3407 "vmull.u8 q12, d16, d18\n" 3408 "vmull.u8 q13, d17, d18\n" 3409 "vld1.32 {d14, d15, d16}, [%[rhs]:64]!\n" 3410 "pld [%[rhs], #128]\n" 3411 "vpadal.u16 q0, q10\n" 3412 "vpadal.u16 q1, q11\n" 3413 "vpadal.u16 q2, q12\n" 3414 "vpadal.u16 q3, q13\n" 3415 "vmull.u8 q10, d14, d18\n" 3416 "vmull.u8 q11, d15, d18\n" 3417 "vmull.u8 q12, d16, d18\n" 3418 "vpadal.u16 q4, q10\n" 3419 "vpadal.u16 q5, q11\n" 3420 "vpadal.u16 q6, q12\n" 3421 3422 // Loop break. 3423 "bgt 1b\n" 3424 3425 // StaticQuantizationFloat::Prepare 3426 "vld1.32 {d14, d15}, [%[lhs]:64]!\n" 3427 "vld1.32 {d16, d17, d18, d19}, [%[rhs]:64]!\n" 3428 "vdup.32 q10, %[scale]\n" 3429 "vdup.32 q7, d14[0]\n" 3430 3431 // RowMajorOutput::Prepare 3432 3433 // Reduce aggregators. 3434 "vpadd.u32 d0, d0, d1\n" 3435 "vpadd.u32 d2, d2, d3\n" 3436 "vpadd.u32 d4, d4, d5\n" 3437 "vpadd.u32 d6, d6, d7\n" 3438 "vpadd.u32 d8, d8, d9\n" 3439 "vpadd.u32 d10, d10, d11\n" 3440 "vpadd.u32 d12, d12, d13\n" 3441 "vpadd.u32 d0, d0, d2\n" 3442 "vpadd.u32 d1, d4, d6\n" 3443 "vpadd.u32 d2, d8, d10\n" 3444 "vpadd.u32 d3, d12, d12\n" 3445 3446 // StaticQuantizationFloat::Transform 3447 "vadd.s32 q0, q0, q7\n" 3448 "vadd.s32 q1, q1, q7\n" 3449 "vadd.s32 q0, q0, q8\n" 3450 "vadd.s32 q1, q1, q9\n" 3451 "vcvt.f32.s32 q0, q0\n" 3452 "vcvt.f32.s32 q1, q1\n" 3453 "vmul.f32 q0, q0, q10\n" 3454 "vmul.f32 q1, q1, q10\n" 3455 3456 // RowMajorOutput::Output 3457 "vst1.32 {d0, d1, d2}, [%[result]]!\n" 3458 "vst1.32 {d3[0]}, [%[result]]!\n" 3459 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 3460 : [count] "r"(params.kernel.count), 3461 [stride] "r"(params.output_stream.stride), 3462 [scale] "r"(params.kernel.scale) 3463 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", 3464 "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", 3465 "d21", "d22", "d23", "d24", "d25", "d26", "d27", "cc", "memory"); 3466 } 3467 3468 template <> 3469 inline void MulKernel< 3470 uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 8, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)3471 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 3472 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, 3473 RowMajor>& params, 3474 float* result) { 3475 #ifdef DEBUG 3476 #ifdef DEBUG_METAGEMM_VERBOSE 3477 std::cout << __FILE__ << "(" << __LINE__ 3478 << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " 3479 "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 8, " 3480 "8>::Multiply()" 3481 << std::endl 3482 << std::flush; 3483 #endif 3484 #endif 3485 asm volatile( 3486 "pld [%[lhs]]\n" 3487 "pld [%[rhs]]\n" 3488 3489 // Clear aggregators. 3490 "vmov.i32 q0, #0\n" 3491 "vmov.i32 q1, #0\n" 3492 "vmov.i32 q2, #0\n" 3493 "vmov.i32 q3, q0\n" 3494 "vmov.i32 q4, q1\n" 3495 "vmov.i32 q5, q2\n" 3496 "vmov.i32 q6, q3\n" 3497 "vmov.i32 q7, q4\n" 3498 3499 // 1x8 lanes loop. 3500 "1:" 3501 3502 "vld1.32 {d17, d18, d19, d20}, [%[rhs]:256]!\n" 3503 "vld1.32 {d16}, [%[lhs]:64]!\n" 3504 "vmull.u8 q11, d16, d17\n" 3505 "vmull.u8 q12, d16, d18\n" 3506 "vmull.u8 q13, d16, d19\n" 3507 "vmull.u8 q14, d16, d20\n" 3508 "vld1.32 {d17, d18, d19, d20}, [%[rhs]:256]!\n" 3509 "vpadal.u16 q0, q11\n" 3510 "vpadal.u16 q1, q12\n" 3511 "vpadal.u16 q2, q13\n" 3512 "vpadal.u16 q3, q14\n" 3513 "pld [%[rhs], #256]\n" 3514 "vmull.u8 q15, d16, d17\n" 3515 "vmull.u8 q11, d16, d18\n" 3516 "vmull.u8 q12, d16, d19\n" 3517 "vmull.u8 q13, d16, d20\n" 3518 "pld [%[lhs], #32]\n" 3519 3520 // Subtract counter. 3521 "subs %[count], %[count], #8\n" 3522 3523 "vpadal.u16 q4, q15\n" 3524 "vpadal.u16 q5, q11\n" 3525 "vpadal.u16 q6, q12\n" 3526 "vpadal.u16 q7, q13\n" 3527 3528 // Loop break. 3529 "bgt 1b\n" 3530 3531 // StaticQuantizationFloat::Prepare 3532 "vld1.32 {d16, d17}, [%[lhs]:64]!\n" 3533 "vld1.32 {d18, d19, d20, d21}, [%[rhs]:64]!\n" 3534 "vdup.32 q11, %[scale]\n" 3535 "vdup.32 q8, d16[0]\n" 3536 3537 // RowMajorOutput::Prepare 3538 3539 // Reduce aggregators. 3540 "vpadd.u32 d0, d0, d1\n" 3541 "vpadd.u32 d2, d2, d3\n" 3542 "vpadd.u32 d4, d4, d5\n" 3543 "vpadd.u32 d6, d6, d7\n" 3544 "vpadd.u32 d8, d8, d9\n" 3545 "vpadd.u32 d10, d10, d11\n" 3546 "vpadd.u32 d12, d12, d13\n" 3547 "vpadd.u32 d14, d14, d15\n" 3548 "vpadd.u32 d0, d0, d2\n" 3549 "vpadd.u32 d1, d4, d6\n" 3550 "vpadd.u32 d2, d8, d10\n" 3551 "vpadd.u32 d3, d12, d14\n" 3552 3553 // StaticQuantizationFloat::Transform 3554 "vadd.s32 q0, q0, q8\n" 3555 "vadd.s32 q1, q1, q8\n" 3556 "vadd.s32 q0, q0, q9\n" 3557 "vadd.s32 q1, q1, q10\n" 3558 "vcvt.f32.s32 q0, q0\n" 3559 "vcvt.f32.s32 q1, q1\n" 3560 "vmul.f32 q0, q0, q11\n" 3561 "vmul.f32 q1, q1, q11\n" 3562 3563 // RowMajorOutput::Output 3564 "vst1.32 {d0, d1, d2, d3}, [%[result]]!\n" 3565 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 3566 : [count] "r"(params.kernel.count), 3567 [stride] "r"(params.output_stream.stride), 3568 [scale] "r"(params.kernel.scale) 3569 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", 3570 "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", 3571 "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", 3572 "d31", "cc", "memory"); 3573 } 3574 3575 template <> 3576 inline void MulKernel< 3577 uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 1, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)3578 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 3579 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, 3580 RowMajor>& params, 3581 float* result) { 3582 #ifdef DEBUG 3583 #ifdef DEBUG_METAGEMM_VERBOSE 3584 std::cout << __FILE__ << "(" << __LINE__ 3585 << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " 3586 "QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 1, " 3587 "8>::Multiply()" 3588 << std::endl 3589 << std::flush; 3590 #endif 3591 #endif 3592 asm volatile( 3593 "pld [%[lhs]]\n" 3594 "pld [%[rhs]]\n" 3595 3596 // Clear aggregators. 3597 "vmov.i32 q0, #0\n" 3598 "vmov.i32 q1, #0\n" 3599 3600 // General NxM lanes loop. 3601 "1:" 3602 3603 // Subtract counter. 3604 "subs %[count], %[count], #8\n" 3605 3606 "vld1.32 {d4, d5}, [%[lhs]:64]!\n" 3607 "vld1.32 {d6}, [%[rhs]:64]!\n" 3608 "pld [%[lhs], #64]\n" 3609 "pld [%[rhs], #64]\n" 3610 "vmull.u8 q4, d6, d4\n" 3611 "vmull.u8 q5, d6, d5\n" 3612 "vpadal.u16 q0, q4\n" 3613 "vpadal.u16 q1, q5\n" 3614 3615 // Loop break. 3616 "bgt 1b\n" 3617 3618 // StaticQuantizationFloat::Prepare 3619 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" 3620 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" 3621 "vdup.32 q6, %[scale]\n" 3622 "vdup.32 q2, d8[0]\n" 3623 "vdup.32 q4, d8[1]\n" 3624 3625 // RowMajorOutput::Prepare 3626 "add r0, %[result], %[stride]\n" 3627 3628 // Reduce aggregators. 3629 "vpadd.u32 d0, d0, d1\n" 3630 "vpadd.u32 d0, d0, d0\n" 3631 "vpadd.u32 d2, d2, d3\n" 3632 "vpadd.u32 d2, d2, d2\n" 3633 3634 // StaticQuantizationFloat::Transform 3635 "vadd.s32 q0, q0, q2\n" 3636 "vadd.s32 q1, q1, q4\n" 3637 "vadd.s32 q0, q0, q5\n" 3638 "vadd.s32 q1, q1, q5\n" 3639 "vcvt.f32.s32 q0, q0\n" 3640 "vcvt.f32.s32 q1, q1\n" 3641 "vmul.f32 q0, q0, q6\n" 3642 "vmul.f32 q1, q1, q6\n" 3643 3644 // RowMajorOutput::Output 3645 "vst1.32 {d0[0]}, [%[result]]!\n" 3646 "vst1.32 {d2[0]}, [r0]!\n" 3647 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 3648 : [count] "r"(params.kernel.count), 3649 [stride] "r"(params.output_stream.stride), 3650 [scale] "r"(params.kernel.scale) 3651 : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d8", "d9", "d10", 3652 "d11", "d12", "d13", "cc", "memory"); 3653 } 3654 3655 template <> 3656 inline void MulKernel< 3657 uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 2, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)3658 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 3659 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, 3660 RowMajor>& params, 3661 float* result) { 3662 #ifdef DEBUG 3663 #ifdef DEBUG_METAGEMM_VERBOSE 3664 std::cout << __FILE__ << "(" << __LINE__ 3665 << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " 3666 "QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 2, " 3667 "8>::Multiply()" 3668 << std::endl 3669 << std::flush; 3670 #endif 3671 #endif 3672 asm volatile( 3673 "pld [%[lhs]]\n" 3674 "pld [%[rhs]]\n" 3675 3676 // Clear aggregators. 3677 "vmov.i32 q0, #0\n" 3678 "vmov.i32 q1, #0\n" 3679 "vmov.i32 q2, #0\n" 3680 "vmov.i32 q3, q0\n" 3681 3682 // General NxM lanes loop. 3683 "1:" 3684 3685 // Subtract counter. 3686 "subs %[count], %[count], #8\n" 3687 3688 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" 3689 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" 3690 "pld [%[lhs], #64]\n" 3691 "pld [%[rhs], #64]\n" 3692 "vmull.u8 q6, d10, d8\n" 3693 "vmull.u8 q7, d11, d8\n" 3694 "vmull.u8 q8, d10, d9\n" 3695 "vmull.u8 q9, d11, d9\n" 3696 "vpadal.u16 q0, q6\n" 3697 "vpadal.u16 q1, q7\n" 3698 "vpadal.u16 q2, q8\n" 3699 "vpadal.u16 q3, q9\n" 3700 3701 // Loop break. 3702 "bgt 1b\n" 3703 3704 // StaticQuantizationFloat::Prepare 3705 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" 3706 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" 3707 "vdup.32 q6, %[scale]\n" 3708 "vdup.32 q7, d8[0]\n" 3709 "vdup.32 q4, d8[1]\n" 3710 3711 // RowMajorOutput::Prepare 3712 "add r0, %[result], %[stride]\n" 3713 3714 // Reduce aggregators. 3715 "vpadd.u32 d0, d0, d1\n" 3716 "vpadd.u32 d2, d2, d3\n" 3717 "vpadd.u32 d0, d0, d2\n" 3718 "vpadd.u32 d4, d4, d5\n" 3719 "vpadd.u32 d6, d6, d7\n" 3720 "vpadd.u32 d4, d4, d6\n" 3721 3722 // StaticQuantizationFloat::Transform 3723 "vadd.s32 q0, q0, q7\n" 3724 "vadd.s32 q2, q2, q4\n" 3725 "vadd.s32 q0, q0, q5\n" 3726 "vadd.s32 q2, q2, q5\n" 3727 "vcvt.f32.s32 q0, q0\n" 3728 "vcvt.f32.s32 q2, q2\n" 3729 "vmul.f32 q0, q0, q6\n" 3730 "vmul.f32 q2, q2, q6\n" 3731 3732 // RowMajorOutput::Output 3733 "vst1.32 {d0}, [%[result]]!\n" 3734 "vst1.32 {d4}, [r0]!\n" 3735 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 3736 : [count] "r"(params.kernel.count), 3737 [stride] "r"(params.output_stream.stride), 3738 [scale] "r"(params.kernel.scale) 3739 : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", 3740 "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "cc", 3741 "memory"); 3742 } 3743 3744 template <> 3745 inline void MulKernel< 3746 uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 3, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)3747 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 3748 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, 3749 RowMajor>& params, 3750 float* result) { 3751 #ifdef DEBUG 3752 #ifdef DEBUG_METAGEMM_VERBOSE 3753 std::cout << __FILE__ << "(" << __LINE__ 3754 << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " 3755 "QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 3, " 3756 "8>::Multiply()" 3757 << std::endl 3758 << std::flush; 3759 #endif 3760 #endif 3761 asm volatile( 3762 "pld [%[lhs]]\n" 3763 "pld [%[rhs]]\n" 3764 3765 // Clear aggregators. 3766 "vmov.i32 q0, #0\n" 3767 "vmov.i32 q1, #0\n" 3768 "vmov.i32 q2, #0\n" 3769 "vmov.i32 q3, q0\n" 3770 "vmov.i32 q4, q1\n" 3771 "vmov.i32 q5, q2\n" 3772 3773 // General NxM lanes loop. 3774 "1:" 3775 3776 // Subtract counter. 3777 "subs %[count], %[count], #8\n" 3778 3779 "vld1.32 {d12, d13}, [%[lhs]:64]!\n" 3780 "vld1.32 {d14, d15, d16}, [%[rhs]:64]!\n" 3781 "pld [%[lhs], #64]\n" 3782 "pld [%[rhs], #64]\n" 3783 "vmull.u8 q9, d14, d12\n" 3784 "vmull.u8 q10, d15, d12\n" 3785 "vmull.u8 q11, d16, d12\n" 3786 "vmull.u8 q12, d14, d13\n" 3787 "vmull.u8 q13, d15, d13\n" 3788 "vmull.u8 q14, d16, d13\n" 3789 "vpadal.u16 q0, q9\n" 3790 "vpadal.u16 q1, q10\n" 3791 "vpadal.u16 q2, q11\n" 3792 "vpadal.u16 q3, q12\n" 3793 "vpadal.u16 q4, q13\n" 3794 "vpadal.u16 q5, q14\n" 3795 3796 // Loop break. 3797 "bgt 1b\n" 3798 3799 // StaticQuantizationFloat::Prepare 3800 "vld1.32 {d12, d13}, [%[lhs]:64]!\n" 3801 "vld1.32 {d14, d15}, [%[rhs]:64]!\n" 3802 "vdup.32 q8, %[scale]\n" 3803 "vdup.32 q9, d12[0]\n" 3804 "vdup.32 q6, d12[1]\n" 3805 3806 // RowMajorOutput::Prepare 3807 "add r0, %[result], %[stride]\n" 3808 3809 // Reduce aggregators. 3810 "vpadd.u32 d0, d0, d1\n" 3811 "vpadd.u32 d2, d2, d3\n" 3812 "vpadd.u32 d4, d4, d5\n" 3813 "vpadd.u32 d0, d0, d2\n" 3814 "vpadd.u32 d1, d4, d4\n" 3815 "vpadd.u32 d6, d6, d7\n" 3816 "vpadd.u32 d8, d8, d9\n" 3817 "vpadd.u32 d10, d10, d11\n" 3818 "vpadd.u32 d6, d6, d8\n" 3819 "vpadd.u32 d7, d10, d10\n" 3820 3821 // StaticQuantizationFloat::Transform 3822 "vadd.s32 q0, q0, q9\n" 3823 "vadd.s32 q3, q3, q6\n" 3824 "vadd.s32 q0, q0, q7\n" 3825 "vadd.s32 q3, q3, q7\n" 3826 "vcvt.f32.s32 q0, q0\n" 3827 "vcvt.f32.s32 q3, q3\n" 3828 "vmul.f32 q0, q0, q8\n" 3829 "vmul.f32 q3, q3, q8\n" 3830 3831 // RowMajorOutput::Output 3832 "vst1.32 {d0}, [%[result]]!\n" 3833 "vst1.32 {d1[0]}, [%[result]]!\n" 3834 "vst1.32 {d6}, [r0]!\n" 3835 "vst1.32 {d7[0]}, [r0]!\n" 3836 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 3837 : [count] "r"(params.kernel.count), 3838 [stride] "r"(params.output_stream.stride), 3839 [scale] "r"(params.kernel.scale) 3840 : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", 3841 "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", 3842 "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "cc", 3843 "memory"); 3844 } 3845 3846 template <> 3847 inline void MulKernel< 3848 uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 4, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)3849 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 3850 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, 3851 RowMajor>& params, 3852 float* result) { 3853 #ifdef DEBUG 3854 #ifdef DEBUG_METAGEMM_VERBOSE 3855 std::cout << __FILE__ << "(" << __LINE__ 3856 << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " 3857 "QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 4, " 3858 "8>::Multiply()" 3859 << std::endl 3860 << std::flush; 3861 #endif 3862 #endif 3863 asm volatile( 3864 "pld [%[lhs]]\n" 3865 "pld [%[rhs]]\n" 3866 3867 // Clear aggregators. 3868 "vmov.i32 q0, #0\n" 3869 "vmov.i32 q1, #0\n" 3870 "vmov.i32 q2, #0\n" 3871 "vmov.i32 q3, q0\n" 3872 "vmov.i32 q4, q1\n" 3873 "vmov.i32 q5, q2\n" 3874 "vmov.i32 q6, q3\n" 3875 "vmov.i32 q7, q4\n" 3876 3877 // 2x4 lanes loop. 3878 "1:" 3879 3880 "vld1.8 {d18, d19, d20, d21}, [%[rhs]:256]!\n" 3881 "vld1.8 {d16}, [%[lhs]:64]!\n" 3882 "vmull.u8 q11, d16, d18\n" 3883 "vld1.8 {d17}, [%[lhs]:64]!\n" 3884 "vmull.u8 q12, d16, d19\n" 3885 "pld [%[rhs], #64]\n" 3886 "vmull.u8 q13, d16, d20\n" 3887 "pld [%[lhs], #64]\n" 3888 "vmull.u8 q14, d16, d21\n" 3889 "vmull.u8 q15, d17, d18\n" 3890 "vpadal.u16 q0, q11\n" 3891 "vpadal.u16 q1, q12\n" 3892 "vpadal.u16 q2, q13\n" 3893 "vmull.u8 q11, d17, d19\n" 3894 "vmull.u8 q12, d17, d20\n" 3895 "vmull.u8 q13, d17, d21\n" 3896 3897 // Subtract counter. 3898 "subs %[count], %[count], #8\n" 3899 3900 "vpadal.u16 q3, q14\n" 3901 "vpadal.u16 q4, q15\n" 3902 "vpadal.u16 q5, q11\n" 3903 "vpadal.u16 q6, q12\n" 3904 "vpadal.u16 q7, q13\n" 3905 3906 // Loop break. 3907 "bgt 1b\n" 3908 3909 // StaticQuantizationFloat::Prepare 3910 "vld1.32 {d16, d17}, [%[lhs]:64]!\n" 3911 "vld1.32 {d18, d19}, [%[rhs]:64]!\n" 3912 "vdup.32 q10, %[scale]\n" 3913 "vdup.32 q11, d16[0]\n" 3914 "vdup.32 q8, d16[1]\n" 3915 3916 // RowMajorOutput::Prepare 3917 "add r0, %[result], %[stride]\n" 3918 3919 // Reduce aggregators. 3920 "vpadd.u32 d0, d0, d1\n" 3921 "vpadd.u32 d2, d2, d3\n" 3922 "vpadd.u32 d4, d4, d5\n" 3923 "vpadd.u32 d6, d6, d7\n" 3924 "vpadd.u32 d0, d0, d2\n" 3925 "vpadd.u32 d1, d4, d6\n" 3926 "vpadd.u32 d8, d8, d9\n" 3927 "vpadd.u32 d10, d10, d11\n" 3928 "vpadd.u32 d12, d12, d13\n" 3929 "vpadd.u32 d14, d14, d15\n" 3930 "vpadd.u32 d8, d8, d10\n" 3931 "vpadd.u32 d9, d12, d14\n" 3932 3933 // StaticQuantizationFloat::Transform 3934 "vadd.s32 q0, q0, q11\n" 3935 "vadd.s32 q4, q4, q8\n" 3936 "vadd.s32 q0, q0, q9\n" 3937 "vadd.s32 q4, q4, q9\n" 3938 "vcvt.f32.s32 q0, q0\n" 3939 "vcvt.f32.s32 q4, q4\n" 3940 "vmul.f32 q0, q0, q10\n" 3941 "vmul.f32 q4, q4, q10\n" 3942 3943 // RowMajorOutput::Output 3944 "vst1.32 {d0, d1}, [%[result]]!\n" 3945 "vst1.32 {d8, d9}, [r0]!\n" 3946 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 3947 : [count] "r"(params.kernel.count), 3948 [stride] "r"(params.output_stream.stride), 3949 [scale] "r"(params.kernel.scale) 3950 : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", 3951 "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", 3952 "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", 3953 "d31", "cc", "memory"); 3954 } 3955 3956 template <> 3957 inline void MulKernel< 3958 uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 1, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)3959 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 3960 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, 3961 RowMajor>& params, 3962 float* result) { 3963 #ifdef DEBUG 3964 #ifdef DEBUG_METAGEMM_VERBOSE 3965 std::cout << __FILE__ << "(" << __LINE__ 3966 << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " 3967 "QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 1, " 3968 "8>::Multiply()" 3969 << std::endl 3970 << std::flush; 3971 #endif 3972 #endif 3973 asm volatile( 3974 "pld [%[lhs]]\n" 3975 "pld [%[rhs]]\n" 3976 3977 // Clear aggregators. 3978 "vmov.i32 q0, #0\n" 3979 "vmov.i32 q1, #0\n" 3980 "vmov.i32 q2, #0\n" 3981 3982 // General NxM lanes loop. 3983 "1:" 3984 3985 // Subtract counter. 3986 "subs %[count], %[count], #8\n" 3987 3988 "vld1.32 {d6, d7, d8}, [%[lhs]:64]!\n" 3989 "vld1.32 {d9}, [%[rhs]:64]!\n" 3990 "pld [%[lhs], #64]\n" 3991 "pld [%[rhs], #64]\n" 3992 "vmull.u8 q5, d9, d6\n" 3993 "vmull.u8 q6, d9, d7\n" 3994 "vmull.u8 q7, d9, d8\n" 3995 "vpadal.u16 q0, q5\n" 3996 "vpadal.u16 q1, q6\n" 3997 "vpadal.u16 q2, q7\n" 3998 3999 // Loop break. 4000 "bgt 1b\n" 4001 4002 // StaticQuantizationFloat::Prepare 4003 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" 4004 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" 4005 "vdup.32 q6, %[scale]\n" 4006 "vdup.32 q3, d8[0]\n" 4007 "vdup.32 q7, d8[1]\n" 4008 "vdup.32 q4, d9[0]\n" 4009 4010 // RowMajorOutput::Prepare 4011 "add r0, %[result], %[stride]\n" 4012 "add r1, r0, %[stride]\n" 4013 4014 // Reduce aggregators. 4015 "vpadd.u32 d0, d0, d1\n" 4016 "vpadd.u32 d0, d0, d0\n" 4017 "vpadd.u32 d2, d2, d3\n" 4018 "vpadd.u32 d2, d2, d2\n" 4019 "vpadd.u32 d4, d4, d5\n" 4020 "vpadd.u32 d4, d4, d4\n" 4021 4022 // StaticQuantizationFloat::Transform 4023 "vadd.s32 q0, q0, q3\n" 4024 "vadd.s32 q1, q1, q7\n" 4025 "vadd.s32 q2, q2, q4\n" 4026 "vadd.s32 q0, q0, q5\n" 4027 "vadd.s32 q1, q1, q5\n" 4028 "vadd.s32 q2, q2, q5\n" 4029 "vcvt.f32.s32 q0, q0\n" 4030 "vcvt.f32.s32 q1, q1\n" 4031 "vcvt.f32.s32 q2, q2\n" 4032 "vmul.f32 q0, q0, q6\n" 4033 "vmul.f32 q1, q1, q6\n" 4034 "vmul.f32 q2, q2, q6\n" 4035 4036 // RowMajorOutput::Output 4037 "vst1.32 {d0[0]}, [%[result]]!\n" 4038 "vst1.32 {d2[0]}, [r0]!\n" 4039 "vst1.32 {d4[0]}, [r1]!\n" 4040 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 4041 : [count] "r"(params.kernel.count), 4042 [stride] "r"(params.output_stream.stride), 4043 [scale] "r"(params.kernel.scale) 4044 : "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", 4045 "d10", "d11", "d12", "d13", "d14", "d15", "cc", "memory"); 4046 } 4047 4048 template <> 4049 inline void MulKernel< 4050 uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 2, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)4051 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 4052 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, 4053 RowMajor>& params, 4054 float* result) { 4055 #ifdef DEBUG 4056 #ifdef DEBUG_METAGEMM_VERBOSE 4057 std::cout << __FILE__ << "(" << __LINE__ 4058 << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " 4059 "QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 2, " 4060 "8>::Multiply()" 4061 << std::endl 4062 << std::flush; 4063 #endif 4064 #endif 4065 asm volatile( 4066 "pld [%[lhs]]\n" 4067 "pld [%[rhs]]\n" 4068 4069 // Clear aggregators. 4070 "vmov.i32 q0, #0\n" 4071 "vmov.i32 q1, #0\n" 4072 "vmov.i32 q2, #0\n" 4073 "vmov.i32 q3, q0\n" 4074 "vmov.i32 q4, q1\n" 4075 "vmov.i32 q5, q2\n" 4076 4077 // General NxM lanes loop. 4078 "1:" 4079 4080 // Subtract counter. 4081 "subs %[count], %[count], #8\n" 4082 4083 "vld1.32 {d12, d13, d14}, [%[lhs]:64]!\n" 4084 "vld1.32 {d15, d16}, [%[rhs]:64]!\n" 4085 "pld [%[lhs], #64]\n" 4086 "pld [%[rhs], #64]\n" 4087 "vmull.u8 q9, d15, d12\n" 4088 "vmull.u8 q10, d16, d12\n" 4089 "vmull.u8 q11, d15, d13\n" 4090 "vmull.u8 q12, d16, d13\n" 4091 "vmull.u8 q13, d15, d14\n" 4092 "vmull.u8 q14, d16, d14\n" 4093 "vpadal.u16 q0, q9\n" 4094 "vpadal.u16 q1, q10\n" 4095 "vpadal.u16 q2, q11\n" 4096 "vpadal.u16 q3, q12\n" 4097 "vpadal.u16 q4, q13\n" 4098 "vpadal.u16 q5, q14\n" 4099 4100 // Loop break. 4101 "bgt 1b\n" 4102 4103 // StaticQuantizationFloat::Prepare 4104 "vld1.32 {d12, d13}, [%[lhs]:64]!\n" 4105 "vld1.32 {d14, d15}, [%[rhs]:64]!\n" 4106 "vdup.32 q8, %[scale]\n" 4107 "vdup.32 q9, d12[0]\n" 4108 "vdup.32 q10, d12[1]\n" 4109 "vdup.32 q6, d13[0]\n" 4110 4111 // RowMajorOutput::Prepare 4112 "add r0, %[result], %[stride]\n" 4113 "add r1, r0, %[stride]\n" 4114 4115 // Reduce aggregators. 4116 "vpadd.u32 d0, d0, d1\n" 4117 "vpadd.u32 d2, d2, d3\n" 4118 "vpadd.u32 d0, d0, d2\n" 4119 "vpadd.u32 d4, d4, d5\n" 4120 "vpadd.u32 d6, d6, d7\n" 4121 "vpadd.u32 d4, d4, d6\n" 4122 "vpadd.u32 d8, d8, d9\n" 4123 "vpadd.u32 d10, d10, d11\n" 4124 "vpadd.u32 d8, d8, d10\n" 4125 4126 // StaticQuantizationFloat::Transform 4127 "vadd.s32 q0, q0, q9\n" 4128 "vadd.s32 q2, q2, q10\n" 4129 "vadd.s32 q4, q4, q6\n" 4130 "vadd.s32 q0, q0, q7\n" 4131 "vadd.s32 q2, q2, q7\n" 4132 "vadd.s32 q4, q4, q7\n" 4133 "vcvt.f32.s32 q0, q0\n" 4134 "vcvt.f32.s32 q2, q2\n" 4135 "vcvt.f32.s32 q4, q4\n" 4136 "vmul.f32 q0, q0, q8\n" 4137 "vmul.f32 q2, q2, q8\n" 4138 "vmul.f32 q4, q4, q8\n" 4139 4140 // RowMajorOutput::Output 4141 "vst1.32 {d0}, [%[result]]!\n" 4142 "vst1.32 {d4}, [r0]!\n" 4143 "vst1.32 {d8}, [r1]!\n" 4144 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 4145 : [count] "r"(params.kernel.count), 4146 [stride] "r"(params.output_stream.stride), 4147 [scale] "r"(params.kernel.scale) 4148 : "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", 4149 "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", 4150 "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", 4151 "cc", "memory"); 4152 } 4153 4154 template <> 4155 inline void MulKernel< 4156 uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 3, Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)4157 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, 4158 const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, 4159 RowMajor>& params, 4160 float* result) { 4161 #ifdef DEBUG 4162 #ifdef DEBUG_METAGEMM_VERBOSE 4163 std::cout << __FILE__ << "(" << __LINE__ 4164 << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " 4165 "QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 3, " 4166 "8>::Multiply()" 4167 << std::endl 4168 << std::flush; 4169 #endif 4170 #endif 4171 asm volatile( 4172 "pld [%[lhs]]\n" 4173 "pld [%[rhs]]\n" 4174 4175 // Clear aggregators. 4176 "vmov.i32 q0, #0\n" 4177 "vmov.i32 q1, #0\n" 4178 "vmov.i32 q2, #0\n" 4179 "vmov.i32 q3, q0\n" 4180 "vmov.i32 q4, q1\n" 4181 "vmov.i32 q5, q2\n" 4182 "vmov.i32 q6, q3\n" 4183 "vmov.i32 q7, q4\n" 4184 "vmov.i32 q8, q5\n" 4185 4186 // 3x3 lanes loop. 4187 "1:" 4188 4189 "vld1.8 {d21, d22, d23}, [%[rhs]:64]!\n" 4190 "vld1.8 {d18}, [%[lhs]:64]!\n" 4191 "vmull.u8 q12, d18, d21\n" 4192 "vld1.8 {d19}, [%[lhs]:64]!\n" 4193 "vmull.u8 q13, d18, d22\n" 4194 "vld1.8 {d20}, [%[lhs]:64]!\n" 4195 "vmull.u8 q14, d18, d23\n" 4196 "pld [%[lhs], #64]\n" 4197 "vmull.u8 q15, d19, d21\n" 4198 "pld [%[rhs], #64]\n" 4199 "vpadal.u16 q0, q12\n" 4200 "vpadal.u16 q1, q13\n" 4201 "vpadal.u16 q2, q14\n" 4202 "vpadal.u16 q3, q15\n" 4203 "vmull.u8 q12, d19, d22\n" 4204 "vmull.u8 q13, d19, d23\n" 4205 "vmull.u8 q14, d20, d21\n" 4206 "vmull.u8 q15, d20, d22\n" 4207 4208 // Subtract counter. 4209 "subs %[count], %[count], #8\n" 4210 4211 "vmull.u8 q9, d20, d23\n" 4212 "vpadal.u16 q4, q12\n" 4213 "vpadal.u16 q5, q13\n" 4214 "vpadal.u16 q6, q14\n" 4215 "vpadal.u16 q7, q15\n" 4216 "vpadal.u16 q8, q9\n" 4217 4218 // Loop break. 4219 "bgt 1b\n" 4220 4221 // StaticQuantizationFloat::Prepare 4222 "vld1.32 {d18, d19}, [%[lhs]:64]!\n" 4223 "vld1.32 {d20, d21}, [%[rhs]:64]!\n" 4224 "vdup.32 q11, %[scale]\n" 4225 "vdup.32 q12, d18[0]\n" 4226 "vdup.32 q13, d18[1]\n" 4227 "vdup.32 q9, d19[0]\n" 4228 4229 // RowMajorOutput::Prepare 4230 "add r0, %[result], %[stride]\n" 4231 "add r1, r0, %[stride]\n" 4232 4233 // Reduce aggregators. 4234 "vpadd.u32 d0, d0, d1\n" 4235 "vpadd.u32 d2, d2, d3\n" 4236 "vpadd.u32 d4, d4, d5\n" 4237 "vpadd.u32 d0, d0, d2\n" 4238 "vpadd.u32 d1, d4, d4\n" 4239 "vpadd.u32 d6, d6, d7\n" 4240 "vpadd.u32 d8, d8, d9\n" 4241 "vpadd.u32 d10, d10, d11\n" 4242 "vpadd.u32 d6, d6, d8\n" 4243 "vpadd.u32 d7, d10, d10\n" 4244 "vpadd.u32 d12, d12, d13\n" 4245 "vpadd.u32 d14, d14, d15\n" 4246 "vpadd.u32 d16, d16, d17\n" 4247 "vpadd.u32 d12, d12, d14\n" 4248 "vpadd.u32 d13, d16, d16\n" 4249 4250 // StaticQuantizationFloat::Transform 4251 "vadd.s32 q0, q0, q12\n" 4252 "vadd.s32 q3, q3, q13\n" 4253 "vadd.s32 q6, q6, q9\n" 4254 "vadd.s32 q0, q0, q10\n" 4255 "vadd.s32 q3, q3, q10\n" 4256 "vadd.s32 q6, q6, q10\n" 4257 "vcvt.f32.s32 q0, q0\n" 4258 "vcvt.f32.s32 q3, q3\n" 4259 "vcvt.f32.s32 q6, q6\n" 4260 "vmul.f32 q0, q0, q11\n" 4261 "vmul.f32 q3, q3, q11\n" 4262 "vmul.f32 q6, q6, q11\n" 4263 4264 // RowMajorOutput::Output 4265 "vst1.32 {d0}, [%[result]]!\n" 4266 "vst1.32 {d1[0]}, [%[result]]!\n" 4267 "vst1.32 {d6}, [r0]!\n" 4268 "vst1.32 {d7[0]}, [r0]!\n" 4269 "vst1.32 {d12}, [r1]!\n" 4270 "vst1.32 {d13[0]}, [r1]!\n" 4271 : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) 4272 : [count] "r"(params.kernel.count), 4273 [stride] "r"(params.output_stream.stride), 4274 [scale] "r"(params.kernel.scale) 4275 : "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", 4276 "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", 4277 "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", 4278 "d30", "d31", "cc", "memory"); 4279 } 4280 4281 } // namespace meta 4282 } // namespace gemmlowp 4283 4284 #else 4285 #warning "Meta gemm for arm32 requires: GEMMLOWP_NEON_32!" 4286 #endif 4287 4288 #endif // GEMMLOWP_META_QUANTIZED_MUL_KERNELS_ARM_32_H_ 4289