1 // Copyright 2015 Google Inc. All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // kernel_SSE.h: a collection of Intel SSE optimized kernels. 16 // Check in kernel_default.h which one(s) are actually used by default. 17 // Others are mere experiments; they are still covered by tests 18 // in case they might be useful some day. 19 // 20 21 #ifndef GEMMLOWP_INTERNAL_KERNEL_SSE_H_ 22 #define GEMMLOWP_INTERNAL_KERNEL_SSE_H_ 23 24 #include "kernel.h" 25 26 #include <string.h> 27 #include <cassert> 28 29 namespace gemmlowp { 30 31 #ifdef GEMMLOWP_SSE4_32 32 struct SSE4_32_Kernel4x4Depth2 : KernelBase { 33 typedef KernelFormat< 34 KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 1>, 35 KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 1> > 36 Format; 37 NameSSE4_32_Kernel4x4Depth238 const char* Name() const override { return "SSE, 4x4, depth 2"; } 39 RunSSE4_32_Kernel4x4Depth240 void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride, 41 std::size_t dst_col_stride, const std::uint8_t* lhs_ptr, 42 const std::uint8_t* rhs_ptr, std::size_t start_depth, 43 std::size_t run_depth) const override { 44 ScopedProfilingLabel label("optimized kernel"); 45 assert(dst_row_stride == 1); 46 std::int32_t run_depth_cells = run_depth / Format::kDepth; 47 /* Main loop */ 48 49 // A 2x4 cell of Rhs is stored in 16bit in xmm1 . 50 // A 4x2 block Lhs is stored in 16bit in xmm0. 51 // A 4x4 block of accumulators is stored in 32bit in xmm4--xmm7. 52 // 53 // +-------+-------+-------+-------+ 54 // |xmm1[0]|xmm1[2]|xmm1[4]|xmm1[6]| 55 // Rhs +-------+---------------+-------+ 56 // |xmm1[1]|xmm1[3]|xmm1[5]|xmm1[7]| 57 // +-------+-------+-------+-------+ 58 // 59 // | | | | | 60 // 61 // Lhs | | | | | 62 // 63 // +--+--+ - - - - +-------+-------+-------+-------+ 64 // |xmm0 | | xmm4 | xmm5 | xmm6 | xmm7 | 65 // |xmm0 | (Iter1) | xmm4 | xmm5 | xmm6 | xmm7 | 66 // |xmm0 | | xmm4 | xmm5 | xmm6 | xmm7 | 67 // |xmm0 | | xmm4 | xmm5 | xmm6 | xmm7 | 68 // +--+--+ - - - - +-------+-------+-------+-------+ 69 // 70 // Accumulator 71 72 asm volatile( 73 74 // set accumulators to zero. 75 "pxor %%xmm4 , %%xmm4 \n\t" 76 "pxor %%xmm5 , %%xmm5 \n\t" 77 "pxor %%xmm6 , %%xmm6 \n\t" 78 "pxor %%xmm7 , %%xmm7 \n\t" 79 80 "movl %[run_depth_cells], %%eax\n\t" 81 "subl $2, %%eax\n\t" 82 "js outerLoop1%=\n\t" 83 84 // Loop for K unrolled by 4 85 "outerLoop2%=:\n\t" 86 87 // K = 1,2 88 // RHS cell to xmm1 89 "pmovzxbw (%[rhs_ptr]), %%xmm1\n\t" 90 91 // LHS cell 92 "pmovzxbw 0x00(%[lhs_ptr]), %%xmm0\n\t" 93 "pshufd $0x00,%%xmm1,%%xmm2 \n\t" 94 "pmaddwd %%xmm0, %%xmm2 \n\t" 95 "paddd %%xmm2, %%xmm4 \n\t" 96 "pshufd $0x55,%%xmm1,%%xmm3 \n\t" 97 "pmaddwd %%xmm0, %%xmm3 \n\t" 98 "paddd %%xmm3, %%xmm5 \n\t" 99 100 "prefetcht0 0x80(%[lhs_ptr]) \n\t" 101 102 "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" 103 "pmaddwd %%xmm0, %%xmm2 \n\t" 104 "paddd %%xmm2, %%xmm6 \n\t" 105 "pshufd $0xff,%%xmm1,%%xmm3 \n\t" 106 "pmaddwd %%xmm0, %%xmm3 \n\t" 107 "paddd %%xmm3, %%xmm7 \n\t" 108 109 "prefetcht0 0x80(%[rhs_ptr]) \n\t" 110 111 // K = 3,4 112 // RHS cell to xmm1 113 "pmovzxbw 0x08(%[rhs_ptr]), %%xmm1\n\t" 114 115 // LHS cell 116 "pmovzxbw 0x08(%[lhs_ptr]), %%xmm0\n\t" 117 "pshufd $0x00,%%xmm1,%%xmm2 \n\t" 118 "pmaddwd %%xmm0, %%xmm2 \n\t" 119 "paddd %%xmm2, %%xmm4 \n\t" 120 "pshufd $0x55,%%xmm1,%%xmm3 \n\t" 121 "pmaddwd %%xmm0, %%xmm3 \n\t" 122 "paddd %%xmm3, %%xmm5 \n\t" 123 124 "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" 125 "pmaddwd %%xmm0, %%xmm2 \n\t" 126 "paddd %%xmm2, %%xmm6 \n\t" 127 "pshufd $0xff,%%xmm1,%%xmm3 \n\t" 128 "pmaddwd %%xmm0, %%xmm3 \n\t" 129 "paddd %%xmm3, %%xmm7 \n\t" 130 131 "addl $0x10, %[lhs_ptr]\n\t" 132 "addl $0x10, %[rhs_ptr]\n\t" 133 134 "subl $2, %[run_depth_cells]\n\t" 135 "jnz outerLoop2%=\n\t" 136 137 "movl %[run_depth_cells], %%eax\n\t" 138 "decl %%eax\n\t" 139 "js finish%=\n\t" 140 141 // Loop for K unrolled by 2 142 "outerLoop1%=:\n\t" 143 144 // RHS cell to xmm1 145 "pmovzxbw (%[rhs_ptr]), %%xmm1\n\t" 146 147 // LHS cell 148 "pmovzxbw 0x00(%[lhs_ptr]), %%xmm0\n\t" 149 "pshufd $0x00,%%xmm1,%%xmm2 \n\t" 150 "pmaddwd %%xmm0, %%xmm2 \n\t" 151 "paddd %%xmm2, %%xmm4 \n\t" 152 "pshufd $0x55,%%xmm1,%%xmm3 \n\t" 153 "pmaddwd %%xmm0, %%xmm3 \n\t" 154 "paddd %%xmm3, %%xmm5 \n\t" 155 156 "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" 157 "pmaddwd %%xmm0, %%xmm2 \n\t" 158 "paddd %%xmm2, %%xmm6 \n\t" 159 "pshufd $0xff,%%xmm1,%%xmm3 \n\t" 160 "pmaddwd %%xmm0, %%xmm3 \n\t" 161 "paddd %%xmm3, %%xmm7 \n\t" 162 163 "addl $0x08, %[lhs_ptr]\n\t" 164 "addl $0x08, %[rhs_ptr]\n\t" 165 166 "decl %[run_depth_cells]\n\t" 167 "jnz outerLoop1%=\n\t" 168 169 "finish%=:\n\t" 170 171 "movl %[dst_col_stride], %%eax\n\t" 172 "shll $2, %%eax\n\t" 173 174 "movl %[start_depth], %%ecx\n\t" 175 "test %%ecx, %%ecx\n\t" 176 "jz storeDst%=\n\t" 177 178 "leal (%%eax,%%eax,0x2), %%ecx\n\t" 179 "paddd 0x00(%[dst_ptr]) , %%xmm4 \n\t" 180 "paddd 0x00(%[dst_ptr], %%eax, 1) , %%xmm5 \n\t" 181 "paddd 0x00(%[dst_ptr], %%eax, 2) , %%xmm6 \n\t" 182 "paddd 0x00(%[dst_ptr], %%ecx, 1) , %%xmm7 \n\t" 183 184 "storeDst%=:\n\t" 185 186 "leal (%%eax,%%eax,0x2), %%ecx\n\t" 187 "movdqu %%xmm4 , 0x00(%[dst_ptr]) \n\t" 188 "movdqu %%xmm5 , 0x00(%[dst_ptr], %%eax, 1)\n\t" 189 "movdqu %%xmm6 , 0x00(%[dst_ptr], %%eax, 2)\n\t" 190 "movdqu %%xmm7 , 0x00(%[dst_ptr], %%ecx, 1)\n\t" 191 192 : // outputs 193 [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), 194 [dst_ptr] "+r"(dst_ptr) 195 : // inputs 196 [start_depth] "g"(start_depth), [dst_col_stride] "g"(dst_col_stride), 197 [run_depth_cells] "g"(run_depth_cells) 198 : // clobbers 199 "cc", "memory", "%xmm0", "%xmm1", "%xmm3", "%xmm2", "%xmm4", "%xmm5", 200 "%xmm6", "%xmm7", "%eax", "%ecx"); 201 } 202 }; 203 #endif 204 #ifdef GEMMLOWP_SSE4_64 205 struct SSE4_64_Kernel12x4Depth2 : KernelBase { 206 typedef KernelFormat< 207 KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 3>, 208 KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 1> > 209 Format; 210 NameSSE4_64_Kernel12x4Depth2211 const char* Name() const override { return "SSE, 12x4, depth 2"; } 212 RunSSE4_64_Kernel12x4Depth2213 void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride, 214 std::size_t dst_col_stride, const std::uint8_t* lhs_ptr, 215 const std::uint8_t* rhs_ptr, std::size_t start_depth, 216 std::size_t run_depth) const override { 217 ScopedProfilingLabel label("optimized kernel"); 218 assert(dst_row_stride == 1); 219 const std::int64_t run_depth_cells = run_depth / Format::kDepth; 220 const std::int64_t dst_col_stride_q = dst_col_stride; 221 222 /* Main loop */ 223 224 // A 2x4 cell of Rhs is stored in 16bit in xmm1 . 225 // A 12x2 block of 3 4x2 cells Lhs is stored in 16bit in xmm0, replaced 226 // every Iteration. 227 // A 12x4 block of accumulators is stored in 32bit in xmm4--xmm15. 228 // 229 // +-------+-------+-------+-------+ 230 // |xmm1[0]|xmm1[2]|xmm1[4]|xmm1[6]| 231 // Rhs +-------+---------------+-------+ 232 // |xmm1[1]|xmm1[3]|xmm1[5]|xmm1[7]| 233 // +-------+-------+-------+-------+ 234 // 235 // | | | | | 236 // 237 // Lhs | | | | | 238 // 239 // +--+--+ - - - - +-------+-------+-------+-------+ 240 // |xmm0 | | xmm4 | xmm5 | xmm6 | xmm7 | 241 // |xmm0 | (Iter1) | xmm4 | xmm5 | xmm6 | xmm7 | 242 // |xmm0 | | xmm4 | xmm5 | xmm6 | xmm7 | 243 // |xmm0 | | xmm4 | xmm5 | xmm6 | xmm7 | 244 // +--+--+ - - - - +-------+-------+-------+-------+ 245 // |xmm0 | | xmm8 | xmm9 | xmm10 | xmm11 | 246 // |xmm0 | (Iter2) | xmm8 | xmm9 | xmm10 | xmm11 | 247 // |xmm0 | | xmm8 | xmm9 | xmm10 | xmm11 | 248 // |xmm0 | | xmm8 | xmm9 | xmm10 | xmm11 | 249 // +--+--+ - - - - +-------+-------+-------+-------+ 250 // |xmm0 | | xmm12 | xmm13 | xmm14 | xmm15 | 251 // |xmm0 | (Iter3) | xmm12 | xmm13 | xmm14 | xmm15 | 252 // |xmm0 | | xmm12 | xmm13 | xmm14 | xmm15 | 253 // |xmm0 | | xmm12 | xmm13 | xmm14 | xmm15 | 254 // +--+--+ - - - - +-------+-------+-------+-------+ 255 // 256 // Accumulator 257 258 asm volatile( 259 260 // Set registers for destination 261 "movq %[dst_col_stride_q], %%r12\n\t" 262 "shlq $2, %%r12\n\t" 263 "leaq (%%r12,%%r12,0x2), %%r13\n\t" 264 265 // Set accumulators to zero. 266 "pxor %%xmm4 , %%xmm4 \n\t" 267 "pxor %%xmm5 , %%xmm5 \n\t" 268 "pxor %%xmm6 , %%xmm6 \n\t" 269 "pxor %%xmm7 , %%xmm7 \n\t" 270 "pxor %%xmm8 , %%xmm8 \n\t" 271 "pxor %%xmm9 , %%xmm9 \n\t" 272 "pxor %%xmm10 , %%xmm10\n\t" 273 "pxor %%xmm11 , %%xmm11\n\t" 274 "pxor %%xmm12 , %%xmm12\n\t" 275 "pxor %%xmm13 , %%xmm13\n\t" 276 "pxor %%xmm14 , %%xmm14\n\t" 277 "pxor %%xmm15 , %%xmm15\n\t" 278 279 "movq %[run_depth_cells], %%r14\n\t" 280 "subq $2, %%r14\n\t" 281 "js outerLoop1%=\n\t" 282 283 // Loop for K unrolled by 4 284 "outerLoop2%=:\n\t" 285 286 // K = 1,2 287 // RHS cell to xmm1 288 289 "pmovzxbw (%[rhs_ptr]), %%xmm1\n\t" 290 291 // LHS cell 292 "pmovzxbw 0x00(%[lhs_ptr]), %%xmm0\n\t" 293 "pshufd $0x00,%%xmm1,%%xmm2 \n\t" 294 "pmaddwd %%xmm0, %%xmm2 \n\t" 295 "paddd %%xmm2, %%xmm4 \n\t" 296 "pshufd $0x55,%%xmm1,%%xmm3 \n\t" 297 "pmaddwd %%xmm0, %%xmm3 \n\t" 298 "paddd %%xmm3, %%xmm5 \n\t" 299 300 "prefetcht0 0x80(%[lhs_ptr]) \n\t" 301 302 "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" 303 "pmaddwd %%xmm0, %%xmm2 \n\t" 304 "paddd %%xmm2, %%xmm6 \n\t" 305 "pshufd $0xff,%%xmm1,%%xmm3 \n\t" 306 "pmaddwd %%xmm0, %%xmm3 \n\t" 307 "paddd %%xmm3, %%xmm7 \n\t" 308 309 // next LHS cell 310 "pmovzxbw 0x08(%[lhs_ptr]), %%xmm0\n\t" 311 "pshufd $0x00,%%xmm1,%%xmm2 \n\t" 312 "pmaddwd %%xmm0, %%xmm2 \n\t" 313 "paddd %%xmm2, %%xmm8 \n\t" 314 "pshufd $0x55,%%xmm1,%%xmm3 \n\t" 315 "pmaddwd %%xmm0, %%xmm3 \n\t" 316 "paddd %%xmm3, %%xmm9 \n\t" 317 318 "prefetcht0 0x80(%[rhs_ptr]) \n\t" 319 320 "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" 321 "pmaddwd %%xmm0, %%xmm2 \n\t" 322 "paddd %%xmm2, %%xmm10 \n\t" 323 "pshufd $0xff,%%xmm1,%%xmm3 \n\t" 324 "pmaddwd %%xmm0, %%xmm3 \n\t" 325 "paddd %%xmm3, %%xmm11 \n\t" 326 327 // next LHS cell 328 "pmovzxbw 0x10(%[lhs_ptr]), %%xmm0\n\t" 329 "pshufd $0x00,%%xmm1,%%xmm2 \n\t" 330 "pmaddwd %%xmm0, %%xmm2 \n\t" 331 "paddd %%xmm2, %%xmm12 \n\t" 332 "pshufd $0x55,%%xmm1,%%xmm3 \n\t" 333 "pmaddwd %%xmm0, %%xmm3 \n\t" 334 "paddd %%xmm3, %%xmm13 \n\t" 335 336 "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" 337 "pmaddwd %%xmm0, %%xmm2 \n\t" 338 "paddd %%xmm2, %%xmm14 \n\t" 339 "pshufd $0xff,%%xmm1,%%xmm3 \n\t" 340 "pmaddwd %%xmm0, %%xmm3 \n\t" 341 "paddd %%xmm3, %%xmm15 \n\t" 342 343 // K = 3,4 344 // RHS cell to xmm1 345 "pmovzxbw 0x08(%[rhs_ptr]), %%xmm1\n\t" 346 347 // LHS cell 348 "pmovzxbw 0x18(%[lhs_ptr]), %%xmm0\n\t" 349 "pshufd $0x00,%%xmm1,%%xmm2 \n\t" 350 "pmaddwd %%xmm0, %%xmm2 \n\t" 351 "paddd %%xmm2, %%xmm4 \n\t" 352 "pshufd $0x55,%%xmm1,%%xmm3 \n\t" 353 "pmaddwd %%xmm0, %%xmm3 \n\t" 354 "paddd %%xmm3, %%xmm5 \n\t" 355 356 "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" 357 "pmaddwd %%xmm0, %%xmm2 \n\t" 358 "paddd %%xmm2, %%xmm6 \n\t" 359 "pshufd $0xff,%%xmm1,%%xmm3 \n\t" 360 "pmaddwd %%xmm0, %%xmm3 \n\t" 361 "paddd %%xmm3, %%xmm7 \n\t" 362 363 // next LHS cell 364 "pmovzxbw 0x20(%[lhs_ptr]), %%xmm0\n\t" 365 "pshufd $0x00,%%xmm1,%%xmm2 \n\t" 366 "pmaddwd %%xmm0, %%xmm2 \n\t" 367 "paddd %%xmm2, %%xmm8 \n\t" 368 "pshufd $0x55,%%xmm1,%%xmm3 \n\t" 369 "pmaddwd %%xmm0, %%xmm3 \n\t" 370 "paddd %%xmm3, %%xmm9 \n\t" 371 372 "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" 373 "pmaddwd %%xmm0, %%xmm2 \n\t" 374 "paddd %%xmm2, %%xmm10 \n\t" 375 "pshufd $0xff,%%xmm1,%%xmm3 \n\t" 376 "pmaddwd %%xmm0, %%xmm3 \n\t" 377 "paddd %%xmm3, %%xmm11 \n\t" 378 379 // next LHS cell 380 "pmovzxbw 0x28(%[lhs_ptr]), %%xmm0\n\t" 381 "pshufd $0x00,%%xmm1,%%xmm2 \n\t" 382 "pmaddwd %%xmm0, %%xmm2 \n\t" 383 "paddd %%xmm2, %%xmm12 \n\t" 384 "pshufd $0x55,%%xmm1,%%xmm3 \n\t" 385 "pmaddwd %%xmm0, %%xmm3 \n\t" 386 "paddd %%xmm3, %%xmm13 \n\t" 387 388 "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" 389 "pmaddwd %%xmm0, %%xmm2 \n\t" 390 "paddd %%xmm2, %%xmm14 \n\t" 391 "pshufd $0xff,%%xmm1,%%xmm3 \n\t" 392 "pmaddwd %%xmm0, %%xmm3 \n\t" 393 "paddd %%xmm3, %%xmm15 \n\t" 394 395 "addq $0x30, %[lhs_ptr]\n\t" 396 "addq $0x10, %[rhs_ptr]\n\t" 397 398 "subq $2, %[run_depth_cells]\n\t" 399 "jnz outerLoop2%=\n\t" 400 401 "movq %[run_depth_cells], %%r14\n\t" 402 "decq %%r14\n\t" 403 "js finish%=\n\t" 404 405 // Loop for K unrolled by 2 406 "outerLoop1%=:\n\t" 407 408 // RHS cell to xmm1 409 "pmovzxbw (%[rhs_ptr]), %%xmm1\n\t" 410 411 // LHS cell 412 "pmovzxbw 0x00(%[lhs_ptr]), %%xmm0\n\t" 413 "pshufd $0x00,%%xmm1,%%xmm2 \n\t" 414 "pmaddwd %%xmm0, %%xmm2 \n\t" 415 "paddd %%xmm2, %%xmm4 \n\t" 416 "pshufd $0x55,%%xmm1,%%xmm3 \n\t" 417 "pmaddwd %%xmm0, %%xmm3 \n\t" 418 "paddd %%xmm3, %%xmm5 \n\t" 419 "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" 420 "pmaddwd %%xmm0, %%xmm2 \n\t" 421 "paddd %%xmm2, %%xmm6 \n\t" 422 "pshufd $0xff,%%xmm1,%%xmm3 \n\t" 423 "pmaddwd %%xmm0, %%xmm3 \n\t" 424 "paddd %%xmm3, %%xmm7 \n\t" 425 426 // next LHS cell 427 "pmovzxbw 0x08(%[lhs_ptr]), %%xmm0\n\t" 428 "pshufd $0x00,%%xmm1,%%xmm2 \n\t" 429 "pmaddwd %%xmm0, %%xmm2 \n\t" 430 "paddd %%xmm2, %%xmm8 \n\t" 431 "pshufd $0x55,%%xmm1,%%xmm3 \n\t" 432 "pmaddwd %%xmm0, %%xmm3 \n\t" 433 "paddd %%xmm3, %%xmm9 \n\t" 434 "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" 435 "pmaddwd %%xmm0, %%xmm2 \n\t" 436 "paddd %%xmm2, %%xmm10 \n\t" 437 "pshufd $0xff,%%xmm1,%%xmm3 \n\t" 438 "pmaddwd %%xmm0, %%xmm3 \n\t" 439 "paddd %%xmm3, %%xmm11 \n\t" 440 441 // next LHS cell 442 "pmovzxbw 0x10(%[lhs_ptr]), %%xmm0\n\t" 443 "pshufd $0x00,%%xmm1,%%xmm2 \n\t" 444 "pmaddwd %%xmm0, %%xmm2 \n\t" 445 "paddd %%xmm2, %%xmm12 \n\t" 446 "pshufd $0x55,%%xmm1,%%xmm3 \n\t" 447 "pmaddwd %%xmm0, %%xmm3 \n\t" 448 "paddd %%xmm3, %%xmm13 \n\t" 449 "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" 450 "pmaddwd %%xmm0, %%xmm2 \n\t" 451 "paddd %%xmm2, %%xmm14 \n\t" 452 "pshufd $0xff,%%xmm1,%%xmm3 \n\t" 453 "pmaddwd %%xmm0, %%xmm3 \n\t" 454 "paddd %%xmm3, %%xmm15 \n\t" 455 456 "addq $0x18, %[lhs_ptr]\n\t" 457 "addq $0x08, %[rhs_ptr]\n\t" 458 459 "decq %[run_depth_cells]\n\t" 460 "jnz outerLoop1%=\n\t" 461 462 "finish%=:\n\t" 463 464 "test %[start_depth], %[start_depth]\n\t" 465 "jz storeDst%=\n\t" 466 467 "paddd 0x00(%[dst_ptr]) , %%xmm4 \n\t" 468 "paddd 0x10(%[dst_ptr]) , %%xmm8 \n\t" 469 "paddd 0x20(%[dst_ptr]) , %%xmm12\n\t" 470 "paddd 0x00(%[dst_ptr], %%r12, 1) , %%xmm5 \n\t" 471 "paddd 0x10(%[dst_ptr], %%r12, 1) , %%xmm9 \n\t" 472 "paddd 0x20(%[dst_ptr], %%r12, 1) , %%xmm13\n\t" 473 "paddd 0x00(%[dst_ptr], %%r12, 2) , %%xmm6 \n\t" 474 "paddd 0x10(%[dst_ptr], %%r12, 2) , %%xmm10\n\t" 475 "paddd 0x20(%[dst_ptr], %%r12, 2) , %%xmm14\n\t" 476 "paddd 0x00(%[dst_ptr], %%r13, 1) , %%xmm7 \n\t" 477 "paddd 0x10(%[dst_ptr], %%r13, 1) , %%xmm11\n\t" 478 "paddd 0x20(%[dst_ptr], %%r13, 1) , %%xmm15\n\t" 479 480 "storeDst%=:\n\t" 481 482 "movdqu %%xmm4 , 0x00(%[dst_ptr]) \n\t" 483 "movdqu %%xmm8 , 0x10(%[dst_ptr]) \n\t" 484 "movdqu %%xmm12 , 0x20(%[dst_ptr]) \n\t" 485 "movdqu %%xmm5 , 0x00(%[dst_ptr], %%r12, 1)\n\t" 486 "movdqu %%xmm9 , 0x10(%[dst_ptr], %%r12, 1)\n\t" 487 "movdqu %%xmm13 , 0x20(%[dst_ptr], %%r12, 1)\n\t" 488 "movdqu %%xmm6 , 0x00(%[dst_ptr], %%r12, 2)\n\t" 489 "movdqu %%xmm10 , 0x10(%[dst_ptr], %%r12, 2)\n\t" 490 "movdqu %%xmm14 , 0x20(%[dst_ptr], %%r12, 2)\n\t" 491 "movdqu %%xmm7 , 0x00(%[dst_ptr], %%r13, 1)\n\t" 492 "movdqu %%xmm11 , 0x10(%[dst_ptr], %%r13, 1)\n\t" 493 "movdqu %%xmm15 , 0x20(%[dst_ptr], %%r13, 1)\n\t" 494 495 : // outputs 496 [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), 497 [dst_ptr] "+r"(dst_ptr) 498 : // inputs 499 [start_depth] "r"(start_depth), 500 [dst_col_stride_q] "r"(dst_col_stride_q), 501 [run_depth_cells] "r"(run_depth_cells) 502 : // clobbers 503 "cc", "memory", "%xmm0", "%xmm1", "%xmm3", "%xmm2", "%xmm4", "%xmm5", 504 "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%r12", "%r13", "%r14", 505 "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"); 506 } 507 }; 508 #endif 509 510 } // namespace gemmlowp 511 512 #endif // GEMMLOWP_INTERNAL_KERNEL_SSE_H_ 513