1 // Copyright 2015 The Gemmlowp Authors. All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // kernel_SSE.h: a collection of Intel SSE optimized kernels. 16 // Check in kernel_default.h which one(s) are actually used by default. 17 // Others are mere experiments; they are still covered by tests 18 // in case they might be useful some day. 19 // 20 21 #ifndef GEMMLOWP_INTERNAL_KERNEL_SSE_H_ 22 #define GEMMLOWP_INTERNAL_KERNEL_SSE_H_ 23 24 #include "kernel.h" 25 26 #include <string.h> 27 #include <cassert> 28 29 namespace gemmlowp { 30 31 #ifdef GEMMLOWP_SSE4_32 32 struct SSE4_32_Kernel4x4Depth2 : KernelBase { 33 typedef KernelFormat< 34 KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 1>, 35 KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 1> > 36 Format; 37 NameSSE4_32_Kernel4x4Depth238 const char* Name() const override { return "SSE, 4x4, depth 2"; } 39 RunSSE4_32_Kernel4x4Depth240 void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride, 41 std::size_t dst_col_stride, const std::uint8_t* lhs_ptr, 42 const std::uint8_t* rhs_ptr, std::size_t start_depth, 43 std::size_t run_depth) const override { 44 ScopedProfilingLabel label("optimized kernel"); 45 assert(dst_row_stride == 1); 46 std::int32_t run_depth_cells = run_depth / Format::kDepth; 47 /* Main loop */ 48 49 // A 2x4 cell of Rhs is stored in 16bit in xmm1 . 50 // A 4x2 block Lhs is stored in 16bit in xmm0. 51 // A 4x4 block of accumulators is stored in 32bit in xmm4--xmm7. 52 // 53 // +-------+-------+-------+-------+ 54 // |xmm1[0]|xmm1[2]|xmm1[4]|xmm1[6]| 55 // Rhs +-------+---------------+-------+ 56 // |xmm1[1]|xmm1[3]|xmm1[5]|xmm1[7]| 57 // +-------+-------+-------+-------+ 58 // 59 // | | | | | 60 // 61 // Lhs | | | | | 62 // 63 // +--+--+ - - - - +-------+-------+-------+-------+ 64 // |xmm0 | | xmm4 | xmm5 | xmm6 | xmm7 | 65 // |xmm0 | (Iter1) | xmm4 | xmm5 | xmm6 | xmm7 | 66 // |xmm0 | | xmm4 | xmm5 | xmm6 | xmm7 | 67 // |xmm0 | | xmm4 | xmm5 | xmm6 | xmm7 | 68 // +--+--+ - - - - +-------+-------+-------+-------+ 69 // 70 // Accumulator 71 72 asm volatile( 73 74 // set accumulators to zero. 75 "pxor %%xmm4 , %%xmm4 \n\t" 76 "pxor %%xmm5 , %%xmm5 \n\t" 77 "pxor %%xmm6 , %%xmm6 \n\t" 78 "pxor %%xmm7 , %%xmm7 \n\t" 79 80 "movl %[run_depth_cells], %%eax\n\t" 81 "subl $2, %%eax\n\t" 82 "js outerLoop1%=\n\t" 83 84 // Loop for K unrolled by 4 85 "outerLoop2%=:\n\t" 86 87 // K = 1,2 88 // RHS cell to xmm1 89 "pmovzxbw (%[rhs_ptr]), %%xmm1\n\t" 90 91 // LHS cell 92 "pmovzxbw 0x00(%[lhs_ptr]), %%xmm0\n\t" 93 "pshufd $0x00,%%xmm1,%%xmm2 \n\t" 94 "pshufd $0x55,%%xmm1,%%xmm3 \n\t" 95 "pmaddwd %%xmm0, %%xmm2 \n\t" 96 "pmaddwd %%xmm0, %%xmm3 \n\t" 97 "paddd %%xmm2, %%xmm4 \n\t" 98 "paddd %%xmm3, %%xmm5 \n\t" 99 100 "prefetcht0 0x80(%[lhs_ptr]) \n\t" 101 102 "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" 103 "pmaddwd %%xmm0, %%xmm2 \n\t" 104 "pshufd $0xff,%%xmm1,%%xmm3 \n\t" 105 "pmaddwd %%xmm0, %%xmm3 \n\t" 106 107 "prefetcht0 0x80(%[rhs_ptr]) \n\t" 108 109 // K = 3,4 110 // RHS cell to xmm1 111 "pmovzxbw 0x08(%[rhs_ptr]), %%xmm1\n\t" 112 113 "paddd %%xmm2, %%xmm6 \n\t" 114 "paddd %%xmm3, %%xmm7 \n\t" 115 116 // LHS cell 117 "pmovzxbw 0x08(%[lhs_ptr]), %%xmm0\n\t" 118 "pshufd $0x00,%%xmm1,%%xmm2 \n\t" 119 "pshufd $0x55,%%xmm1,%%xmm3 \n\t" 120 "pmaddwd %%xmm0, %%xmm2 \n\t" 121 "pmaddwd %%xmm0, %%xmm3 \n\t" 122 "paddd %%xmm2, %%xmm4 \n\t" 123 "paddd %%xmm3, %%xmm5 \n\t" 124 "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" 125 "pshufd $0xff,%%xmm1,%%xmm3 \n\t" 126 127 "addl $0x10, %[lhs_ptr] \n\t" 128 "addl $0x10, %[rhs_ptr] \n\t" 129 130 "pmaddwd %%xmm0, %%xmm3 \n\t" 131 "paddd %%xmm3, %%xmm7 \n\t" 132 "pmaddwd %%xmm0, %%xmm2 \n\t" 133 "paddd %%xmm2, %%xmm6 \n\t" 134 135 "subl $2, %[run_depth_cells]\n\t" 136 "ja outerLoop2%=\n\t" 137 138 "movl %[run_depth_cells], %%eax\n\t" 139 "decl %%eax\n\t" 140 "js finish%=\n\t" 141 142 // Loop for K unrolled by 2 143 "outerLoop1%=:\n\t" 144 145 // RHS cell to xmm1 146 "pmovzxbw (%[rhs_ptr]), %%xmm1\n\t" 147 148 // LHS cell 149 "pmovzxbw 0x00(%[lhs_ptr]), %%xmm0\n\t" 150 "pshufd $0x00,%%xmm1,%%xmm2 \n\t" 151 "pmaddwd %%xmm0, %%xmm2 \n\t" 152 "paddd %%xmm2, %%xmm4 \n\t" 153 "pshufd $0x55,%%xmm1,%%xmm3 \n\t" 154 "pmaddwd %%xmm0, %%xmm3 \n\t" 155 "paddd %%xmm3, %%xmm5 \n\t" 156 157 "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" 158 "pmaddwd %%xmm0, %%xmm2 \n\t" 159 "paddd %%xmm2, %%xmm6 \n\t" 160 "pshufd $0xff,%%xmm1,%%xmm3 \n\t" 161 "pmaddwd %%xmm0, %%xmm3 \n\t" 162 "paddd %%xmm3, %%xmm7 \n\t" 163 164 "addl $0x08, %[lhs_ptr]\n\t" 165 "addl $0x08, %[rhs_ptr]\n\t" 166 167 "decl %[run_depth_cells]\n\t" 168 "jnz outerLoop1%=\n\t" 169 170 "finish%=:\n\t" 171 172 "movl %[dst_col_stride], %%eax\n\t" 173 "shll $2, %%eax\n\t" 174 175 "movl %[start_depth], %%ecx\n\t" 176 "test %%ecx, %%ecx\n\t" 177 "jz storeDst%=\n\t" 178 179 "leal (%%eax,%%eax,0x2), %%ecx\n\t" 180 "paddd 0x00(%[dst_ptr]) , %%xmm4 \n\t" 181 "paddd 0x00(%[dst_ptr], %%eax, 1) , %%xmm5 \n\t" 182 "paddd 0x00(%[dst_ptr], %%eax, 2) , %%xmm6 \n\t" 183 "paddd 0x00(%[dst_ptr], %%ecx, 1) , %%xmm7 \n\t" 184 185 "storeDst%=:\n\t" 186 187 "leal (%%eax,%%eax,0x2), %%ecx\n\t" 188 "movdqu %%xmm4 , 0x00(%[dst_ptr]) \n\t" 189 "movdqu %%xmm5 , 0x00(%[dst_ptr], %%eax, 1)\n\t" 190 "movdqu %%xmm6 , 0x00(%[dst_ptr], %%eax, 2)\n\t" 191 "movdqu %%xmm7 , 0x00(%[dst_ptr], %%ecx, 1)\n\t" 192 193 : // outputs 194 [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), 195 [dst_ptr] "+r"(dst_ptr) 196 : // inputs 197 [start_depth] "g"(start_depth), [dst_col_stride] "g"(dst_col_stride), 198 [run_depth_cells] "g"(run_depth_cells) 199 : // clobbers 200 "cc", "memory", "%xmm0", "%xmm1", "%xmm3", "%xmm2", "%xmm4", "%xmm5", 201 "%xmm6", "%xmm7", "%eax", "%ecx"); 202 } 203 }; 204 #endif 205 #ifdef GEMMLOWP_SSE4_64 206 struct SSE4_64_Kernel12x4Depth2 : KernelBase { 207 typedef KernelFormat< 208 KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 3>, 209 KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 1> > 210 Format; 211 NameSSE4_64_Kernel12x4Depth2212 const char* Name() const override { return "SSE, 12x4, depth 2"; } 213 RunSSE4_64_Kernel12x4Depth2214 void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride, 215 std::size_t dst_col_stride, const std::uint8_t* lhs_ptr, 216 const std::uint8_t* rhs_ptr, std::size_t start_depth, 217 std::size_t run_depth) const override { 218 ScopedProfilingLabel label("optimized kernel"); 219 assert(dst_row_stride == 1); 220 const std::int64_t run_depth_cells = run_depth / Format::kDepth; 221 const std::int64_t dst_col_stride_q = dst_col_stride; 222 223 /* Main loop */ 224 225 // A 2x4 cell of Rhs is stored in 16bit in xmm1 . 226 // A 12x2 block of 3 4x2 cells Lhs is stored in 16bit in xmm0, replaced 227 // every Iteration. 228 // A 12x4 block of accumulators is stored in 32bit in xmm4--xmm15. 229 // 230 // +-------+-------+-------+-------+ 231 // |xmm1[0]|xmm1[2]|xmm1[4]|xmm1[6]| 232 // Rhs +-------+---------------+-------+ 233 // |xmm1[1]|xmm1[3]|xmm1[5]|xmm1[7]| 234 // +-------+-------+-------+-------+ 235 // 236 // | | | | | 237 // 238 // Lhs | | | | | 239 // 240 // +--+--+ - - - - +-------+-------+-------+-------+ 241 // |xmm0 | | xmm4 | xmm5 | xmm6 | xmm7 | 242 // |xmm0 | (Iter1) | xmm4 | xmm5 | xmm6 | xmm7 | 243 // |xmm0 | | xmm4 | xmm5 | xmm6 | xmm7 | 244 // |xmm0 | | xmm4 | xmm5 | xmm6 | xmm7 | 245 // +--+--+ - - - - +-------+-------+-------+-------+ 246 // |xmm0 | | xmm8 | xmm9 | xmm10 | xmm11 | 247 // |xmm0 | (Iter2) | xmm8 | xmm9 | xmm10 | xmm11 | 248 // |xmm0 | | xmm8 | xmm9 | xmm10 | xmm11 | 249 // |xmm0 | | xmm8 | xmm9 | xmm10 | xmm11 | 250 // +--+--+ - - - - +-------+-------+-------+-------+ 251 // |xmm0 | | xmm12 | xmm13 | xmm14 | xmm15 | 252 // |xmm0 | (Iter3) | xmm12 | xmm13 | xmm14 | xmm15 | 253 // |xmm0 | | xmm12 | xmm13 | xmm14 | xmm15 | 254 // |xmm0 | | xmm12 | xmm13 | xmm14 | xmm15 | 255 // +--+--+ - - - - +-------+-------+-------+-------+ 256 // 257 // Accumulator 258 259 asm volatile( 260 261 // Set registers for destination 262 "movq %[dst_col_stride_q], %%r12\n\t" 263 "shlq $2, %%r12\n\t" 264 "leaq (%%r12,%%r12,0x2), %%r13\n\t" 265 266 // Set accumulators to zero. 267 "pxor %%xmm4 , %%xmm4 \n\t" 268 "pxor %%xmm5 , %%xmm5 \n\t" 269 "pxor %%xmm6 , %%xmm6 \n\t" 270 "pxor %%xmm7 , %%xmm7 \n\t" 271 "pxor %%xmm8 , %%xmm8 \n\t" 272 "pxor %%xmm9 , %%xmm9 \n\t" 273 "pxor %%xmm10 , %%xmm10\n\t" 274 "pxor %%xmm11 , %%xmm11\n\t" 275 "pxor %%xmm12 , %%xmm12\n\t" 276 "pxor %%xmm13 , %%xmm13\n\t" 277 "pxor %%xmm14 , %%xmm14\n\t" 278 "pxor %%xmm15 , %%xmm15\n\t" 279 280 "movq %[run_depth_cells], %%r14\n\t" 281 "subq $2, %%r14\n\t" 282 "js outerLoop1%=\n\t" 283 284 // Loop for K unrolled by 4 285 "outerLoop2%=:\n\t" 286 287 // K = 1,2 288 // RHS cell to xmm1 289 290 "pmovzxbw (%[rhs_ptr]), %%xmm1\n\t" 291 292 // LHS cell 293 "pmovzxbw 0x00(%[lhs_ptr]), %%xmm0\n\t" 294 "pshufd $0x00,%%xmm1,%%xmm2 \n\t" 295 "pshufd $0x55,%%xmm1,%%xmm3 \n\t" 296 "pmaddwd %%xmm0, %%xmm2 \n\t" 297 "pmaddwd %%xmm0, %%xmm3 \n\t" 298 "paddd %%xmm2, %%xmm4 \n\t" 299 "paddd %%xmm3, %%xmm5 \n\t" 300 301 "prefetcht0 0x80(%[lhs_ptr]) \n\t" 302 303 "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" 304 "pmaddwd %%xmm0, %%xmm2 \n\t" 305 "pshufd $0xff,%%xmm1,%%xmm3 \n\t" 306 "pmaddwd %%xmm0, %%xmm3 \n\t" 307 308 // next LHS cell 309 "pmovzxbw 0x08(%[lhs_ptr]), %%xmm0\n\t" 310 311 "paddd %%xmm2, %%xmm6 \n\t" 312 "paddd %%xmm3, %%xmm7 \n\t" 313 314 "pshufd $0x00,%%xmm1,%%xmm2 \n\t" 315 "pshufd $0x55,%%xmm1,%%xmm3 \n\t" 316 "pmaddwd %%xmm0, %%xmm2 \n\t" 317 "pmaddwd %%xmm0, %%xmm3 \n\t" 318 "paddd %%xmm2, %%xmm8 \n\t" 319 "paddd %%xmm3, %%xmm9 \n\t" 320 321 "prefetcht0 0x80(%[rhs_ptr]) \n\t" 322 323 "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" 324 "pshufd $0xff,%%xmm1,%%xmm3 \n\t" 325 "pmaddwd %%xmm0, %%xmm2 \n\t" 326 "pmaddwd %%xmm0, %%xmm3 \n\t" 327 "paddd %%xmm2, %%xmm10 \n\t" 328 "paddd %%xmm3, %%xmm11 \n\t" 329 330 // next LHS cell 331 "pmovzxbw 0x10(%[lhs_ptr]), %%xmm0\n\t" 332 "pshufd $0x00,%%xmm1,%%xmm2 \n\t" 333 "pshufd $0x55,%%xmm1,%%xmm3 \n\t" 334 "pmaddwd %%xmm0, %%xmm2 \n\t" 335 "pmaddwd %%xmm0, %%xmm3 \n\t" 336 "paddd %%xmm2, %%xmm12 \n\t" 337 "paddd %%xmm3, %%xmm13 \n\t" 338 339 "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" 340 "pshufd $0xff,%%xmm1,%%xmm3 \n\t" 341 "pmaddwd %%xmm0, %%xmm2 \n\t" 342 "pmaddwd %%xmm0, %%xmm3 \n\t" 343 "paddd %%xmm2, %%xmm14 \n\t" 344 "paddd %%xmm3, %%xmm15 \n\t" 345 346 // K = 3,4 347 // RHS cell to xmm1 348 "pmovzxbw 0x08(%[rhs_ptr]), %%xmm1\n\t" 349 350 // LHS cell 351 "pmovzxbw 0x18(%[lhs_ptr]), %%xmm0\n\t" 352 "pshufd $0x00,%%xmm1,%%xmm2 \n\t" 353 "pshufd $0x55,%%xmm1,%%xmm3 \n\t" 354 "pmaddwd %%xmm0, %%xmm2 \n\t" 355 "pmaddwd %%xmm0, %%xmm3 \n\t" 356 "paddd %%xmm2, %%xmm4 \n\t" 357 "paddd %%xmm3, %%xmm5 \n\t" 358 359 "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" 360 "pshufd $0xff,%%xmm1,%%xmm3 \n\t" 361 "pmaddwd %%xmm0, %%xmm2 \n\t" 362 "pmaddwd %%xmm0, %%xmm3 \n\t" 363 "paddd %%xmm2, %%xmm6 \n\t" 364 "paddd %%xmm3, %%xmm7 \n\t" 365 366 // next LHS cell 367 "pmovzxbw 0x20(%[lhs_ptr]), %%xmm0\n\t" 368 "pshufd $0x00,%%xmm1,%%xmm2 \n\t" 369 "pshufd $0x55,%%xmm1,%%xmm3 \n\t" 370 "pmaddwd %%xmm0, %%xmm2 \n\t" 371 "pmaddwd %%xmm0, %%xmm3 \n\t" 372 "paddd %%xmm2, %%xmm8 \n\t" 373 "paddd %%xmm3, %%xmm9 \n\t" 374 375 "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" 376 "pshufd $0xff,%%xmm1,%%xmm3 \n\t" 377 "pmaddwd %%xmm0, %%xmm2 \n\t" 378 "pmaddwd %%xmm0, %%xmm3 \n\t" 379 "paddd %%xmm2, %%xmm10 \n\t" 380 "paddd %%xmm3, %%xmm11 \n\t" 381 382 // next LHS cell 383 "pmovzxbw 0x28(%[lhs_ptr]), %%xmm0\n\t" 384 385 "addq $0x30, %[lhs_ptr] \n\t" 386 "addq $0x10, %[rhs_ptr] \n\t" 387 388 "pshufd $0x00,%%xmm1,%%xmm2 \n\t" 389 "pshufd $0x55,%%xmm1,%%xmm3 \n\t" 390 "pmaddwd %%xmm0, %%xmm2 \n\t" 391 "pmaddwd %%xmm0, %%xmm3 \n\t" 392 "paddd %%xmm2, %%xmm12 \n\t" 393 "paddd %%xmm3, %%xmm13 \n\t" 394 395 "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" 396 "pshufd $0xff,%%xmm1,%%xmm3 \n\t" 397 "pmaddwd %%xmm0, %%xmm2 \n\t" 398 "pmaddwd %%xmm0, %%xmm3 \n\t" 399 "paddd %%xmm2, %%xmm14 \n\t" 400 "paddd %%xmm3, %%xmm15 \n\t" 401 402 "subq $2, %[run_depth_cells]\n\t" 403 "ja outerLoop2%=\n\t" 404 405 "movq %[run_depth_cells], %%r14\n\t" 406 "decq %%r14\n\t" 407 "js finish%=\n\t" 408 409 // Loop for K unrolled by 2 410 "outerLoop1%=:\n\t" 411 412 // RHS cell to xmm1 413 "pmovzxbw (%[rhs_ptr]), %%xmm1\n\t" 414 415 // LHS cell 416 "pmovzxbw 0x00(%[lhs_ptr]), %%xmm0\n\t" 417 "pshufd $0x00,%%xmm1,%%xmm2 \n\t" 418 "pshufd $0x55,%%xmm1,%%xmm3 \n\t" 419 "pmaddwd %%xmm0, %%xmm2 \n\t" 420 "pmaddwd %%xmm0, %%xmm3 \n\t" 421 "paddd %%xmm2, %%xmm4 \n\t" 422 "paddd %%xmm3, %%xmm5 \n\t" 423 "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" 424 "pshufd $0xff,%%xmm1,%%xmm3 \n\t" 425 "pmaddwd %%xmm0, %%xmm2 \n\t" 426 "pmaddwd %%xmm0, %%xmm3 \n\t" 427 "paddd %%xmm2, %%xmm6 \n\t" 428 "paddd %%xmm3, %%xmm7 \n\t" 429 430 // next LHS cell 431 "pmovzxbw 0x08(%[lhs_ptr]), %%xmm0\n\t" 432 "pshufd $0x00,%%xmm1,%%xmm2 \n\t" 433 "pshufd $0x55,%%xmm1,%%xmm3 \n\t" 434 "pmaddwd %%xmm0, %%xmm2 \n\t" 435 "pmaddwd %%xmm0, %%xmm3 \n\t" 436 "paddd %%xmm2, %%xmm8 \n\t" 437 "paddd %%xmm3, %%xmm9 \n\t" 438 "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" 439 "pshufd $0xff,%%xmm1,%%xmm3 \n\t" 440 "pmaddwd %%xmm0, %%xmm2 \n\t" 441 "pmaddwd %%xmm0, %%xmm3 \n\t" 442 "paddd %%xmm2, %%xmm10 \n\t" 443 "paddd %%xmm3, %%xmm11 \n\t" 444 445 // next LHS cell 446 "pmovzxbw 0x10(%[lhs_ptr]), %%xmm0\n\t" 447 448 "addq $0x18, %[lhs_ptr] \n\t" 449 "addq $0x08, %[rhs_ptr] \n\t" 450 451 "pshufd $0x00,%%xmm1,%%xmm2 \n\t" 452 "pshufd $0x55,%%xmm1,%%xmm3 \n\t" 453 "pmaddwd %%xmm0, %%xmm2 \n\t" 454 "pmaddwd %%xmm0, %%xmm3 \n\t" 455 "paddd %%xmm2, %%xmm12 \n\t" 456 "paddd %%xmm3, %%xmm13 \n\t" 457 "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" 458 "pshufd $0xff,%%xmm1,%%xmm3 \n\t" 459 "pmaddwd %%xmm0, %%xmm2 \n\t" 460 "pmaddwd %%xmm0, %%xmm3 \n\t" 461 "paddd %%xmm2, %%xmm14 \n\t" 462 "paddd %%xmm3, %%xmm15 \n\t" 463 464 "decq %[run_depth_cells]\n\t" 465 "jnz outerLoop1%=\n\t" 466 467 "finish%=:\n\t" 468 469 "test %[start_depth], %[start_depth]\n\t" 470 "jz storeDst%=\n\t" 471 472 "paddd 0x00(%[dst_ptr]) , %%xmm4 \n\t" 473 "paddd 0x10(%[dst_ptr]) , %%xmm8 \n\t" 474 "paddd 0x20(%[dst_ptr]) , %%xmm12\n\t" 475 "paddd 0x00(%[dst_ptr], %%r12, 1) , %%xmm5 \n\t" 476 "paddd 0x10(%[dst_ptr], %%r12, 1) , %%xmm9 \n\t" 477 "paddd 0x20(%[dst_ptr], %%r12, 1) , %%xmm13\n\t" 478 "paddd 0x00(%[dst_ptr], %%r12, 2) , %%xmm6 \n\t" 479 "paddd 0x10(%[dst_ptr], %%r12, 2) , %%xmm10\n\t" 480 "paddd 0x20(%[dst_ptr], %%r12, 2) , %%xmm14\n\t" 481 "paddd 0x00(%[dst_ptr], %%r13, 1) , %%xmm7 \n\t" 482 "paddd 0x10(%[dst_ptr], %%r13, 1) , %%xmm11\n\t" 483 "paddd 0x20(%[dst_ptr], %%r13, 1) , %%xmm15\n\t" 484 485 "storeDst%=:\n\t" 486 487 "movdqu %%xmm4 , 0x00(%[dst_ptr]) \n\t" 488 "movdqu %%xmm8 , 0x10(%[dst_ptr]) \n\t" 489 "movdqu %%xmm12 , 0x20(%[dst_ptr]) \n\t" 490 "movdqu %%xmm5 , 0x00(%[dst_ptr], %%r12, 1)\n\t" 491 "movdqu %%xmm9 , 0x10(%[dst_ptr], %%r12, 1)\n\t" 492 "movdqu %%xmm13 , 0x20(%[dst_ptr], %%r12, 1)\n\t" 493 "movdqu %%xmm6 , 0x00(%[dst_ptr], %%r12, 2)\n\t" 494 "movdqu %%xmm10 , 0x10(%[dst_ptr], %%r12, 2)\n\t" 495 "movdqu %%xmm14 , 0x20(%[dst_ptr], %%r12, 2)\n\t" 496 "movdqu %%xmm7 , 0x00(%[dst_ptr], %%r13, 1)\n\t" 497 "movdqu %%xmm11 , 0x10(%[dst_ptr], %%r13, 1)\n\t" 498 "movdqu %%xmm15 , 0x20(%[dst_ptr], %%r13, 1)\n\t" 499 500 : // outputs 501 [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), 502 [dst_ptr] "+r"(dst_ptr) 503 : // inputs 504 [start_depth] "r"(start_depth), 505 [dst_col_stride_q] "r"(dst_col_stride_q), 506 [run_depth_cells] "r"(run_depth_cells) 507 : // clobbers 508 "cc", "memory", "%xmm0", "%xmm1", "%xmm3", "%xmm2", "%xmm4", "%xmm5", 509 "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%r12", "%r13", "%r14", 510 "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"); 511 } 512 }; 513 #endif 514 515 } // namespace gemmlowp 516 517 #endif // GEMMLOWP_INTERNAL_KERNEL_SSE_H_ 518