// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
#define EIGEN_GENERAL_MATRIX_VECTOR_H

namespace Eigen {

namespace internal {

/* Optimized col-major matrix * vector product:
 * This algorithm processes 4 columns at once, which both reduces
 * the number of loads/stores of the result by a factor of 4 and reduces
 * the instruction dependency. Moreover, we know that all bands have the
 * same alignment pattern.
 *
 * Mixing type logic: C += alpha * A * B
 *  |  A  |  B  |alpha| comments
 *  |real |cplx |cplx | no vectorization
 *  |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization
 *  |cplx |real |cplx | invalid, the caller has to do tmp = A * B; C += alpha*tmp
 *  |cplx |real |real | optimal case, vectorization possible via real-cplx mul
 */
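// For reference, and ignoring conjugation and vectorization details, the
// kernel below is roughly equivalent to the following scalar loop (a sketch
// of the intent, not the actual implementation):
//
//   for (Index i=0; i<cols; ++i)
//     for (Index j=0; j<rows; ++j)
//       res[j] += alpha * rhs[i*rhsIncr] * lhs[j + i*lhsStride];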
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
{
  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  enum {
    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
                && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
  };

  typedef typename packet_traits<LhsScalar>::type _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type _RhsPacket;
  typedef typename packet_traits<ResScalar>::type _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  EIGEN_DONT_INLINE static void run(
    Index rows, Index cols,
    const LhsScalar* lhs, Index lhsStride,
    const RhsScalar* rhs, Index rhsIncr,
    ResScalar* res, Index resIncr, RhsScalar alpha);
};

template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run(
  Index rows, Index cols,
  const LhsScalar* lhs, Index lhsStride,
  const RhsScalar* rhs, Index rhsIncr,
  ResScalar* res, Index resIncr, RhsScalar alpha)
{
  EIGEN_UNUSED_VARIABLE(resIncr)
  eigen_internal_assert(resIncr==1);
#ifdef _EIGEN_ACCUMULATE_PACKETS
#error _EIGEN_ACCUMULATE_PACKETS has already been defined
#endif
#define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \
  pstore(&res[j], \
    padd(pload<ResPacket>(&res[j]), \
      padd( \
        padd(pcj.pmul(EIGEN_CAT(ploa , A0)<LhsPacket>(&lhs0[j]),  ptmp0), \
             pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs1[j]), ptmp1)), \
        padd(pcj.pmul(EIGEN_CAT(ploa , A2)<LhsPacket>(&lhs2[j]),  ptmp2), \
             pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs3[j]), ptmp3)) )))

  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
  if(ConjugateRhs)
    alpha = numext::conj(alpha);

  enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
  const Index columnsAtOnce = 4;
  const Index peels = 2;
  const Index LhsPacketAlignedMask = LhsPacketSize-1;
  const Index ResPacketAlignedMask = ResPacketSize-1;
//  const Index PeelAlignedMask = ResPacketSize*peels-1;
  const Index size = rows;

  // How many coeffs of the result do we have to skip to be aligned.
  // Here we assume data are at least aligned on the base scalar type.
  Index alignedStart = internal::first_aligned(res,size);
  Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;

  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
  Index alignmentPattern = alignmentStep==0 ? AllAligned
                         : alignmentStep==(LhsPacketSize/2) ? EvenAligned
                         : FirstAligned;
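  // For example, with 4-float SSE packets (LhsPacketSize==4): if lhsStride is
  // a multiple of 4, all columns share the same alignment offset (AllAligned);
  // if lhsStride%4==2, the offset alternates between two values, so every
  // other column shares the same alignment (EvenAligned); otherwise at most
  // one column out of 4 can have a given alignment (FirstAligned). This is
  // only an illustrative note; the code below does not assume SSE.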

  // we cannot assume the first element is aligned because of sub-matrices
  const Index lhsAlignmentOffset = internal::first_aligned(lhs,size);

  // find how many columns we have to skip to be aligned with the result (if possible)
  Index skipColumns = 0;
  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
  if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) )
  {
    alignedSize = 0;
    alignedStart = 0;
  }
  else if (LhsPacketSize>1)
  {
    eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);

    while (skipColumns<LhsPacketSize &&
           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
      ++skipColumns;
    if (skipColumns==LhsPacketSize)
    {
      // nothing can be aligned, no need to skip any column
      alignmentPattern = NoneAligned;
      skipColumns = 0;
    }
    else
    {
      skipColumns = (std::min)(skipColumns,cols);
      // note that the skipped columns are processed later.
    }

    eigen_internal_assert(  (alignmentPattern==NoneAligned)
                          || (skipColumns + columnsAtOnce >= cols)
                          || LhsPacketSize > size
                          || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);
  }
  else if(Vectorizable)
  {
    alignedStart = 0;
    alignedSize = size;
    alignmentPattern = AllAligned;
  }

  Index offset1 = (FirstAligned && alignmentStep==1?3:1);
  Index offset3 = (FirstAligned && alignmentStep==1?1:3);

  Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
  for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
  {
    RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[i*rhsIncr]),
              ptmp1 = pset1<RhsPacket>(alpha*rhs[(i+offset1)*rhsIncr]),
              ptmp2 = pset1<RhsPacket>(alpha*rhs[(i+2)*rhsIncr]),
              ptmp3 = pset1<RhsPacket>(alpha*rhs[(i+offset3)*rhsIncr]);

    // this helps a lot in generating better binary code
    const LhsScalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+offset1)*lhsStride,
                    *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;

    if (Vectorizable)
    {
      /* explicit vectorization */
      // process initial unaligned coeffs
      for (Index j=0; j<alignedStart; ++j)
      {
        res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
        res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
        res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
        res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
      }

      if (alignedSize>alignedStart)
      {
        switch(alignmentPattern)
        {
          case AllAligned:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,d,d);
            break;
          case EvenAligned:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,d);
            break;
          case FirstAligned:
          {
            Index j = alignedStart;
            if(peels>1)
            {
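              /* Here we process 4 columns with two peeled iterations to hide
               * the overhead of unaligned loads: as in the row-major kernel
               * below, the unaligned packets of lhs1, lhs2 and lhs3 are
               * rebuilt from two aligned loads combined with palign<k>, which
               * is much cheaper than plain unaligned loads. */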
              LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
              ResPacket T0, T1;

              A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
              A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
              A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);

              for (; j<peeledSize; j+=peels*ResPacketSize)
              {
                A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11);
                A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12);
                A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13);

                A00 = pload<LhsPacket>(&lhs0[j]);
                A10 = pload<LhsPacket>(&lhs0[j+LhsPacketSize]);
                T0 = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
                T1 = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));

                T0 = pcj.pmadd(A01, ptmp1, T0);
                A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01);
                T0 = pcj.pmadd(A02, ptmp2, T0);
                A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02);
                T0 = pcj.pmadd(A03, ptmp3, T0);
                pstore(&res[j],T0);
                A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03);
                T1 = pcj.pmadd(A11, ptmp1, T1);
                T1 = pcj.pmadd(A12, ptmp2, T1);
                T1 = pcj.pmadd(A13, ptmp3, T1);
                pstore(&res[j+ResPacketSize],T1);
              }
            }
            for (; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,du);
            break;
          }
          default:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(du,du,du);
            break;
        }
      }
    } // end explicit vectorization

    /* process remaining coeffs (or all if there is no explicit vectorization) */
    for (Index j=alignedSize; j<size; ++j)
    {
      res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
      res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
      res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
      res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
    }
  }

  // process remaining first and last columns (at most columnsAtOnce-1)
  Index end = cols;
  Index start = columnBound;
  do
  {
    for (Index k=start; k<end; ++k)
    {
      RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[k*rhsIncr]);
      const LhsScalar* lhs0 = lhs + k*lhsStride;

      if (Vectorizable)
      {
        /* explicit vectorization */
        // process first unaligned result's coeffs
        for (Index j=0; j<alignedStart; ++j)
          res[j] += cj.pmul(lhs0[j], pfirst(ptmp0));
        // process aligned result's coeffs
        if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
            pstore(&res[i], pcj.pmadd(pload<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
        else
          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
            pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
      }

      // process remaining scalars (or all if no explicit vectorization)
      for (Index i=alignedSize; i<size; ++i)
        res[i] += cj.pmul(lhs0[i], pfirst(ptmp0));
    }
    if (skipColumns)
    {
      start = 0;
      end = skipColumns;
      skipColumns = 0;
    }
    else
      break;
  } while(Vectorizable);
#undef _EIGEN_ACCUMULATE_PACKETS
}

/* Optimized row-major matrix * vector product:
 * This algorithm processes 4 rows at once, which both reduces
 * the number of loads/stores of the result by a factor of 4 and reduces
 * the instruction dependency. Moreover, we know that all bands have the
 * same alignment pattern.
 *
 * Mixing type logic:
 *  - alpha is always a complex (or converted to a complex)
 *  - no vectorization
 */
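// For reference, and ignoring conjugation and vectorization details, the
// kernel below is roughly equivalent to the following scalar loop (a sketch
// of the intent, not the actual implementation):
//
//   for (Index i=0; i<rows; ++i)
//   {
//     ResScalar tmp(0);
//     for (Index j=0; j<cols; ++j)
//       tmp += lhs[j + i*lhsStride] * rhs[j];
//     res[i*resIncr] += alpha * tmp;
//   }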
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
{
  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  enum {
    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
                && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
  };

  typedef typename packet_traits<LhsScalar>::type _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type _RhsPacket;
  typedef typename packet_traits<ResScalar>::type _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  EIGEN_DONT_INLINE static void run(
    Index rows, Index cols,
    const LhsScalar* lhs, Index lhsStride,
    const RhsScalar* rhs, Index rhsIncr,
    ResScalar* res, Index resIncr,
    ResScalar alpha);
};

template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run(
  Index rows, Index cols,
  const LhsScalar* lhs, Index lhsStride,
  const RhsScalar* rhs, Index rhsIncr,
  ResScalar* res, Index resIncr,
  ResScalar alpha)
{
  EIGEN_UNUSED_VARIABLE(rhsIncr);
  eigen_internal_assert(rhsIncr==1);
#ifdef _EIGEN_ACCUMULATE_PACKETS
#error _EIGEN_ACCUMULATE_PACKETS has already been defined
#endif

#define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\
  RhsPacket b = pload<RhsPacket>(&rhs[j]); \
  ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \
  ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \
  ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \
  ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); }

  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;

  enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
  const Index rowsAtOnce = 4;
  const Index peels = 2;
  const Index RhsPacketAlignedMask = RhsPacketSize-1;
  const Index LhsPacketAlignedMask = LhsPacketSize-1;
//  const Index PeelAlignedMask = RhsPacketSize*peels-1;
  const Index depth = cols;

  // How many coeffs of the rhs do we have to skip to be aligned?
  // Here we assume the data is at least aligned on the base scalar type;
  // if that's not the case then vectorization is discarded, see below.
  Index alignedStart = internal::first_aligned(rhs, depth);
  Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
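  // Note: unlike the column-major kernel above, which aligns its packet
  // accesses on the result vector, this kernel aligns its packet loads on the
  // rhs vector; the four lhs rows are then loaded either aligned or unaligned
  // depending on the alignmentPattern computed below.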

  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
  Index alignmentPattern = alignmentStep==0 ? AllAligned
                         : alignmentStep==(LhsPacketSize/2) ? EvenAligned
                         : FirstAligned;

  // we cannot assume the first element is aligned because of sub-matrices
  const Index lhsAlignmentOffset = internal::first_aligned(lhs,depth);

  // find how many rows we have to skip to be aligned with rhs (if possible)
  Index skipRows = 0;
  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
  if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) )
  {
    alignedSize = 0;
    alignedStart = 0;
  }
  else if (LhsPacketSize>1)
  {
    eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);

    while (skipRows<LhsPacketSize &&
           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
      ++skipRows;
    if (skipRows==LhsPacketSize)
    {
      // nothing can be aligned, no need to skip any row
      alignmentPattern = NoneAligned;
      skipRows = 0;
    }
    else
    {
      skipRows = (std::min)(skipRows,Index(rows));
      // note that the skipped rows are processed later.
    }
    eigen_internal_assert(  alignmentPattern==NoneAligned
                          || LhsPacketSize==1
                          || (skipRows + rowsAtOnce >= rows)
                          || LhsPacketSize > depth
                          || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);
  }
  else if(Vectorizable)
  {
    alignedStart = 0;
    alignedSize = depth;
    alignmentPattern = AllAligned;
  }

  Index offset1 = (FirstAligned && alignmentStep==1?3:1);
  Index offset3 = (FirstAligned && alignmentStep==1?1:3);

  Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
  for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
  {
    EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
    ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);

    // this helps the compiler generate good binary code
    const LhsScalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+offset1)*lhsStride,
                    *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;

    if (Vectorizable)
    {
      /* explicit vectorization */
      ResPacket ptmp0 = pset1<ResPacket>(ResScalar(0)), ptmp1 = pset1<ResPacket>(ResScalar(0)),
                ptmp2 = pset1<ResPacket>(ResScalar(0)), ptmp3 = pset1<ResPacket>(ResScalar(0));

      // process initial unaligned coeffs
      // FIXME this loop gets vectorized by the compiler!
      for (Index j=0; j<alignedStart; ++j)
      {
        RhsScalar b = rhs[j];
        tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
        tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
      }

      if (alignedSize>alignedStart)
      {
        switch(alignmentPattern)
        {
          case AllAligned:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,d,d);
            break;
          case EvenAligned:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,d);
            break;
          case FirstAligned:
          {
            Index j = alignedStart;
            if (peels>1)
            {
              /* Here we process 4 rows with two peeled iterations to hide
               * the overhead of unaligned loads. Moreover, unaligned loads are
               * handled using special shift/move operations between the two
               * aligned packets overlapping the desired unaligned packet.
               * This is *much* more efficient than basic unaligned loads.
               */
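              // Roughly, palign<Offset>(first,second) updates `first` with the
              // last (PacketSize-Offset) coeffs of `first` followed by the
              // first Offset coeffs of `second`; e.g. with 4-float packets,
              // palign<1>([a0 a1 a2 a3],[a4 a5 a6 a7]) yields [a1 a2 a3 a4].
              // (Illustrative note only.)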
              LhsPacket A01, A02, A03, A11, A12, A13;

              A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
              A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
              A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);

              for (; j<peeledSize; j+=peels*RhsPacketSize)
              {
                RhsPacket b = pload<RhsPacket>(&rhs[j]);
                A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11);
                A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12);
                A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13);

                ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), b, ptmp0);
                ptmp1 = pcj.pmadd(A01, b, ptmp1);
                A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01);
                ptmp2 = pcj.pmadd(A02, b, ptmp2);
                A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02);
                ptmp3 = pcj.pmadd(A03, b, ptmp3);
                A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03);

                b = pload<RhsPacket>(&rhs[j+RhsPacketSize]);
                ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0);
                ptmp1 = pcj.pmadd(A11, b, ptmp1);
                ptmp2 = pcj.pmadd(A12, b, ptmp2);
                ptmp3 = pcj.pmadd(A13, b, ptmp3);
              }
            }
            for (; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,du);
            break;
          }
          default:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(du,du,du);
            break;
        }
        tmp0 += predux(ptmp0);
        tmp1 += predux(ptmp1);
        tmp2 += predux(ptmp2);
        tmp3 += predux(ptmp3);
      }
    } // end explicit vectorization

    // process remaining coeffs (or all if no explicit vectorization)
    // FIXME this loop gets vectorized by the compiler!
    for (Index j=alignedSize; j<depth; ++j)
    {
      RhsScalar b = rhs[j];
      tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
      tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
    }
    res[i*resIncr]           += alpha*tmp0;
    res[(i+offset1)*resIncr] += alpha*tmp1;
    res[(i+2)*resIncr]       += alpha*tmp2;
    res[(i+offset3)*resIncr] += alpha*tmp3;
  }

  // process remaining first and last rows (at most rowsAtOnce-1)
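  // The do/while below executes at most twice: the first pass handles the
  // trailing rows [rowBound,rows), and, if some leading rows were skipped for
  // alignment purposes, a second pass handles the rows [0,skipRows).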
  Index end = rows;
  Index start = rowBound;
  do
  {
    for (Index i=start; i<end; ++i)
    {
      EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
      ResPacket ptmp0 = pset1<ResPacket>(tmp0);
      const LhsScalar* lhs0 = lhs + i*lhsStride;
      // process first unaligned result's coeffs
      // FIXME this loop gets vectorized by the compiler!
      for (Index j=0; j<alignedStart; ++j)
        tmp0 += cj.pmul(lhs0[j], rhs[j]);

      if (alignedSize>alignedStart)
      {
        // process aligned rhs coeffs
        if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
            ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
        else
          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
            ptmp0 = pcj.pmadd(ploadu<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
        tmp0 += predux(ptmp0);
      }

      // process remaining scalars
      // FIXME this loop gets vectorized by the compiler!
      for (Index j=alignedSize; j<depth; ++j)
        tmp0 += cj.pmul(lhs0[j], rhs[j]);
      res[i*resIncr] += alpha*tmp0;
    }
    if (skipRows)
    {
      start = 0;
      end = skipRows;
      skipRows = 0;
    }
    else
      break;
  } while(Vectorizable);

#undef _EIGEN_ACCUMULATE_PACKETS
}

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_GENERAL_MATRIX_VECTOR_H