1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2011 Benoit Jacob <jacob.benoit.1@gmail.com>
5 // Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
6 // Copyright (C) 2011-2012 Jitse Niesen <jitse@maths.leeds.ac.uk>
7 //
8 // This Source Code Form is subject to the terms of the Mozilla
9 // Public License v. 2.0. If a copy of the MPL was not distributed
10 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
11 
12 #ifndef EIGEN_ASSIGN_EVALUATOR_H
13 #define EIGEN_ASSIGN_EVALUATOR_H
14 
15 namespace Eigen {
16 
17 // This implementation is based on Assign.h
18 
19 namespace internal {
20 
21 /***************************************************************************
22 * Part 1 : the logic deciding a strategy for traversal and unrolling       *
23 ***************************************************************************/
24 
25 // copy_using_evaluator_traits is based on assign_traits
26 
27 template <typename DstEvaluator, typename SrcEvaluator, typename AssignFunc>
28 struct copy_using_evaluator_traits
29 {
30   typedef typename DstEvaluator::XprType Dst;
31   typedef typename Dst::Scalar DstScalar;
32 
33   enum {
34     DstFlags = DstEvaluator::Flags,
35     SrcFlags = SrcEvaluator::Flags
36   };
37 
38 public:
39   enum {
40     DstAlignment = DstEvaluator::Alignment,
41     SrcAlignment = SrcEvaluator::Alignment,
42     DstHasDirectAccess = DstFlags & DirectAccessBit,
43     JointAlignment = EIGEN_PLAIN_ENUM_MIN(DstAlignment,SrcAlignment)
44   };
45 
46 private:
47   enum {
48     InnerSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::SizeAtCompileTime)
49               : int(DstFlags)&RowMajorBit ? int(Dst::ColsAtCompileTime)
50               : int(Dst::RowsAtCompileTime),
51     InnerMaxSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::MaxSizeAtCompileTime)
52               : int(DstFlags)&RowMajorBit ? int(Dst::MaxColsAtCompileTime)
53               : int(Dst::MaxRowsAtCompileTime),
54     OuterStride = int(outer_stride_at_compile_time<Dst>::ret),
55     MaxSizeAtCompileTime = Dst::SizeAtCompileTime
56   };
57 
58   // TODO distinguish between linear traversal and inner-traversals
59   typedef typename find_best_packet<DstScalar,Dst::SizeAtCompileTime>::type LinearPacketType;
60   typedef typename find_best_packet<DstScalar,InnerSize>::type InnerPacketType;
61 
62   enum {
63     LinearPacketSize = unpacket_traits<LinearPacketType>::size,
64     InnerPacketSize = unpacket_traits<InnerPacketType>::size
65   };
66 
67 public:
68   enum {
69     LinearRequiredAlignment = unpacket_traits<LinearPacketType>::alignment,
70     InnerRequiredAlignment = unpacket_traits<InnerPacketType>::alignment
71   };
72 
73 private:
74   enum {
75     DstIsRowMajor = DstFlags&RowMajorBit,
76     SrcIsRowMajor = SrcFlags&RowMajorBit,
77     StorageOrdersAgree = (int(DstIsRowMajor) == int(SrcIsRowMajor)),
78     MightVectorize = bool(StorageOrdersAgree)
79                   && (int(DstFlags) & int(SrcFlags) & ActualPacketAccessBit)
80                   && bool(functor_traits<AssignFunc>::PacketAccess),
81     MayInnerVectorize  = MightVectorize
82                        && int(InnerSize)!=Dynamic && int(InnerSize)%int(InnerPacketSize)==0
83                        && int(OuterStride)!=Dynamic && int(OuterStride)%int(InnerPacketSize)==0
84                        && (EIGEN_UNALIGNED_VECTORIZE  || int(JointAlignment)>=int(InnerRequiredAlignment)),
85     MayLinearize = bool(StorageOrdersAgree) && (int(DstFlags) & int(SrcFlags) & LinearAccessBit),
86     MayLinearVectorize = bool(MightVectorize) && MayLinearize && DstHasDirectAccess
87                        && (EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment)>=int(LinearRequiredAlignment)) || MaxSizeAtCompileTime == Dynamic),
88       /* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
89          so it's only good for large enough sizes. */
90     MaySliceVectorize  = bool(MightVectorize) && bool(DstHasDirectAccess)
91                        && (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=(EIGEN_UNALIGNED_VECTORIZE?InnerPacketSize:(3*InnerPacketSize)))
92       /* slice vectorization can be slow, so we only want it if the slices are big, which is
93          indicated by InnerMaxSize rather than InnerSize, think of the case of a dynamic block
94          in a fixed-size matrix
95          However, with EIGEN_UNALIGNED_VECTORIZE and unrolling, slice vectorization is still worth it */
96   };
97 
98 public:
99   enum {
100     Traversal = int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize) ? int(LinearVectorizedTraversal)
101               : int(MayInnerVectorize)   ? int(InnerVectorizedTraversal)
102               : int(MayLinearVectorize)  ? int(LinearVectorizedTraversal)
103               : int(MaySliceVectorize)   ? int(SliceVectorizedTraversal)
104               : int(MayLinearize)        ? int(LinearTraversal)
105                                          : int(DefaultTraversal),
106     Vectorized = int(Traversal) == InnerVectorizedTraversal
107               || int(Traversal) == LinearVectorizedTraversal
108               || int(Traversal) == SliceVectorizedTraversal
109   };
110 
111   typedef typename conditional<int(Traversal)==LinearVectorizedTraversal, LinearPacketType, InnerPacketType>::type PacketType;
112 
113 private:
114   enum {
115     ActualPacketSize    = int(Traversal)==LinearVectorizedTraversal ? LinearPacketSize
116                         : Vectorized ? InnerPacketSize
117                         : 1,
118     UnrollingLimit      = EIGEN_UNROLLING_LIMIT * ActualPacketSize,
119     MayUnrollCompletely = int(Dst::SizeAtCompileTime) != Dynamic
120                        && int(Dst::SizeAtCompileTime) * (int(DstEvaluator::CoeffReadCost)+int(SrcEvaluator::CoeffReadCost)) <= int(UnrollingLimit),
121     MayUnrollInner      = int(InnerSize) != Dynamic
122                        && int(InnerSize) * (int(DstEvaluator::CoeffReadCost)+int(SrcEvaluator::CoeffReadCost)) <= int(UnrollingLimit)
123   };
124 
125 public:
126   enum {
127     Unrolling = (int(Traversal) == int(InnerVectorizedTraversal) || int(Traversal) == int(DefaultTraversal))
128                 ? (
129                     int(MayUnrollCompletely) ? int(CompleteUnrolling)
130                   : int(MayUnrollInner)      ? int(InnerUnrolling)
131                                              : int(NoUnrolling)
132                   )
133               : int(Traversal) == int(LinearVectorizedTraversal)
134                 ? ( bool(MayUnrollCompletely) && ( EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment)>=int(LinearRequiredAlignment)))
135                           ? int(CompleteUnrolling)
136                           : int(NoUnrolling) )
137               : int(Traversal) == int(LinearTraversal)
138                 ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling)
139                                               : int(NoUnrolling) )
140 #if EIGEN_UNALIGNED_VECTORIZE
141               : int(Traversal) == int(SliceVectorizedTraversal)
142                 ? ( bool(MayUnrollInner) ? int(InnerUnrolling)
143                                          : int(NoUnrolling) )
144 #endif
145               : int(NoUnrolling)
146   };
147 
148 #ifdef EIGEN_DEBUG_ASSIGN
debugcopy_using_evaluator_traits149   static void debug()
150   {
151     std::cerr << "DstXpr: " << typeid(typename DstEvaluator::XprType).name() << std::endl;
152     std::cerr << "SrcXpr: " << typeid(typename SrcEvaluator::XprType).name() << std::endl;
153     std::cerr.setf(std::ios::hex, std::ios::basefield);
154     std::cerr << "DstFlags" << " = " << DstFlags << " (" << demangle_flags(DstFlags) << " )" << std::endl;
155     std::cerr << "SrcFlags" << " = " << SrcFlags << " (" << demangle_flags(SrcFlags) << " )" << std::endl;
156     std::cerr.unsetf(std::ios::hex);
157     EIGEN_DEBUG_VAR(DstAlignment)
158     EIGEN_DEBUG_VAR(SrcAlignment)
159     EIGEN_DEBUG_VAR(LinearRequiredAlignment)
160     EIGEN_DEBUG_VAR(InnerRequiredAlignment)
161     EIGEN_DEBUG_VAR(JointAlignment)
162     EIGEN_DEBUG_VAR(InnerSize)
163     EIGEN_DEBUG_VAR(InnerMaxSize)
164     EIGEN_DEBUG_VAR(LinearPacketSize)
165     EIGEN_DEBUG_VAR(InnerPacketSize)
166     EIGEN_DEBUG_VAR(ActualPacketSize)
167     EIGEN_DEBUG_VAR(StorageOrdersAgree)
168     EIGEN_DEBUG_VAR(MightVectorize)
169     EIGEN_DEBUG_VAR(MayLinearize)
170     EIGEN_DEBUG_VAR(MayInnerVectorize)
171     EIGEN_DEBUG_VAR(MayLinearVectorize)
172     EIGEN_DEBUG_VAR(MaySliceVectorize)
173     std::cerr << "Traversal" << " = " << Traversal << " (" << demangle_traversal(Traversal) << ")" << std::endl;
174     EIGEN_DEBUG_VAR(SrcEvaluator::CoeffReadCost)
175     EIGEN_DEBUG_VAR(UnrollingLimit)
176     EIGEN_DEBUG_VAR(MayUnrollCompletely)
177     EIGEN_DEBUG_VAR(MayUnrollInner)
178     std::cerr << "Unrolling" << " = " << Unrolling << " (" << demangle_unrolling(Unrolling) << ")" << std::endl;
179     std::cerr << std::endl;
180   }
181 #endif
182 };
183 
184 /***************************************************************************
185 * Part 2 : meta-unrollers
186 ***************************************************************************/
187 
188 /************************
189 *** Default traversal ***
190 ************************/
191 
192 template<typename Kernel, int Index, int Stop>
193 struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling
194 {
195   // FIXME: this is not very clean, perhaps this information should be provided by the kernel?
196   typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
197   typedef typename DstEvaluatorType::XprType DstXprType;
198 
199   enum {
200     outer = Index / DstXprType::InnerSizeAtCompileTime,
201     inner = Index % DstXprType::InnerSizeAtCompileTime
202   };
203 
runcopy_using_evaluator_DefaultTraversal_CompleteUnrolling204   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
205   {
206     kernel.assignCoeffByOuterInner(outer, inner);
207     copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, Index+1, Stop>::run(kernel);
208   }
209 };
210 
211 template<typename Kernel, int Stop>
212 struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, Stop, Stop>
213 {
214   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { }
215 };
216 
217 template<typename Kernel, int Index_, int Stop>
218 struct copy_using_evaluator_DefaultTraversal_InnerUnrolling
219 {
220   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel, Index outer)
221   {
222     kernel.assignCoeffByOuterInner(outer, Index_);
223     copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, Index_+1, Stop>::run(kernel, outer);
224   }
225 };
226 
227 template<typename Kernel, int Stop>
228 struct copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, Stop, Stop>
229 {
230   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&, Index) { }
231 };
232 
233 /***********************
234 *** Linear traversal ***
235 ***********************/
236 
237 template<typename Kernel, int Index, int Stop>
238 struct copy_using_evaluator_LinearTraversal_CompleteUnrolling
239 {
240   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel)
241   {
242     kernel.assignCoeff(Index);
243     copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Index+1, Stop>::run(kernel);
244   }
245 };
246 
247 template<typename Kernel, int Stop>
248 struct copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Stop, Stop>
249 {
250   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { }
251 };
252 
253 /**************************
254 *** Inner vectorization ***
255 **************************/
256 
257 template<typename Kernel, int Index, int Stop>
258 struct copy_using_evaluator_innervec_CompleteUnrolling
259 {
260   // FIXME: this is not very clean, perhaps this information should be provided by the kernel?
261   typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
262   typedef typename DstEvaluatorType::XprType DstXprType;
263   typedef typename Kernel::PacketType PacketType;
264 
265   enum {
266     outer = Index / DstXprType::InnerSizeAtCompileTime,
267     inner = Index % DstXprType::InnerSizeAtCompileTime,
268     SrcAlignment = Kernel::AssignmentTraits::SrcAlignment,
269     DstAlignment = Kernel::AssignmentTraits::DstAlignment
270   };
271 
272   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
273   {
274     kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, inner);
275     enum { NextIndex = Index + unpacket_traits<PacketType>::size };
276     copy_using_evaluator_innervec_CompleteUnrolling<Kernel, NextIndex, Stop>::run(kernel);
277   }
278 };
279 
280 template<typename Kernel, int Stop>
281 struct copy_using_evaluator_innervec_CompleteUnrolling<Kernel, Stop, Stop>
282 {
283   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { }
284 };
285 
286 template<typename Kernel, int Index_, int Stop, int SrcAlignment, int DstAlignment>
287 struct copy_using_evaluator_innervec_InnerUnrolling
288 {
289   typedef typename Kernel::PacketType PacketType;
290   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel, Index outer)
291   {
292     kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, Index_);
293     enum { NextIndex = Index_ + unpacket_traits<PacketType>::size };
294     copy_using_evaluator_innervec_InnerUnrolling<Kernel, NextIndex, Stop, SrcAlignment, DstAlignment>::run(kernel, outer);
295   }
296 };
297 
298 template<typename Kernel, int Stop, int SrcAlignment, int DstAlignment>
299 struct copy_using_evaluator_innervec_InnerUnrolling<Kernel, Stop, Stop, SrcAlignment, DstAlignment>
300 {
301   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &, Index) { }
302 };
303 
304 /***************************************************************************
305 * Part 3 : implementation of all cases
306 ***************************************************************************/
307 
308 // dense_assignment_loop is based on assign_impl
309 
// Primary declaration only; the specializations provide the actual loops.
// Traversal and Unrolling default to the values computed by the kernel's
// AssignmentTraits, so dense_assignment_loop<Kernel> picks the matching
// specialization.
template<typename Kernel,
         int Traversal = Kernel::AssignmentTraits::Traversal,
         int Unrolling = Kernel::AssignmentTraits::Unrolling>
struct dense_assignment_loop;
314 
315 /************************
316 *** Default traversal ***
317 ************************/
318 
319 template<typename Kernel>
320 struct dense_assignment_loop<Kernel, DefaultTraversal, NoUnrolling>
321 {
322   EIGEN_DEVICE_FUNC static void EIGEN_STRONG_INLINE run(Kernel &kernel)
323   {
324     for(Index outer = 0; outer < kernel.outerSize(); ++outer) {
325       for(Index inner = 0; inner < kernel.innerSize(); ++inner) {
326         kernel.assignCoeffByOuterInner(outer, inner);
327       }
328     }
329   }
330 };
331 
332 template<typename Kernel>
333 struct dense_assignment_loop<Kernel, DefaultTraversal, CompleteUnrolling>
334 {
335   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
336   {
337     typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
338     copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, 0, DstXprType::SizeAtCompileTime>::run(kernel);
339   }
340 };
341 
342 template<typename Kernel>
343 struct dense_assignment_loop<Kernel, DefaultTraversal, InnerUnrolling>
344 {
345   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
346   {
347     typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
348 
349     const Index outerSize = kernel.outerSize();
350     for(Index outer = 0; outer < outerSize; ++outer)
351       copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, 0, DstXprType::InnerSizeAtCompileTime>::run(kernel, outer);
352   }
353 };
354 
355 /***************************
356 *** Linear vectorization ***
357 ***************************/
358 
359 
360 // The goal of unaligned_dense_assignment_loop is simply to factorize the handling
361 // of the non vectorizable beginning and ending parts
362 
363 template <bool IsAligned = false>
364 struct unaligned_dense_assignment_loop
365 {
366   // if IsAligned = true, then do nothing
367   template <typename Kernel>
368   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&, Index, Index) {}
369 };
370 
371 template <>
372 struct unaligned_dense_assignment_loop<false>
373 {
374   // MSVC must not inline this functions. If it does, it fails to optimize the
375   // packet access path.
376   // FIXME check which version exhibits this issue
377 #if EIGEN_COMP_MSVC
378   template <typename Kernel>
379   static EIGEN_DONT_INLINE void run(Kernel &kernel,
380                                     Index start,
381                                     Index end)
382 #else
383   template <typename Kernel>
384   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel,
385                                       Index start,
386                                       Index end)
387 #endif
388   {
389     for (Index index = start; index < end; ++index)
390       kernel.assignCoeff(index);
391   }
392 };
393 
394 template<typename Kernel>
395 struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, NoUnrolling>
396 {
397   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
398   {
399     const Index size = kernel.size();
400     typedef typename Kernel::Scalar Scalar;
401     typedef typename Kernel::PacketType PacketType;
402     enum {
403       requestedAlignment = Kernel::AssignmentTraits::LinearRequiredAlignment,
404       packetSize = unpacket_traits<PacketType>::size,
405       dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(requestedAlignment),
406       dstAlignment = packet_traits<Scalar>::AlignedOnScalar ? int(requestedAlignment)
407                                                             : int(Kernel::AssignmentTraits::DstAlignment),
408       srcAlignment = Kernel::AssignmentTraits::JointAlignment
409     };
410     const Index alignedStart = dstIsAligned ? 0 : internal::first_aligned<requestedAlignment>(kernel.dstDataPtr(), size);
411     const Index alignedEnd = alignedStart + ((size-alignedStart)/packetSize)*packetSize;
412 
413     unaligned_dense_assignment_loop<dstIsAligned!=0>::run(kernel, 0, alignedStart);
414 
415     for(Index index = alignedStart; index < alignedEnd; index += packetSize)
416       kernel.template assignPacket<dstAlignment, srcAlignment, PacketType>(index);
417 
418     unaligned_dense_assignment_loop<>::run(kernel, alignedEnd, size);
419   }
420 };
421 
422 template<typename Kernel>
423 struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, CompleteUnrolling>
424 {
425   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
426   {
427     typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
428     typedef typename Kernel::PacketType PacketType;
429 
430     enum { size = DstXprType::SizeAtCompileTime,
431            packetSize =unpacket_traits<PacketType>::size,
432            alignedSize = (size/packetSize)*packetSize };
433 
434     copy_using_evaluator_innervec_CompleteUnrolling<Kernel, 0, alignedSize>::run(kernel);
435     copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, alignedSize, size>::run(kernel);
436   }
437 };
438 
439 /**************************
440 *** Inner vectorization ***
441 **************************/
442 
443 template<typename Kernel>
444 struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, NoUnrolling>
445 {
446   typedef typename Kernel::PacketType PacketType;
447   enum {
448     SrcAlignment = Kernel::AssignmentTraits::SrcAlignment,
449     DstAlignment = Kernel::AssignmentTraits::DstAlignment
450   };
451   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
452   {
453     const Index innerSize = kernel.innerSize();
454     const Index outerSize = kernel.outerSize();
455     const Index packetSize = unpacket_traits<PacketType>::size;
456     for(Index outer = 0; outer < outerSize; ++outer)
457       for(Index inner = 0; inner < innerSize; inner+=packetSize)
458         kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, inner);
459   }
460 };
461 
462 template<typename Kernel>
463 struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, CompleteUnrolling>
464 {
465   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
466   {
467     typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
468     copy_using_evaluator_innervec_CompleteUnrolling<Kernel, 0, DstXprType::SizeAtCompileTime>::run(kernel);
469   }
470 };
471 
472 template<typename Kernel>
473 struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, InnerUnrolling>
474 {
475   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
476   {
477     typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
478     typedef typename Kernel::AssignmentTraits Traits;
479     const Index outerSize = kernel.outerSize();
480     for(Index outer = 0; outer < outerSize; ++outer)
481       copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, DstXprType::InnerSizeAtCompileTime,
482                                                    Traits::SrcAlignment, Traits::DstAlignment>::run(kernel, outer);
483   }
484 };
485 
486 /***********************
487 *** Linear traversal ***
488 ***********************/
489 
490 template<typename Kernel>
491 struct dense_assignment_loop<Kernel, LinearTraversal, NoUnrolling>
492 {
493   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
494   {
495     const Index size = kernel.size();
496     for(Index i = 0; i < size; ++i)
497       kernel.assignCoeff(i);
498   }
499 };
500 
501 template<typename Kernel>
502 struct dense_assignment_loop<Kernel, LinearTraversal, CompleteUnrolling>
503 {
504   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
505   {
506     typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
507     copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, 0, DstXprType::SizeAtCompileTime>::run(kernel);
508   }
509 };
510 
511 /**************************
512 *** Slice vectorization ***
513 ***************************/
514 
515 template<typename Kernel>
516 struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
517 {
518   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
519   {
520     typedef typename Kernel::Scalar Scalar;
521     typedef typename Kernel::PacketType PacketType;
522     enum {
523       packetSize = unpacket_traits<PacketType>::size,
524       requestedAlignment = int(Kernel::AssignmentTraits::InnerRequiredAlignment),
525       alignable = packet_traits<Scalar>::AlignedOnScalar || int(Kernel::AssignmentTraits::DstAlignment)>=sizeof(Scalar),
526       dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(requestedAlignment),
527       dstAlignment = alignable ? int(requestedAlignment)
528                                : int(Kernel::AssignmentTraits::DstAlignment)
529     };
530     const Scalar *dst_ptr = kernel.dstDataPtr();
531     if((!bool(dstIsAligned)) && (UIntPtr(dst_ptr) % sizeof(Scalar))>0)
532     {
533       // the pointer is not aligend-on scalar, so alignment is not possible
534       return dense_assignment_loop<Kernel,DefaultTraversal,NoUnrolling>::run(kernel);
535     }
536     const Index packetAlignedMask = packetSize - 1;
537     const Index innerSize = kernel.innerSize();
538     const Index outerSize = kernel.outerSize();
539     const Index alignedStep = alignable ? (packetSize - kernel.outerStride() % packetSize) & packetAlignedMask : 0;
540     Index alignedStart = ((!alignable) || bool(dstIsAligned)) ? 0 : internal::first_aligned<requestedAlignment>(dst_ptr, innerSize);
541 
542     for(Index outer = 0; outer < outerSize; ++outer)
543     {
544       const Index alignedEnd = alignedStart + ((innerSize-alignedStart) & ~packetAlignedMask);
545       // do the non-vectorizable part of the assignment
546       for(Index inner = 0; inner<alignedStart ; ++inner)
547         kernel.assignCoeffByOuterInner(outer, inner);
548 
549       // do the vectorizable part of the assignment
550       for(Index inner = alignedStart; inner<alignedEnd; inner+=packetSize)
551         kernel.template assignPacketByOuterInner<dstAlignment, Unaligned, PacketType>(outer, inner);
552 
553       // do the non-vectorizable part of the assignment
554       for(Index inner = alignedEnd; inner<innerSize ; ++inner)
555         kernel.assignCoeffByOuterInner(outer, inner);
556 
557       alignedStart = numext::mini((alignedStart+alignedStep)%packetSize, innerSize);
558     }
559   }
560 };
561 
562 #if EIGEN_UNALIGNED_VECTORIZE
563 template<typename Kernel>
564 struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, InnerUnrolling>
565 {
566   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
567   {
568     typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
569     typedef typename Kernel::PacketType PacketType;
570 
571     enum { size = DstXprType::InnerSizeAtCompileTime,
572            packetSize =unpacket_traits<PacketType>::size,
573            vectorizableSize = (size/packetSize)*packetSize };
574 
575     for(Index outer = 0; outer < kernel.outerSize(); ++outer)
576     {
577       copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, vectorizableSize, 0, 0>::run(kernel, outer);
578       copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, vectorizableSize, size>::run(kernel, outer);
579     }
580   }
581 };
582 #endif
583 
584 
585 /***************************************************************************
586 * Part 4 : Generic dense assignment kernel
587 ***************************************************************************/
588 
// This class generalizes the assignment of a coefficient (or packet) from one dense evaluator
// to another dense writable evaluator.
// It is parametrized by the two evaluators and the actual assignment functor.
// This level of abstraction keeps the evaluation loops as simple and as generic as possible.
// One can customize the assignment by using this generic dense_assignment_kernel with different
// functors, or by completely overloading it, bypassing the functor.
595 template<typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Functor, int Version = Specialized>
596 class generic_dense_assignment_kernel
597 {
598 protected:
599   typedef typename DstEvaluatorTypeT::XprType DstXprType;
600   typedef typename SrcEvaluatorTypeT::XprType SrcXprType;
601 public:
602 
603   typedef DstEvaluatorTypeT DstEvaluatorType;
604   typedef SrcEvaluatorTypeT SrcEvaluatorType;
605   typedef typename DstEvaluatorType::Scalar Scalar;
606   typedef copy_using_evaluator_traits<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor> AssignmentTraits;
607   typedef typename AssignmentTraits::PacketType PacketType;
608 
609 
610   EIGEN_DEVICE_FUNC generic_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
611     : m_dst(dst), m_src(src), m_functor(func), m_dstExpr(dstExpr)
612   {
613     #ifdef EIGEN_DEBUG_ASSIGN
614     AssignmentTraits::debug();
615     #endif
616   }
617 
618   EIGEN_DEVICE_FUNC Index size() const        { return m_dstExpr.size(); }
619   EIGEN_DEVICE_FUNC Index innerSize() const   { return m_dstExpr.innerSize(); }
620   EIGEN_DEVICE_FUNC Index outerSize() const   { return m_dstExpr.outerSize(); }
621   EIGEN_DEVICE_FUNC Index rows() const        { return m_dstExpr.rows(); }
622   EIGEN_DEVICE_FUNC Index cols() const        { return m_dstExpr.cols(); }
623   EIGEN_DEVICE_FUNC Index outerStride() const { return m_dstExpr.outerStride(); }
624 
625   EIGEN_DEVICE_FUNC DstEvaluatorType& dstEvaluator() { return m_dst; }
626   EIGEN_DEVICE_FUNC const SrcEvaluatorType& srcEvaluator() const { return m_src; }
627 
628   /// Assign src(row,col) to dst(row,col) through the assignment functor.
629   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index row, Index col)
630   {
631     m_functor.assignCoeff(m_dst.coeffRef(row,col), m_src.coeff(row,col));
632   }
633 
634   /// \sa assignCoeff(Index,Index)
635   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index index)
636   {
637     m_functor.assignCoeff(m_dst.coeffRef(index), m_src.coeff(index));
638   }
639 
640   /// \sa assignCoeff(Index,Index)
641   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeffByOuterInner(Index outer, Index inner)
642   {
643     Index row = rowIndexByOuterInner(outer, inner);
644     Index col = colIndexByOuterInner(outer, inner);
645     assignCoeff(row, col);
646   }
647 
648 
649   template<int StoreMode, int LoadMode, typename PacketType>
650   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacket(Index row, Index col)
651   {
652     m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(row,col), m_src.template packet<LoadMode,PacketType>(row,col));
653   }
654 
655   template<int StoreMode, int LoadMode, typename PacketType>
656   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacket(Index index)
657   {
658     m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(index), m_src.template packet<LoadMode,PacketType>(index));
659   }
660 
661   template<int StoreMode, int LoadMode, typename PacketType>
662   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketByOuterInner(Index outer, Index inner)
663   {
664     Index row = rowIndexByOuterInner(outer, inner);
665     Index col = colIndexByOuterInner(outer, inner);
666     assignPacket<StoreMode,LoadMode,PacketType>(row, col);
667   }
668 
669   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index rowIndexByOuterInner(Index outer, Index inner)
670   {
671     typedef typename DstEvaluatorType::ExpressionTraits Traits;
672     return int(Traits::RowsAtCompileTime) == 1 ? 0
673       : int(Traits::ColsAtCompileTime) == 1 ? inner
674       : int(DstEvaluatorType::Flags)&RowMajorBit ? outer
675       : inner;
676   }
677 
678   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index colIndexByOuterInner(Index outer, Index inner)
679   {
680     typedef typename DstEvaluatorType::ExpressionTraits Traits;
681     return int(Traits::ColsAtCompileTime) == 1 ? 0
682       : int(Traits::RowsAtCompileTime) == 1 ? inner
683       : int(DstEvaluatorType::Flags)&RowMajorBit ? inner
684       : outer;
685   }
686 
687   EIGEN_DEVICE_FUNC const Scalar* dstDataPtr() const
688   {
689     return m_dstExpr.data();
690   }
691 
692 protected:
693   DstEvaluatorType& m_dst;
694   const SrcEvaluatorType& m_src;
695   const Functor &m_functor;
696   // TODO find a way to avoid the needs of the original expression
697   DstXprType& m_dstExpr;
698 };
699 
700 /***************************************************************************
701 * Part 5 : Entry point for dense rectangular assignment
702 ***************************************************************************/
703 
704 template<typename DstXprType,typename SrcXprType, typename Functor>
705 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
706 void resize_if_allowed(DstXprType &dst, const SrcXprType& src, const Functor &/*func*/)
707 {
708   EIGEN_ONLY_USED_FOR_DEBUG(dst);
709   EIGEN_ONLY_USED_FOR_DEBUG(src);
710   eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
711 }
712 
713 template<typename DstXprType,typename SrcXprType, typename T1, typename T2>
714 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
715 void resize_if_allowed(DstXprType &dst, const SrcXprType& src, const internal::assign_op<T1,T2> &/*func*/)
716 {
717   Index dstRows = src.rows();
718   Index dstCols = src.cols();
719   if(((dst.rows()!=dstRows) || (dst.cols()!=dstCols)))
720     dst.resize(dstRows, dstCols);
721   eigen_assert(dst.rows() == dstRows && dst.cols() == dstCols);
722 }
723 
724 template<typename DstXprType, typename SrcXprType, typename Functor>
725 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src, const Functor &func)
726 {
727   typedef evaluator<DstXprType> DstEvaluatorType;
728   typedef evaluator<SrcXprType> SrcEvaluatorType;
729 
730   SrcEvaluatorType srcEvaluator(src);
731 
732   // NOTE To properly handle A = (A*A.transpose())/s with A rectangular,
733   // we need to resize the destination after the source evaluator has been created.
734   resize_if_allowed(dst, src, func);
735 
736   DstEvaluatorType dstEvaluator(dst);
737 
738   typedef generic_dense_assignment_kernel<DstEvaluatorType,SrcEvaluatorType,Functor> Kernel;
739   Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
740 
741   dense_assignment_loop<Kernel>::run(kernel);
742 }
743 
744 template<typename DstXprType, typename SrcXprType>
745 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src)
746 {
747   call_dense_assignment_loop(dst, src, internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar>());
748 }
749 
750 /***************************************************************************
751 * Part 6 : Generic assignment
752 ***************************************************************************/
753 
754 // Based on the respective shapes of the destination and source,
755 // the class AssignmentKind determine the kind of assignment mechanism.
756 // AssignmentKind must define a Kind typedef.
757 template<typename DstShape, typename SrcShape> struct AssignmentKind;
758 
759 // Assignement kind defined in this file:
760 struct Dense2Dense {};
761 struct EigenBase2EigenBase {};
762 
763 template<typename,typename> struct AssignmentKind { typedef EigenBase2EigenBase Kind; };
764 template<> struct AssignmentKind<DenseShape,DenseShape> { typedef Dense2Dense Kind; };
765 
766 // This is the main assignment class
767 template< typename DstXprType, typename SrcXprType, typename Functor,
768           typename Kind = typename AssignmentKind< typename evaluator_traits<DstXprType>::Shape , typename evaluator_traits<SrcXprType>::Shape >::Kind,
769           typename EnableIf = void>
770 struct Assignment;
771 
772 
// The only purpose of this call_assignment() function is to deal with noalias() / "assume-aliasing" and automatic transposition.
// Indeed, I (Gael) think that this concept of "assume-aliasing" was a mistake, and it makes things quite complicated.
// So this intermediate function removes everything related to "assume-aliasing" such that Assignment
// does not have to bother about these annoying details.
777 
778 template<typename Dst, typename Src>
779 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
780 void call_assignment(Dst& dst, const Src& src)
781 {
782   call_assignment(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>());
783 }
784 template<typename Dst, typename Src>
785 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
786 void call_assignment(const Dst& dst, const Src& src)
787 {
788   call_assignment(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>());
789 }
790 
791 // Deal with "assume-aliasing"
792 template<typename Dst, typename Src, typename Func>
793 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
794 void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if< evaluator_assume_aliasing<Src>::value, void*>::type = 0)
795 {
796   typename plain_matrix_type<Src>::type tmp(src);
797   call_assignment_no_alias(dst, tmp, func);
798 }
799 
800 template<typename Dst, typename Src, typename Func>
801 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
802 void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if<!evaluator_assume_aliasing<Src>::value, void*>::type = 0)
803 {
804   call_assignment_no_alias(dst, src, func);
805 }
806 
// By-pass "assume-aliasing": overload taken when the destination is a NoAlias wrapper.
// The wrapped expression is assigned directly through call_assignment_no_alias,
// whatever evaluator_assume_aliasing says about the source.
// When there is no aliasing, we require that 'dst' has been properly resized
template<typename Dst, template <typename> class StorageBase, typename Src, typename Func>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void call_assignment(NoAlias<Dst,StorageBase>& dst, const Src& src, const Func& func)
{
  call_assignment_no_alias(dst.expression(), src, func);
}
815 
816 
817 template<typename Dst, typename Src, typename Func>
818 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
819 void call_assignment_no_alias(Dst& dst, const Src& src, const Func& func)
820 {
821   enum {
822     NeedToTranspose = (    (int(Dst::RowsAtCompileTime) == 1 && int(Src::ColsAtCompileTime) == 1)
823                         || (int(Dst::ColsAtCompileTime) == 1 && int(Src::RowsAtCompileTime) == 1)
824                       ) && int(Dst::SizeAtCompileTime) != 1
825   };
826 
827   typedef typename internal::conditional<NeedToTranspose, Transpose<Dst>, Dst>::type ActualDstTypeCleaned;
828   typedef typename internal::conditional<NeedToTranspose, Transpose<Dst>, Dst&>::type ActualDstType;
829   ActualDstType actualDst(dst);
830 
831   // TODO check whether this is the right place to perform these checks:
832   EIGEN_STATIC_ASSERT_LVALUE(Dst)
833   EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(ActualDstTypeCleaned,Src)
834   EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename ActualDstTypeCleaned::Scalar,typename Src::Scalar);
835 
836   Assignment<ActualDstTypeCleaned,Src,Func>::run(actualDst, src, func);
837 }
838 template<typename Dst, typename Src>
839 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
840 void call_assignment_no_alias(Dst& dst, const Src& src)
841 {
842   call_assignment_no_alias(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>());
843 }
844 
845 template<typename Dst, typename Src, typename Func>
846 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
847 void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src, const Func& func)
848 {
849   // TODO check whether this is the right place to perform these checks:
850   EIGEN_STATIC_ASSERT_LVALUE(Dst)
851   EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Dst,Src)
852   EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename Dst::Scalar,typename Src::Scalar);
853 
854   Assignment<Dst,Src,Func>::run(dst, src, func);
855 }
856 template<typename Dst, typename Src>
857 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
858 void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src)
859 {
860   call_assignment_no_alias_no_transpose(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>());
861 }
862 
// Forward declaration. The dense-to-dense Assignment specialization calls this function
// on (dst, src) before running the assignment loop, unless EIGEN_NO_DEBUG is defined.
template<typename Dst, typename Src> void check_for_aliasing(const Dst &dst, const Src &src);
865 
866 // Generic Dense to Dense assignment
867 // Note that the last template argument "Weak" is needed to make it possible to perform
868 // both partial specialization+SFINAE without ambiguous specialization
869 template< typename DstXprType, typename SrcXprType, typename Functor, typename Weak>
870 struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Weak>
871 {
872   EIGEN_DEVICE_FUNC
873   static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
874   {
875 #ifndef EIGEN_NO_DEBUG
876     internal::check_for_aliasing(dst, src);
877 #endif
878 
879     call_dense_assignment_loop(dst, src, func);
880   }
881 };
882 
883 // Generic assignment through evalTo.
884 // TODO: not sure we have to keep that one, but it helps porting current code to new evaluator mechanism.
885 // Note that the last template argument "Weak" is needed to make it possible to perform
886 // both partial specialization+SFINAE without ambiguous specialization
887 template< typename DstXprType, typename SrcXprType, typename Functor, typename Weak>
888 struct Assignment<DstXprType, SrcXprType, Functor, EigenBase2EigenBase, Weak>
889 {
890   EIGEN_DEVICE_FUNC
891   static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
892   {
893     Index dstRows = src.rows();
894     Index dstCols = src.cols();
895     if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
896       dst.resize(dstRows, dstCols);
897 
898     eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
899     src.evalTo(dst);
900   }
901 
902   // NOTE The following two functions are templated to avoid their instanciation if not needed
903   //      This is needed because some expressions supports evalTo only and/or have 'void' as scalar type.
904   template<typename SrcScalarType>
905   EIGEN_DEVICE_FUNC
906   static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar,SrcScalarType> &/*func*/)
907   {
908     Index dstRows = src.rows();
909     Index dstCols = src.cols();
910     if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
911       dst.resize(dstRows, dstCols);
912 
913     eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
914     src.addTo(dst);
915   }
916 
917   template<typename SrcScalarType>
918   EIGEN_DEVICE_FUNC
919   static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar,SrcScalarType> &/*func*/)
920   {
921     Index dstRows = src.rows();
922     Index dstCols = src.cols();
923     if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
924       dst.resize(dstRows, dstCols);
925 
926     eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
927     src.subTo(dst);
928   }
929 };
930 
931 } // namespace internal
932 
933 } // end namespace Eigen
934 
935 #endif // EIGEN_ASSIGN_EVALUATOR_H
936