// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_PACKET_MATH_SSE_H
#define EIGEN_PACKET_MATH_SSE_H

namespace Eigen {

namespace internal {

#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif

#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
#endif

typedef __m128  Packet4f;
typedef __m128i Packet4i;
typedef __m128d Packet2d;

template<> struct is_arithmetic<__m128>  { enum { value = true }; };
template<> struct is_arithmetic<__m128i> { enum { value = true }; };
template<> struct is_arithmetic<__m128d> { enum { value = true }; };

#define vec4f_swizzle1(v,p,q,r,s) \
  (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), ((s)<<6|(r)<<4|(q)<<2|(p)))))

#define vec4i_swizzle1(v,p,q,r,s) \
  (_mm_shuffle_epi32( v, ((s)<<6|(r)<<4|(q)<<2|(p))))

#define vec2d_swizzle1(v,p,q) \
  (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), ((q*2+1)<<6|(q*2)<<4|(p*2+1)<<2|(p*2)))))

#define vec4f_swizzle2(a,b,p,q,r,s) \
  (_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p))))

#define vec4i_swizzle2(a,b,p,q,r,s) \
  (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p))))))
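
// Note on the swizzle macros above: the four index arguments select source lanes, and the
// expression ((s)<<6|(r)<<4|(q)<<2|(p)) is the usual _MM_SHUFFLE(s,r,q,p) immediate
// (two bits per destination lane, lowest lane first). Assuming a Packet4f v = {v0,v1,v2,v3}:
//   vec4f_swizzle1(v,0,1,2,3)  // identity: {v0,v1,v2,v3}
//   vec4f_swizzle1(v,3,2,1,0)  // reversed: {v3,v2,v1,v0}
//   vec4f_swizzle1(v,0,0,1,1)  // low half duplicated: {v0,v0,v1,v1}
// The two-operand variants vec4f_swizzle2/vec4i_swizzle2 take lanes p,q from a and r,s from b.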

#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
  const Packet4f p4f_##NAME = pset1<Packet4f>(X)

#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
  const Packet2d p2d_##NAME = pset1<Packet2d>(X)

#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
  const Packet4f p4f_##NAME = _mm_castsi128_ps(pset1<Packet4i>(X))

#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
  const Packet4i p4i_##NAME = pset1<Packet4i>(X)


template<> struct packet_traits<float>  : default_packet_traits
{
  typedef Packet4f type;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=4,

    HasDiv  = 1,
    HasSin  = EIGEN_FAST_MATH,
    HasCos  = EIGEN_FAST_MATH,
    HasLog  = 1,
    HasExp  = 1,
    HasSqrt = 1
  };
};
template<> struct packet_traits<double> : default_packet_traits
{
  typedef Packet2d type;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=2,

    HasDiv  = 1,
    HasExp  = 1,
    HasSqrt = 1
  };
};
template<> struct packet_traits<int>    : default_packet_traits
{
  typedef Packet4i type;
  enum {
    // FIXME check the Has*
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=4
  };
};

template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4}; };
template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; };
template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4}; };

#if defined(_MSC_VER) && (_MSC_VER==1500)
// Workaround MSVC 9 internal compiler error.
// TODO: It has been detected with win64 builds (amd64), so let's check whether it also happens in 32bits+SSE mode
// TODO: let's check whether there does not exist a better fix, like adding a pset0() function. (it crashed on pset1(0)).
template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { return _mm_set_ps(from,from,from,from); }
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set_pd(from,from); }
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from) { return _mm_set_epi32(from,from,from,from); }
#else
template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { return _mm_set1_ps(from); }
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); }
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from) { return _mm_set1_epi32(from); }
#endif

template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a)  { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }
template<> EIGEN_STRONG_INLINE Packet2d plset<double>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); }
template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a)      { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); }

template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_add_epi32(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_sub_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
{
  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
  return _mm_xor_ps(a,mask);
}
template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a)
{
  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x80000000));
  return _mm_xor_pd(a,mask);
}
template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a)
{
  return psub(_mm_setr_epi32(0,0,0,0), a);
}

template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }

template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_mul_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_mul_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_mullo_epi32(a,b);
#else
  // this version is slightly faster than 4 scalar products
  return vec4i_swizzle1(
            vec4i_swizzle2(
              _mm_mul_epu32(a,b),
              _mm_mul_epu32(vec4i_swizzle1(a,1,0,3,2),
                            vec4i_swizzle1(b,1,0,3,2)),
              0,2,0,2),
            0,2,1,3);
#endif
}
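
// How the SSE2 branch of pmul<Packet4i> works: _mm_mul_epu32 only multiplies the 32-bit
// elements in lanes 0 and 2, producing two 64-bit products. The inner swizzles move lanes
// 1 and 3 of a and b into the even positions so that a second _mm_mul_epu32 covers them.
// vec4i_swizzle2(...,0,2,0,2) then gathers the low 32 bits of the four products (which
// equal the truncated signed products), and the outer vec4i_swizzle1(...,0,2,1,3) puts
// them back into their original lane order.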

template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
{ eigen_assert(false && "packet integer division is not supported by SSE");
  return pset1<Packet4i>(0);
}

// for some weird reason, it has to be overloaded for packets of integers
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }

template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_min_epi32(a,b);
#else
  // after some bench, this version *is* faster than a scalar implementation
  Packet4i mask = _mm_cmplt_epi32(a,b);
  return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
#endif
}

template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_max_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_max_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_max_epi32(a,b);
#else
  // after some bench, this version *is* faster than a scalar implementation
  Packet4i mask = _mm_cmpgt_epi32(a,b);
  return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
#endif
}
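
// The SSE2 fallbacks of pmin/pmax above use the classic branchless select: the comparison
// yields an all-ones mask in the lanes where the first operand wins, so
// (mask & a) | (~mask & b) picks a in those lanes and b elsewhere. SSE4.1 replaces the
// three logical ops with a single _mm_min_epi32 / _mm_max_epi32 instruction.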

template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
  EIGEN_DEBUG_ALIGNED_LOAD
#ifdef EIGEN_ANDROID_SSE_WR
  // Workaround for X86 on Android crash on aligned operation.
  return _mm_loadu_ps(from);
#else
  return _mm_load_ps(from);
#endif
}
template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
  EIGEN_DEBUG_ALIGNED_LOAD
#ifdef EIGEN_ANDROID_SSE_WR
  // Workaround for X86 on Android crash on aligned operation.
  return _mm_loadu_pd(from);
#else
  return _mm_load_pd(from);
#endif
}
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) {
  EIGEN_DEBUG_ALIGNED_LOAD
#ifdef EIGEN_ANDROID_SSE_WR
  // Workaround for X86 on Android crash on aligned operation.
  return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from));
#else
  return _mm_load_si128(reinterpret_cast<const Packet4i*>(from));
#endif
}

#if defined(_MSC_VER)
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD
#if (_MSC_VER==1600)
  // NOTE Some versions of MSVC10 generate bad code when using _mm_loadu_ps
  // (i.e., they do not generate an unaligned load!)
  // TODO On most architectures this version should also be faster than a single _mm_loadu_ps,
  // so we could also enable it for MSVC08, but first we have to make sure this latter version
  // does not generate bad code when doing so...
  __m128 res = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)(from));
  res = _mm_loadh_pi(res, (const __m64*)(from+2));
  return res;
#else
  return _mm_loadu_ps(from);
#endif
}
template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); }
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int*    from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from)); }
#else
// Fast unaligned loads. Note that here we cannot directly use intrinsics: this would
// require pointer casting to incompatible pointer types and leads to invalid code
// because of the strict aliasing rule. The "dummy" stuff is required to enforce
// a correct instruction dependency.
// TODO: do the same for MSVC (ICC is compatible)
// NOTE: with the code below, MSVC's compiler crashes!

#if defined(__GNUC__) && defined(__i386__)
  // bug 195: gcc/i386 emits weird x87 fldl/fstpl instructions for _mm_load_sd
  #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1
#elif defined(__clang__)
  // bug 201: Segfaults in _mm_loadh_pd with clang 2.8
  #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1
#else
  #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 0
#endif

template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
{
  EIGEN_DEBUG_UNALIGNED_LOAD
#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS
  return _mm_loadu_ps(from);
#else
  __m128d res;
#ifdef EIGEN_ANDROID_SSE_WR
  // Workaround for X86 on Android crash on aligned operation.
  res = _mm_loadu_sd((const double*)(from));
#else
  res = _mm_load_sd((const double*)(from));
#endif
  res = _mm_loadh_pd(res, (const double*)(from+2));
  return _mm_castpd_ps(res);
#endif
}
template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
{
  EIGEN_DEBUG_UNALIGNED_LOAD
#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS
  return _mm_loadu_pd(from);
#else
  __m128d res;
#ifdef EIGEN_ANDROID_SSE_WR
  // Workaround for X86 on Android crash on aligned operation.
  res = _mm_loadu_sd(from);
#else
  res = _mm_load_sd(from);
#endif
  res = _mm_loadh_pd(res,from+1);
  return res;
#endif
}
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
{
  EIGEN_DEBUG_UNALIGNED_LOAD
#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS
  return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from));
#else
  __m128d res;
#ifdef EIGEN_ANDROID_SSE_WR
  // Workaround for X86 on Android crash on aligned operation.
  res = _mm_loadu_sd((const double*)(from));
#else
  res = _mm_load_sd((const double*)(from));
#endif
  res = _mm_loadh_pd(res, (const double*)(from+2));
  return _mm_castpd_si128(res);
#endif
}
#endif

template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
{
  return vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from))), 0, 0, 1, 1);
}
template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
{ return pset1<Packet2d>(from[0]); }
template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
{
  Packet4i tmp;
  tmp = _mm_loadl_epi64(reinterpret_cast<const Packet4i*>(from));
  return vec4i_swizzle1(tmp, 0, 0, 1, 1);
}
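
// ploaddup reads size/2 scalars and duplicates each of them: for Packet4f it turns
// {from[0], from[1]} into {from[0], from[0], from[1], from[1]}, which is what the
// swizzle(...,0,0,1,1) implements after loading the two scalars as one 64-bit chunk.
// For Packet2d only from[0] is read, hence the plain pset1.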

template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from) {
  EIGEN_DEBUG_ALIGNED_STORE
#ifdef EIGEN_ANDROID_SSE_WR
  // Workaround for X86 on Android crash on aligned operation.
  _mm_storeu_ps(to, from);
#else
  _mm_store_ps(to, from);
#endif
}
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
  EIGEN_DEBUG_ALIGNED_STORE
#ifdef EIGEN_ANDROID_SSE_WR
  // Workaround for X86 on Android crash on aligned operation.
  _mm_storeu_pd(to, from);
#else
  _mm_store_pd(to, from);
#endif
}
template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) {
  EIGEN_DEBUG_ALIGNED_STORE
#ifdef EIGEN_ANDROID_SSE_WR
  // Workaround for X86 on Android crash on aligned operation.
  _mm_storeu_si128(reinterpret_cast<Packet4i*>(to), from);
#else
  _mm_store_si128(reinterpret_cast<Packet4i*>(to), from);
#endif
}

template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
  EIGEN_DEBUG_UNALIGNED_STORE
  _mm_storel_pd((to), from);
  _mm_storeh_pd((to+1), from);
}
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), _mm_castps_pd(from)); }
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*      to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), _mm_castsi128_pd(from)); }
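
// pstoreu<double> splits the unaligned store into two 8-byte halves (_mm_storel_pd /
// _mm_storeh_pd), presumably because two half stores were cheaper than a movups on the
// CPUs targeted at the time. The float and int variants simply reinterpret their packet
// as a Packet2d and reuse it: only the raw 128 bits are copied, so the cast changes no values.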

// some compilers might be tempted to perform multiple moves instead of using a vector path.
template<> EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a)
{
  Packet4f pa = _mm_set_ss(a);
  pstore(to, vec4f_swizzle1(pa,0,0,0,0));
}
// some compilers might be tempted to perform multiple moves instead of using a vector path.
template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a)
{
  Packet2d pa = _mm_set_sd(a);
  pstore(to, vec2d_swizzle1(pa,0,0));
}

template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }

#if defined(_MSC_VER) && defined(_WIN64) && !defined(__INTEL_COMPILER)
// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010.
// Direct access to the struct members fixed bug #62.
template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { return a.m128_f32[0]; }
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return a.m128d_f64[0]; }
template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
#elif defined(_MSC_VER) && !defined(__INTEL_COMPILER)
// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010.
template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float x = _mm_cvtss_f32(a); return x; }
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double x = _mm_cvtsd_f64(a); return x; }
template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
#else
template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { return _mm_cvtss_f32(a); }
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return _mm_cvtsd_f64(a); }
template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { return _mm_cvtsi128_si32(a); }
#endif

template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
{ return _mm_shuffle_ps(a,a,0x1B); }
template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
{ return _mm_shuffle_pd(a,a,0x1); }
template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
{ return _mm_shuffle_epi32(a,0x1B); }


template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)
{
  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
  return _mm_and_ps(a,mask);
}
template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a)
{
  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
  return _mm_and_pd(a,mask);
}
template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a)
{
#ifdef EIGEN_VECTORIZE_SSSE3
  return _mm_abs_epi32(a);
#else
  Packet4i aux = _mm_srai_epi32(a,31);
  return _mm_sub_epi32(_mm_xor_si128(a,aux),aux);
#endif
}
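
// In the SSE2 fallback of pabs(Packet4i), _mm_srai_epi32(a,31) broadcasts the sign bit,
// giving 0 for non-negative lanes and -1 for negative ones; (a ^ aux) - aux is then a
// no-op for non-negative lanes and computes (~a) + 1, i.e. -a, for negative ones.
// The float/double versions simply clear the IEEE sign bit with a mask.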

EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs)
{
  vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
  vecs[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xAA));
  vecs[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xFF));
  vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
}

#ifdef EIGEN_VECTORIZE_SSE3
// TODO implement SSE2 versions as well as integer versions
template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
{
  return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3]));
}
template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
{
  return _mm_hadd_pd(vecs[0], vecs[1]);
}
// SSSE3 version:
// EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs)
// {
//   return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3]));
// }

template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
{
  Packet4f tmp0 = _mm_hadd_ps(a,a);
  return pfirst(_mm_hadd_ps(tmp0, tmp0));
}

template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return pfirst(_mm_hadd_pd(a, a)); }

// SSSE3 version:
// EIGEN_STRONG_INLINE float predux(const Packet4i& a)
// {
//   Packet4i tmp0 = _mm_hadd_epi32(a,a);
//   return pfirst(_mm_hadd_epi32(tmp0, tmp0));
// }
#else
// SSE2 versions
template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
{
  Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
  return pfirst(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
}
template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
{
  return pfirst(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
}

template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
{
  Packet4f tmp0, tmp1, tmp2;
  tmp0 = _mm_unpacklo_ps(vecs[0], vecs[1]);
  tmp1 = _mm_unpackhi_ps(vecs[0], vecs[1]);
  tmp2 = _mm_unpackhi_ps(vecs[2], vecs[3]);
  tmp0 = _mm_add_ps(tmp0, tmp1);
  tmp1 = _mm_unpacklo_ps(vecs[2], vecs[3]);
  tmp1 = _mm_add_ps(tmp1, tmp2);
  tmp2 = _mm_movehl_ps(tmp1, tmp0);
  tmp0 = _mm_movelh_ps(tmp0, tmp1);
  return _mm_add_ps(tmp0, tmp2);
}

template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
{
  return _mm_add_pd(_mm_unpacklo_pd(vecs[0], vecs[1]), _mm_unpackhi_pd(vecs[0], vecs[1]));
}
#endif  // SSE3
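
// predux sums all lanes of a packet. With SSE3 two horizontal adds (_mm_hadd_ps) do the
// job directly; the SSE2 path folds the packet in half with _mm_movehl_ps, adds, then adds
// the remaining pair of scalars. preduxp reduces four packets at once and returns the four
// sums gathered in a single packet.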

template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
{
  Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
  return pfirst(tmp) + pfirst(_mm_shuffle_epi32(tmp, 1));
}

template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
{
  Packet4i tmp0, tmp1, tmp2;
  tmp0 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
  tmp1 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
  tmp2 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
  tmp0 = _mm_add_epi32(tmp0, tmp1);
  tmp1 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
  tmp1 = _mm_add_epi32(tmp1, tmp2);
  tmp2 = _mm_unpacklo_epi64(tmp0, tmp1);
  tmp0 = _mm_unpackhi_epi64(tmp0, tmp1);
  return _mm_add_epi32(tmp0, tmp2);
}

// Other reduction functions:

// mul
template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
{
  Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a));
  return pfirst(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
}
template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
{
  return pfirst(_mm_mul_sd(a, _mm_unpackhi_pd(a,a)));
}
template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
{
  // after some experiments, it seems this is the fastest way to implement it
  // for GCC (e.g., reusing pmul is very slow!)
  // TODO try to call _mm_mul_epu32 directly
  EIGEN_ALIGN16 int aux[4];
  pstore(aux, a);
  return (aux[0] * aux[1]) * (aux[2] * aux[3]);
}

// min
template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
{
  Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a));
  return pfirst(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
}
template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
{
  return pfirst(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));
}
template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
{
  // after some experiments, it seems this is the fastest way to implement it
  // for GCC (e.g., it does not like using std::min after the pstore!)
  EIGEN_ALIGN16 int aux[4];
  pstore(aux, a);
  int aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
  int aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
  return aux0<aux2 ? aux0 : aux2;
}

// max
template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
{
  Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a,a));
  return pfirst(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
}
template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
{
  return pfirst(_mm_max_sd(a, _mm_unpackhi_pd(a,a)));
}
template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
{
  // after some experiments, it seems this is the fastest way to implement it
  // for GCC (e.g., it does not like using std::max after the pstore!)
  EIGEN_ALIGN16 int aux[4];
  pstore(aux, a);
  int aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
  int aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
  return aux0>aux2 ? aux0 : aux2;
}

#if (defined __GNUC__)
// template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
// {
//   Packet4f res = b;
//   asm("mulps %[a], %[b] \n\taddps %[c], %[b]" : [b] "+x" (res) : [a] "x" (a), [c] "x" (c));
//   return res;
// }
// EIGEN_STRONG_INLINE Packet4i _mm_alignr_epi8(const Packet4i& a, const Packet4i& b, const int i)
// {
//   Packet4i res = a;
//   asm("palignr %[i], %[a], %[b] " : [b] "+x" (res) : [a] "x" (a), [i] "i" (i));
//   return res;
// }
#endif
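
// palign_impl<Offset,Packet>::run(first, second) makes 'first' hold the packet starting
// Offset elements into the concatenation [first, second], i.e. the same effect as SSSE3's
// palignr instruction. The SSSE3 specializations below map directly onto _mm_alignr_epi8;
// the SSE2 fallbacks rebuild the shifted packet from moves and shuffles.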

#ifdef EIGEN_VECTORIZE_SSSE3
// SSSE3 versions
template<int Offset>
struct palign_impl<Offset,Packet4f>
{
  static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
  {
    if (Offset!=0)
      first = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(second), _mm_castps_si128(first), Offset*4));
  }
};

template<int Offset>
struct palign_impl<Offset,Packet4i>
{
  static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
  {
    if (Offset!=0)
      first = _mm_alignr_epi8(second,first, Offset*4);
  }
};

template<int Offset>
struct palign_impl<Offset,Packet2d>
{
  static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
  {
    if (Offset==1)
      first = _mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(second), _mm_castpd_si128(first), 8));
  }
};
#else
// SSE2 versions
template<int Offset>
struct palign_impl<Offset,Packet4f>
{
  static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
  {
    if (Offset==1)
    {
      first = _mm_move_ss(first,second);
      first = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(first),0x39));
    }
    else if (Offset==2)
    {
      first = _mm_movehl_ps(first,first);
      first = _mm_movelh_ps(first,second);
    }
    else if (Offset==3)
    {
      first = _mm_move_ss(first,second);
      first = _mm_shuffle_ps(first,second,0x93);
    }
  }
};

template<int Offset>
struct palign_impl<Offset,Packet4i>
{
  static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
  {
    if (Offset==1)
    {
      first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
      first = _mm_shuffle_epi32(first,0x39);
    }
    else if (Offset==2)
    {
      first = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(first)));
      first = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
    }
    else if (Offset==3)
    {
      first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
      first = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second),0x93));
    }
  }
};

template<int Offset>
struct palign_impl<Offset,Packet2d>
{
  static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
  {
    if (Offset==1)
    {
      first = _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(first),_mm_castpd_ps(first)));
      first = _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(first),_mm_castpd_ps(second)));
    }
  }
};
#endif

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_PACKET_MATH_SSE_H