Vc/include/Vc/sse/vector.tcc

   1 /*  This file is part of the Vc library.
   2
   3     Copyright (C) 2010-2012 Matthias Kretz <kretz@kde.org>
   4
   5     Vc is free software: you can redistribute it and/or modify
   6     it under the terms of the GNU Lesser General Public License as
   7     published by the Free Software Foundation, either version 3 of
   8     the License, or (at your option) any later version.
   9
  10     Vc is distributed in the hope that it will be useful, but
  11     WITHOUT ANY WARRANTY; without even the implied warranty of
  12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13     GNU Lesser General Public License for more details.
  14
  15     You should have received a copy of the GNU Lesser General Public
  16     License along with Vc.  If not, see <http://www.gnu.org/licenses/>.
  17
  18 */
  19
  20 #include "limits.h"
  21 #include "../common/bitscanintrinsics.h"
  22 #include "macros.h"
  23
  24 namespace Vc
  25 {
  26 ALIGN(64) extern unsigned int RandomState[16];
  27
  28 namespace SSE
  29 {
  30
  31 template<typename T, int Size> static inline const T *_IndexesFromZero() {
  32     if (Size == 4) {
  33         return reinterpret_cast<const T *>(_IndexesFromZero4);
  34     } else if (Size == 8) {
  35         return reinterpret_cast<const T *>(_IndexesFromZero8);
  36     } else if (Size == 16) {
  37         return reinterpret_cast<const T *>(_IndexesFromZero16);
  38     }
  39     return 0;
  40 }
  41
  42 ///////////////////////////////////////////////////////////////////////////////////////////
  43 // constants {{{1
  44 template<typename T> inline Vector<T>::Vector(VectorSpecialInitializerZero::ZEnum)
  45     : d(VectorHelper<VectorType>::zero())
  46 {
  47 }
  48
  49 template<typename T> inline Vector<T>::Vector(VectorSpecialInitializerOne::OEnum)
  50     : d(VectorHelper<T>::one())
  51 {
  52 }
  53
  54 template<typename T> inline Vector<T>::Vector(VectorSpecialInitializerIndexesFromZero::IEnum)
  55     : d(VectorHelper<VectorType>::load(_IndexesFromZero<EntryType, Size>(), Aligned))
  56 {
  57 }
  58
  59 template<typename T> inline Vector<T> Vector<T>::Zero()
  60 {
  61     return VectorHelper<VectorType>::zero();
  62 }
  63
  64 template<typename T> inline Vector<T> Vector<T>::One()
  65 {
  66     return VectorHelper<T>::one();
  67 }
  68
  69 template<typename T> inline Vector<T> Vector<T>::IndexesFromZero()
  70 {
  71     return VectorHelper<VectorType>::load(_IndexesFromZero<EntryType, Size>(), Aligned);
  72 }
  73
  74 // conversion/casts {{{1
  75 template<typename T> template<typename OtherT> inline INTRINSIC Vector<T>::Vector(const Vector<OtherT> &x)
  76     : d(StaticCastHelper<OtherT, T>::cast(x.data()))
  77 {
  78 }
  79
  80 template<> template<> inline INTRINSIC short_v &Vector<short>::operator=(const ushort_v &x) {
  81     data() = StaticCastHelper<unsigned short, short>::cast(x.data()); return *this;
  82 }
  83 template<> template<> inline INTRINSIC ushort_v &Vector<unsigned short>::operator=(const short_v &x) {
  84     data() = StaticCastHelper<short, unsigned short>::cast(x.data()); return *this;
  85 }
  86 template<> template<> inline INTRINSIC int_v &Vector<int>::operator=(const uint_v &x) {
  87     data() = StaticCastHelper<unsigned int, int>::cast(x.data()); return *this;
  88 }
  89 template<> template<> inline INTRINSIC uint_v &Vector<unsigned int>::operator=(const int_v &x) {
  90     data() = StaticCastHelper<int, unsigned int>::cast(x.data()); return *this;
  91 }
  92
  93 // broadcasts {{{1
  94 template<typename T> inline Vector<T>::Vector(EntryType a)
  95     : d(VectorHelper<T>::set(a))
  96 {
  97 }
  98
  99 ///////////////////////////////////////////////////////////////////////////////////////////
 100 // load ctors {{{1
 101 template<typename T> inline ALWAYS_INLINE Vector<T>::Vector(const EntryType *x) { load(x); }
 102 template<typename T> template<typename A> inline ALWAYS_INLINE Vector<T>::Vector(const EntryType *x, A a) { load(x, a); }
 103 template<typename T> template<typename OtherT> inline ALWAYS_INLINE Vector<T>::Vector(const OtherT *x) { load(x); }
 104 template<typename T> template<typename OtherT, typename A> inline ALWAYS_INLINE Vector<T>::Vector(const OtherT *x, A a) { load(x, a); }
 105
 106 ///////////////////////////////////////////////////////////////////////////////////////////
 107 // load member functions {{{1
 108 template<typename T> inline void INTRINSIC Vector<T>::load(const EntryType *mem)
 109 {
 110     load(mem, Aligned);
 111 }
 112
 113 template<typename T> template<typename A> inline void INTRINSIC Vector<T>::load(const EntryType *mem, A align)
 114 {
 115     d.v() = VectorHelper<VectorType>::load(mem, align);
 116 }
 117
 118 template<typename T> template<typename OtherT> inline void INTRINSIC Vector<T>::load(const OtherT *mem)
 119 {
 120     load(mem, Aligned);
 121 }
 122
 123 // float8: simply use the float implementation twice {{{2
 124 template<> template<typename OtherT, typename A> inline void INTRINSIC Vector<float8>::load(const OtherT *x, A a)
 125 {
 126     d.v() = M256::create(
 127             Vector<float>(&x[0], a).data(),
 128             Vector<float>(&x[4], a).data()
 129             );
 130 }
 131
 132 // LoadHelper {{{2
 133 template<typename DstT, typename SrcT, typename Flags> struct LoadHelper;
 134
 135 // float {{{2
 136 template<typename Flags> struct LoadHelper<float, double, Flags> {
 137     static inline __m128 load(const double *mem, Flags f)
 138     {
 139         return _mm_movelh_ps(_mm_cvtpd_ps(VectorHelper<__m128d>::load(&mem[0], f)),
 140                              _mm_cvtpd_ps(VectorHelper<__m128d>::load(&mem[2], f)));
 141     }
 142 };
 143 template<typename Flags> struct LoadHelper<float, unsigned int, Flags> {
 144     static inline __m128 load(const unsigned int *mem, Flags f)
 145     {
 146         return StaticCastHelper<unsigned int, float>::cast(VectorHelper<__m128i>::load(mem, f));
 147     }
 148 };
 149 template<typename Flags> struct LoadHelper<float, int, Flags> {
 150     static inline __m128 load(const int *mem, Flags f)
 151     {
 152         return StaticCastHelper<int, float>::cast(VectorHelper<__m128i>::load(mem, f));
 153     }
 154 };
 155 template<typename Flags> struct LoadHelper<float, unsigned short, Flags> {
 156     static inline __m128 load(const unsigned short *mem, Flags f)
 157     {
 158         return _mm_cvtepi32_ps(LoadHelper<int, unsigned short, Flags>::load(mem, f));
 159     }
 160 };
 161 template<typename Flags> struct LoadHelper<float, short, Flags> {
 162     static inline __m128 load(const short *mem, Flags f)
 163     {
 164         return _mm_cvtepi32_ps(LoadHelper<int, short, Flags>::load(mem, f));
 165     }
 166 };
 167 template<typename Flags> struct LoadHelper<float, unsigned char, Flags> {
 168     static inline __m128 load(const unsigned char *mem, Flags f)
 169     {
 170         return _mm_cvtepi32_ps(LoadHelper<int, unsigned char, Flags>::load(mem, f));
 171     }
 172 };
 173 template<typename Flags> struct LoadHelper<float, signed char, Flags> {
 174     static inline __m128 load(const signed char *mem, Flags f)
 175     {
 176         return _mm_cvtepi32_ps(LoadHelper<int, signed char, Flags>::load(mem, f));
 177     }
 178 };
 179
 180 // int {{{2
 181 template<typename Flags> struct LoadHelper<int, unsigned int, Flags> {
 182     static inline __m128i load(const unsigned int *mem, Flags f)
 183     {
 184         return VectorHelper<__m128i>::load(mem, f);
 185     }
 186 };
 187 // no difference between streaming and alignment, because the
 188 // 32/64 bit loads are not available as streaming loads, and can always be unaligned
 189 template<typename Flags> struct LoadHelper<int, unsigned short, Flags> {
 190     static inline __m128i load(const unsigned short *mem, Flags)
 191     {
 192         return _mm_cvtepu16_epi32( _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
 193     }
 194 };
 195 template<typename Flags> struct LoadHelper<int, short, Flags> {
 196     static inline __m128i load(const short *mem, Flags)
 197     {
 198         return _mm_cvtepi16_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
 199     }
 200 };
 201 template<typename Flags> struct LoadHelper<int, unsigned char, Flags> {
 202     static inline __m128i load(const unsigned char *mem, Flags)
 203     {
 204         return _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*reinterpret_cast<const int *>(mem)));
 205     }
 206 };
 207 template<typename Flags> struct LoadHelper<int, signed char, Flags> {
 208     static inline __m128i load(const signed char *mem, Flags)
 209     {
 210         return _mm_cvtepi8_epi32(_mm_cvtsi32_si128(*reinterpret_cast<const int *>(mem)));
 211     }
 212 };
 213
 214 // unsigned int {{{2
 215 template<typename Flags> struct LoadHelper<unsigned int, unsigned short, Flags> {
 216     static inline __m128i load(const unsigned short *mem, Flags)
 217     {
 218         return _mm_cvtepu16_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
 219     }
 220 };
 221 template<typename Flags> struct LoadHelper<unsigned int, unsigned char, Flags> {
 222     static inline __m128i load(const unsigned char *mem, Flags)
 223     {
 224         return _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*reinterpret_cast<const int *>(mem)));
 225     }
 226 };
 227
 228 // short {{{2
 229 template<typename Flags> struct LoadHelper<short, unsigned short, Flags> {
 230     static inline __m128i load(const unsigned short *mem, Flags f)
 231     {
 232         return VectorHelper<__m128i>::load(mem, f);
 233     }
 234 };
 235 template<typename Flags> struct LoadHelper<short, unsigned char, Flags> {
 236     static inline __m128i load(const unsigned char *mem, Flags)
 237     {
 238         return _mm_cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
 239     }
 240 };
 241 template<typename Flags> struct LoadHelper<short, signed char, Flags> {
 242     static inline __m128i load(const signed char *mem, Flags)
 243     {
 244         return _mm_cvtepi8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
 245     }
 246 };
 247
 248 // unsigned short {{{2
 249 template<typename Flags> struct LoadHelper<unsigned short, unsigned char, Flags> {
 250     static inline __m128i load(const unsigned char *mem, Flags)
 251     {
 252         return _mm_cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
 253     }
 254 };
 255
 256 // general load, implemented via LoadHelper {{{2
 257 template<typename DstT> template<typename SrcT, typename Flags> inline void INTRINSIC Vector<DstT>::load(const SrcT *x, Flags f)
 258 {
 259     d.v() = LoadHelper<DstT, SrcT, Flags>::load(x, f);
 260 }
 261
 262 ///////////////////////////////////////////////////////////////////////////////////////////
 263 // expand/combine {{{1
 264 template<typename T> inline Vector<T>::Vector(const Vector<typename CtorTypeHelper<T>::Type> *a)
 265     : d(VectorHelper<T>::concat(a[0].data(), a[1].data()))
 266 {
 267 }
 268
 269 template<typename T> inline void Vector<T>::expand(Vector<typename ExpandTypeHelper<T>::Type> *x) const
 270 {
 271     if (Size == 8u) {
 272         x[0].data() = VectorHelper<T>::expand0(data());
 273         x[1].data() = VectorHelper<T>::expand1(data());
 274     }
 275 }
 276
 277 ///////////////////////////////////////////////////////////////////////////////////////////
 278 // zeroing {{{1
 279 template<typename T> inline void Vector<T>::setZero()
 280 {
 281     data() = VectorHelper<VectorType>::zero();
 282 }
 283
 284 template<typename T> inline void Vector<T>::setZero(const Mask &k)
 285 {
 286     data() = VectorHelper<VectorType>::andnot_(mm128_reinterpret_cast<VectorType>(k.data()), data());
 287 }
 288
 289 template<> inline void INTRINSIC Vector<double>::setQnan()
 290 {
 291     data() = _mm_setallone_pd();
 292 }
 293 template<> inline void INTRINSIC Vector<double>::setQnan(Mask::Argument k)
 294 {
 295     data() = _mm_or_pd(data(), k.dataD());
 296 }
 297 template<> inline void INTRINSIC Vector<float>::setQnan()
 298 {
 299     data() = _mm_setallone_ps();
 300 }
 301 template<> inline void INTRINSIC Vector<float>::setQnan(Mask::Argument k)
 302 {
 303     data() = _mm_or_ps(data(), k.data());
 304 }
 305 template<> inline void INTRINSIC Vector<float8>::setQnan()
 306 {
 307     d.v()[0] = _mm_setallone_ps();
 308     d.v()[1] = _mm_setallone_ps();
 309 }
 310 template<> inline void INTRINSIC Vector<float8>::setQnan(Mask::Argument k)
 311 {
 312     d.v()[0] = _mm_or_ps(d.v()[0], k.data()[0]);
 313     d.v()[1] = _mm_or_ps(d.v()[1], k.data()[1]);
 314 }
 315
 316 ///////////////////////////////////////////////////////////////////////////////////////////
 317 // stores {{{1
 318 template<typename T> inline void Vector<T>::store(EntryType *mem) const
 319 {
 320     VectorHelper<VectorType>::store(mem, data(), Aligned);
 321 }
 322
 323 template<typename T> inline void Vector<T>::store(EntryType *mem, const Mask &mask) const
 324 {
 325     VectorHelper<VectorType>::store(mem, data(), mm128_reinterpret_cast<VectorType>(mask.data()), Aligned);
 326 }
 327
 328 template<typename T> template<typename A> inline void Vector<T>::store(EntryType *mem, A align) const
 329 {
 330     VectorHelper<VectorType>::store(mem, data(), align);
 331 }
 332
 333 template<typename T> template<typename A> inline void Vector<T>::store(EntryType *mem, const Mask &mask, A align) const
 334 {
 335     HV::store(mem, data(), mm128_reinterpret_cast<VectorType>(mask.data()), align);
 336 }
 337
 338 ///////////////////////////////////////////////////////////////////////////////////////////
 339 // division {{{1
 340 template<typename T> inline INTRINSIC CONST Vector<T> &WriteMaskedVector<T>::operator/=(const Vector<T> &x)
 341 {
 342     return operator=(*vec / x);
 343 }
 344 template<> inline INTRINSIC CONST int_v &WriteMaskedVector<int>::operator/=(const int_v &x)
 345 {
 346     Vc_foreach_bit (int i, mask) {
 347         vec->d.m(i) /= x.d.m(i);
 348     }
 349     return *vec;
 350 }
 351 template<> inline INTRINSIC CONST uint_v &WriteMaskedVector<unsigned int>::operator/=(const uint_v &x)
 352 {
 353     Vc_foreach_bit (int i, mask) {
 354         vec->d.m(i) /= x.d.m(i);
 355     }
 356     return *vec;
 357 }
 358 template<> inline INTRINSIC CONST short_v &WriteMaskedVector<short>::operator/=(const short_v &x)
 359 {
 360     Vc_foreach_bit (int i, mask) {
 361         vec->d.m(i) /= x.d.m(i);
 362     }
 363     return *vec;
 364 }
 365 template<> inline INTRINSIC CONST ushort_v &WriteMaskedVector<unsigned short>::operator/=(const ushort_v &x)
 366 {
 367     Vc_foreach_bit (int i, mask) {
 368         vec->d.m(i) /= x.d.m(i);
 369     }
 370     return *vec;
 371 }
 372
 373 template<typename T> inline Vector<T> &Vector<T>::operator/=(EntryType x)
 374 {
 375     if (VectorTraits<T>::HasVectorDivision) {
 376         return operator/=(Vector<T>(x));
 377     }
 378     for_all_vector_entries(i,
 379             d.m(i) /= x;
 380             );
 381     return *this;
 382 }
 383
 384 template<typename T> template<typename TT> inline PURE INTRINSIC VC_EXACT_TYPE(TT, typename DetermineEntryType<T>::Type, Vector<T>) Vector<T>::operator/(TT x) const
 385 {
 386     if (VectorTraits<T>::HasVectorDivision) {
 387         return operator/(Vector<T>(x));
 388     }
 389     Vector<T> r;
 390     for_all_vector_entries(i,
 391             r.d.m(i) = d.m(i) / x;
 392             );
 393     return r;
 394 }
 395
 396 template<typename T> inline Vector<T> &Vector<T>::operator/=(const Vector<T> &x)
 397 {
 398     for_all_vector_entries(i,
 399             d.m(i) /= x.d.m(i);
 400             );
 401     return *this;
 402 }
 403
 404 template<typename T> inline Vector<T> Vector<T>::operator/(const Vector<T> &x) const
 405 {
 406     Vector<T> r;
 407     for_all_vector_entries(i,
 408             r.d.m(i) = d.m(i) / x.d.m(i);
 409             );
 410     return r;
 411 }
 412
 413 template<> inline Vector<short> &Vector<short>::operator/=(const Vector<short> &x)
 414 {
 415     __m128 lo = _mm_cvtepi32_ps(VectorHelper<short>::expand0(d.v()));
 416     __m128 hi = _mm_cvtepi32_ps(VectorHelper<short>::expand1(d.v()));
 417     lo = _mm_div_ps(lo, _mm_cvtepi32_ps(VectorHelper<short>::expand0(x.d.v())));
 418     hi = _mm_div_ps(hi, _mm_cvtepi32_ps(VectorHelper<short>::expand1(x.d.v())));
 419     d.v() = _mm_packs_epi32(_mm_cvtps_epi32(lo), _mm_cvtps_epi32(hi));
 420     return *this;
 421 }
 422
 423 template<> inline Vector<short> ALWAYS_INLINE Vector<short>::operator/(const Vector<short> &x) const
 424 {
 425     __m128 lo = _mm_cvtepi32_ps(VectorHelper<short>::expand0(d.v()));
 426     __m128 hi = _mm_cvtepi32_ps(VectorHelper<short>::expand1(d.v()));
 427     lo = _mm_div_ps(lo, _mm_cvtepi32_ps(VectorHelper<short>::expand0(x.d.v())));
 428     hi = _mm_div_ps(hi, _mm_cvtepi32_ps(VectorHelper<short>::expand1(x.d.v())));
 429     return _mm_packs_epi32(_mm_cvtps_epi32(lo), _mm_cvtps_epi32(hi));
 430 }
 431
 432 template<> inline Vector<unsigned short> &Vector<unsigned short>::operator/=(const Vector<unsigned short> &x)
 433 {
 434     __m128 lo = _mm_cvtepi32_ps(VectorHelper<short>::expand0(d.v()));
 435     __m128 hi = _mm_cvtepi32_ps(VectorHelper<short>::expand1(d.v()));
 436     lo = _mm_div_ps(lo, _mm_cvtepi32_ps(VectorHelper<short>::expand0(x.d.v())));
 437     hi = _mm_div_ps(hi, _mm_cvtepi32_ps(VectorHelper<short>::expand1(x.d.v())));
 438     d.v() = _mm_packs_epi32(_mm_cvtps_epi32(lo), _mm_cvtps_epi32(hi));
 439     return *this;
 440 }
 441
 442 template<> inline Vector<unsigned short> ALWAYS_INLINE Vector<unsigned short>::operator/(const Vector<unsigned short> &x) const
 443 {
 444     __m128 lo = _mm_cvtepi32_ps(VectorHelper<short>::expand0(d.v()));
 445     __m128 hi = _mm_cvtepi32_ps(VectorHelper<short>::expand1(d.v()));
 446     lo = _mm_div_ps(lo, _mm_cvtepi32_ps(VectorHelper<short>::expand0(x.d.v())));
 447     hi = _mm_div_ps(hi, _mm_cvtepi32_ps(VectorHelper<short>::expand1(x.d.v())));
 448     return _mm_packs_epi32(_mm_cvtps_epi32(lo), _mm_cvtps_epi32(hi));
 449 }
 450
 451 template<> inline Vector<float> &Vector<float>::operator/=(const Vector<float> &x)
 452 {
 453     d.v() = _mm_div_ps(d.v(), x.d.v());
 454     return *this;
 455 }
 456
 457 template<> inline Vector<float> Vector<float>::operator/(const Vector<float> &x) const
 458 {
 459     return _mm_div_ps(d.v(), x.d.v());
 460 }
 461
 462 template<> inline Vector<float8> &Vector<float8>::operator/=(const Vector<float8> &x)
 463 {
 464     d.v()[0] = _mm_div_ps(d.v()[0], x.d.v()[0]);
 465     d.v()[1] = _mm_div_ps(d.v()[1], x.d.v()[1]);
 466     return *this;
 467 }
 468
 469 template<> inline Vector<float8> Vector<float8>::operator/(const Vector<float8> &x) const
 470 {
 471     Vector<float8> r;
 472     r.d.v()[0] = _mm_div_ps(d.v()[0], x.d.v()[0]);
 473     r.d.v()[1] = _mm_div_ps(d.v()[1], x.d.v()[1]);
 474     return r;
 475 }
 476
 477 template<> inline Vector<double> &Vector<double>::operator/=(const Vector<double> &x)
 478 {
 479     d.v() = _mm_div_pd(d.v(), x.d.v());
 480     return *this;
 481 }
 482
 483 template<> inline Vector<double> Vector<double>::operator/(const Vector<double> &x) const
 484 {
 485     return _mm_div_pd(d.v(), x.d.v());
 486 }
 487
 488 ///////////////////////////////////////////////////////////////////////////////////////////
 489 // operator- {{{1
 490 template<> inline Vector<double> PURE ALWAYS_INLINE FLATTEN Vector<double>::operator-() const
 491 {
 492     return _mm_xor_pd(d.v(), _mm_setsignmask_pd());
 493 }
 494 template<> inline Vector<float> PURE ALWAYS_INLINE FLATTEN Vector<float>::operator-() const
 495 {
 496     return _mm_xor_ps(d.v(), _mm_setsignmask_ps());
 497 }
 498 template<> inline Vector<float8> PURE ALWAYS_INLINE FLATTEN Vector<float8>::operator-() const
 499 {
 500     return M256::create(
 501             _mm_xor_ps(d.v()[0], _mm_setsignmask_ps()),
 502             _mm_xor_ps(d.v()[1], _mm_setsignmask_ps()));
 503 }
 504 template<> inline Vector<int> PURE ALWAYS_INLINE FLATTEN Vector<int>::operator-() const
 505 {
 506 #ifdef VC_IMPL_SSSE3
 507     return _mm_sign_epi32(d.v(), _mm_setallone_si128());
 508 #else
 509     return _mm_add_epi32(_mm_xor_si128(d.v(), _mm_setallone_si128()), _mm_setone_epi32());
 510 #endif
 511 }
 512 template<> inline Vector<int> PURE ALWAYS_INLINE FLATTEN Vector<unsigned int>::operator-() const
 513 {
 514 #ifdef VC_IMPL_SSSE3
 515     return _mm_sign_epi32(d.v(), _mm_setallone_si128());
 516 #else
 517     return _mm_add_epi32(_mm_xor_si128(d.v(), _mm_setallone_si128()), _mm_setone_epi32());
 518 #endif
 519 }
 520 template<> inline Vector<short> PURE ALWAYS_INLINE FLATTEN Vector<short>::operator-() const
 521 {
 522 #ifdef VC_IMPL_SSSE3
 523     return _mm_sign_epi16(d.v(), _mm_setallone_si128());
 524 #else
 525     return _mm_mullo_epi16(d.v(), _mm_setallone_si128());
 526 #endif
 527 }
 528 template<> inline Vector<short> PURE ALWAYS_INLINE FLATTEN Vector<unsigned short>::operator-() const
 529 {
 530 #ifdef VC_IMPL_SSSE3
 531     return _mm_sign_epi16(d.v(), _mm_setallone_si128());
 532 #else
 533     return _mm_mullo_epi16(d.v(), _mm_setallone_si128());
 534 #endif
 535 }
 536
 537 ///////////////////////////////////////////////////////////////////////////////////////////
 538 // integer ops {{{1
 539 #define OP_IMPL(T, symbol, fun) \
 540 template<> inline Vector<T> &Vector<T>::operator symbol##=(const Vector<T> &x) \
 541 { \
 542     d.v() = VectorHelper<T>::fun(d.v(), x.d.v()); \
 543     return *this; \
 544 } \
 545 template<> inline Vector<T>  Vector<T>::operator symbol(const Vector<T> &x) const \
 546 { \
 547     return VectorHelper<T>::fun(d.v(), x.d.v()); \
 548 }
 549 OP_IMPL(int, &, and_)
 550 OP_IMPL(int, |, or_)
 551 OP_IMPL(int, ^, xor_)
 552 OP_IMPL(unsigned int, &, and_)
 553 OP_IMPL(unsigned int, |, or_)
 554 OP_IMPL(unsigned int, ^, xor_)
 555 OP_IMPL(short, &, and_)
 556 OP_IMPL(short, |, or_)
 557 OP_IMPL(short, ^, xor_)
 558 OP_IMPL(unsigned short, &, and_)
 559 OP_IMPL(unsigned short, |, or_)
 560 OP_IMPL(unsigned short, ^, xor_)
 561 OP_IMPL(float, &, and_)
 562 OP_IMPL(float, |, or_)
 563 OP_IMPL(float, ^, xor_)
 564 OP_IMPL(float8, &, and_)
 565 OP_IMPL(float8, |, or_)
 566 OP_IMPL(float8, ^, xor_)
 567 OP_IMPL(double, &, and_)
 568 OP_IMPL(double, |, or_)
 569 OP_IMPL(double, ^, xor_)
 570 #undef OP_IMPL
 571
 572 #ifdef VC_IMPL_XOP
 573 static inline INTRINSIC CONST __m128i shiftLeft (const    int_v &value, const    int_v &count) { return _mm_sha_epi32(value.data(), count.data()); }
 574 static inline INTRINSIC CONST __m128i shiftLeft (const   uint_v &value, const   uint_v &count) { return _mm_shl_epi32(value.data(), count.data()); }
 575 static inline INTRINSIC CONST __m128i shiftLeft (const  short_v &value, const  short_v &count) { return _mm_sha_epi16(value.data(), count.data()); }
 576 static inline INTRINSIC CONST __m128i shiftLeft (const ushort_v &value, const ushort_v &count) { return _mm_shl_epi16(value.data(), count.data()); }
 577 static inline INTRINSIC CONST __m128i shiftRight(const    int_v &value, const    int_v &count) { return shiftLeft(value,          -count ); }
 578 static inline INTRINSIC CONST __m128i shiftRight(const   uint_v &value, const   uint_v &count) { return shiftLeft(value,   uint_v(-count)); }
 579 static inline INTRINSIC CONST __m128i shiftRight(const  short_v &value, const  short_v &count) { return shiftLeft(value,          -count ); }
 580 static inline INTRINSIC CONST __m128i shiftRight(const ushort_v &value, const ushort_v &count) { return shiftLeft(value, ushort_v(-count)); }
 581
 582 #define _VC_OP(T, symbol, impl) \
 583 template<> inline INTRINSIC T &T::operator symbol##=(T::AsArg shift) \
 584 { \
 585     d.v() = impl(*this, shift); \
 586     return *this; \
 587 } \
 588 template<> inline INTRINSIC T  T::operator symbol   (T::AsArg shift) const \
 589 { \
 590     return impl(*this, shift); \
 591 }
 592 VC_APPLY_2(VC_LIST_INT_VECTOR_TYPES, _VC_OP, <<, shiftLeft)
 593 VC_APPLY_2(VC_LIST_INT_VECTOR_TYPES, _VC_OP, >>, shiftRight)
 594 #undef _VC_OP
 595 #else
 596 #if defined(VC_GCC) && VC_GCC == 0x40600 && VC_IMPL_XOP
 597 #define VC_WORKAROUND_IN
 598 #define VC_WORKAROUND __attribute__((optimize("no-tree-vectorize"),weak))
 599 #else
 600 #define VC_WORKAROUND_IN inline
 601 #define VC_WORKAROUND INTRINSIC
 602 #endif
 603
 604 #define OP_IMPL(T, symbol) \
 605 template<> VC_WORKAROUND_IN Vector<T> VC_WORKAROUND &Vector<T>::operator symbol##=(Vector<T>::AsArg x) \
 606 { \
 607     for_all_vector_entries(i, \
 608             d.m(i) symbol##= x.d.m(i); \
 609             ); \
 610     return *this; \
 611 } \
 612 template<> inline Vector<T>  Vector<T>::operator symbol(Vector<T>::AsArg x) const \
 613 { \
 614     Vector<T> r; \
 615     for_all_vector_entries(i, \
 616             r.d.m(i) = d.m(i) symbol x.d.m(i); \
 617             ); \
 618     return r; \
 619 }
 620 OP_IMPL(int, <<)
 621 OP_IMPL(int, >>)
 622 OP_IMPL(unsigned int, <<)
 623 OP_IMPL(unsigned int, >>)
 624 OP_IMPL(short, <<)
 625 OP_IMPL(short, >>)
 626 OP_IMPL(unsigned short, <<)
 627 OP_IMPL(unsigned short, >>)
 628 #undef OP_IMPL
 629 #undef VC_WORKAROUND
 630 #undef VC_WORKAROUND_IN
 631 #endif
 632
 633 template<typename T> inline Vector<T> &Vector<T>::operator>>=(int shift) {
 634     d.v() = VectorHelper<T>::shiftRight(d.v(), shift);
 635     return *this;
 636 }
 637 template<typename T> inline Vector<T> Vector<T>::operator>>(int shift) const {
 638     return VectorHelper<T>::shiftRight(d.v(), shift);
 639 }
 640 template<typename T> inline Vector<T> &Vector<T>::operator<<=(int shift) {
 641     d.v() = VectorHelper<T>::shiftLeft(d.v(), shift);
 642     return *this;
 643 }
 644 template<typename T> inline Vector<T> Vector<T>::operator<<(int shift) const {
 645     return VectorHelper<T>::shiftLeft(d.v(), shift);
 646 }
 647
 648 ///////////////////////////////////////////////////////////////////////////////////////////
 649 // swizzles {{{1
 650 template<typename T> inline const Vector<T> INTRINSIC CONST &Vector<T>::abcd() const { return *this; }
 651 template<typename T> inline const Vector<T> INTRINSIC CONST  Vector<T>::cdab() const { return Mem::permute<X2, X3, X0, X1>(data()); }
 652 template<typename T> inline const Vector<T> INTRINSIC CONST  Vector<T>::badc() const { return Mem::permute<X1, X0, X3, X2>(data()); }
 653 template<typename T> inline const Vector<T> INTRINSIC CONST  Vector<T>::aaaa() const { return Mem::permute<X0, X0, X0, X0>(data()); }
 654 template<typename T> inline const Vector<T> INTRINSIC CONST  Vector<T>::bbbb() const { return Mem::permute<X1, X1, X1, X1>(data()); }
 655 template<typename T> inline const Vector<T> INTRINSIC CONST  Vector<T>::cccc() const { return Mem::permute<X2, X2, X2, X2>(data()); }
 656 template<typename T> inline const Vector<T> INTRINSIC CONST  Vector<T>::dddd() const { return Mem::permute<X3, X3, X3, X3>(data()); }
 657 template<typename T> inline const Vector<T> INTRINSIC CONST  Vector<T>::bcad() const { return Mem::permute<X1, X2, X0, X3>(data()); }
 658 template<typename T> inline const Vector<T> INTRINSIC CONST  Vector<T>::bcda() const { return Mem::permute<X1, X2, X3, X0>(data()); }
 659 template<typename T> inline const Vector<T> INTRINSIC CONST  Vector<T>::dabc() const { return Mem::permute<X3, X0, X1, X2>(data()); }
 660 template<typename T> inline const Vector<T> INTRINSIC CONST  Vector<T>::acbd() const { return Mem::permute<X0, X2, X1, X3>(data()); }
 661 template<typename T> inline const Vector<T> INTRINSIC CONST  Vector<T>::dbca() const { return Mem::permute<X3, X1, X2, X0>(data()); }
 662 template<typename T> inline const Vector<T> INTRINSIC CONST  Vector<T>::dcba() const { return Mem::permute<X3, X2, X1, X0>(data()); }
 663
 664 template<> inline const sfloat_v INTRINSIC CONST Vector<sfloat>::cdab() const { return M256::create(Mem::permute<X2, X3, X0, X1>(d.v()[0]), Mem::permute<X2, X3, X0, X1>(d.v()[1])); }
 665 template<> inline const sfloat_v INTRINSIC CONST Vector<sfloat>::badc() const { return M256::create(Mem::permute<X1, X0, X3, X2>(d.v()[0]), Mem::permute<X1, X0, X3, X2>(d.v()[1])); }
 666 template<> inline const sfloat_v INTRINSIC CONST Vector<sfloat>::aaaa() const { return M256::create(Mem::permute<X0, X0, X0, X0>(d.v()[0]), Mem::permute<X0, X0, X0, X0>(d.v()[1])); }
 667 template<> inline const sfloat_v INTRINSIC CONST Vector<sfloat>::bbbb() const { return M256::create(Mem::permute<X1, X1, X1, X1>(d.v()[0]), Mem::permute<X1, X1, X1, X1>(d.v()[1])); }
 668 template<> inline const sfloat_v INTRINSIC CONST Vector<sfloat>::cccc() const { return M256::create(Mem::permute<X2, X2, X2, X2>(d.v()[0]), Mem::permute<X2, X2, X2, X2>(d.v()[1])); }
 669 template<> inline const sfloat_v INTRINSIC CONST Vector<sfloat>::dddd() const { return M256::create(Mem::permute<X3, X3, X3, X3>(d.v()[0]), Mem::permute<X3, X3, X3, X3>(d.v()[1])); }
 670 template<> inline const sfloat_v INTRINSIC CONST Vector<sfloat>::bcad() const { return M256::create(Mem::permute<X1, X2, X0, X3>(d.v()[0]), Mem::permute<X1, X2, X0, X3>(d.v()[1])); }
 671 template<> inline const sfloat_v INTRINSIC CONST Vector<sfloat>::bcda() const { return M256::create(Mem::permute<X1, X2, X3, X0>(d.v()[0]), Mem::permute<X1, X2, X3, X0>(d.v()[1])); }
 672 template<> inline const sfloat_v INTRINSIC CONST Vector<sfloat>::dabc() const { return M256::create(Mem::permute<X3, X0, X1, X2>(d.v()[0]), Mem::permute<X3, X0, X1, X2>(d.v()[1])); }
 673 template<> inline const sfloat_v INTRINSIC CONST Vector<sfloat>::acbd() const { return M256::create(Mem::permute<X0, X2, X1, X3>(d.v()[0]), Mem::permute<X0, X2, X1, X3>(d.v()[1])); }
 674 template<> inline const sfloat_v INTRINSIC CONST Vector<sfloat>::dbca() const { return M256::create(Mem::permute<X3, X1, X2, X0>(d.v()[0]), Mem::permute<X3, X1, X2, X0>(d.v()[1])); }
 675 template<> inline const sfloat_v INTRINSIC CONST Vector<sfloat>::dcba() const { return M256::create(Mem::permute<X3, X2, X1, X0>(d.v()[0]), Mem::permute<X3, X2, X1, X0>(d.v()[1])); }
 676
 677 #define VC_SWIZZLES_16BIT_IMPL(T) \
 678 template<> inline const Vector<T> INTRINSIC CONST Vector<T>::cdab() const { return Mem::permute<X2, X3, X0, X1, X6, X7, X4, X5>(data()); } \
 679 template<> inline const Vector<T> INTRINSIC CONST Vector<T>::badc() const { return Mem::permute<X1, X0, X3, X2, X5, X4, X7, X6>(data()); } \
 680 template<> inline const Vector<T> INTRINSIC CONST Vector<T>::aaaa() const { return Mem::permute<X0, X0, X0, X0, X4, X4, X4, X4>(data()); } \
 681 template<> inline const Vector<T> INTRINSIC CONST Vector<T>::bbbb() const { return Mem::permute<X1, X1, X1, X1, X5, X5, X5, X5>(data()); } \
 682 template<> inline const Vector<T> INTRINSIC CONST Vector<T>::cccc() const { return Mem::permute<X2, X2, X2, X2, X6, X6, X6, X6>(data()); } \
 683 template<> inline const Vector<T> INTRINSIC CONST Vector<T>::dddd() const { return Mem::permute<X3, X3, X3, X3, X7, X7, X7, X7>(data()); } \
 684 template<> inline const Vector<T> INTRINSIC CONST Vector<T>::bcad() const { return Mem::permute<X1, X2, X0, X3, X5, X6, X4, X7>(data()); } \
 685 template<> inline const Vector<T> INTRINSIC CONST Vector<T>::bcda() const { return Mem::permute<X1, X2, X3, X0, X5, X6, X7, X4>(data()); } \
 686 template<> inline const Vector<T> INTRINSIC CONST Vector<T>::dabc() const { return Mem::permute<X3, X0, X1, X2, X7, X4, X5, X6>(data()); } \
 687 template<> inline const Vector<T> INTRINSIC CONST Vector<T>::acbd() const { return Mem::permute<X0, X2, X1, X3, X4, X6, X5, X7>(data()); } \
 688 template<> inline const Vector<T> INTRINSIC CONST Vector<T>::dbca() const { return Mem::permute<X3, X1, X2, X0, X7, X5, X6, X4>(data()); } \
 689 template<> inline const Vector<T> INTRINSIC CONST Vector<T>::dcba() const { return Mem::permute<X3, X2, X1, X0, X7, X6, X5, X4>(data()); }
 690 VC_SWIZZLES_16BIT_IMPL(short)
 691 VC_SWIZZLES_16BIT_IMPL(unsigned short)
 692 #undef VC_SWIZZLES_16BIT_IMPL
 693
 694 // operators {{{1
 695 #include "../common/operators.h"
 696 // }}}1
 697 // gathers {{{1
 698 template<typename T> template<typename IndexT> inline ALWAYS_INLINE Vector<T>::Vector(const EntryType *mem, const IndexT *indexes)
 699 {
 700     gather(mem, indexes);
 701 }
 702 template<typename T> template<typename IndexT> inline ALWAYS_INLINE Vector<T>::Vector(const EntryType *mem, const Vector<IndexT> indexes)
 703 {
 704     gather(mem, indexes);
 705 }
 706
 707 template<typename T> template<typename IndexT> inline ALWAYS_INLINE Vector<T>::Vector(const EntryType *mem, const IndexT *indexes, MaskArg mask)
 708     : d(HT::zero())
 709 {
 710     gather(mem, indexes, mask);
 711 }
 712
 713 template<typename T> template<typename IndexT> inline ALWAYS_INLINE Vector<T>::Vector(const EntryType *mem, const Vector<IndexT> indexes, MaskArg mask)
 714     : d(HT::zero())
 715 {
 716     gather(mem, indexes, mask);
 717 }
 718
 719 template<typename T> template<typename S1, typename IT> inline ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const EntryType S1::* member1, const IT indexes)
 720 {
 721     gather(array, member1, indexes);
 722 }
 723 template<typename T> template<typename S1, typename IT> inline ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const EntryType S1::* member1, const IT indexes, MaskArg mask)
 724     : d(HT::zero())
 725 {
 726     gather(array, member1, indexes, mask);
 727 }
 728 template<typename T> template<typename S1, typename S2, typename IT> inline ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, const IT indexes)
 729 {
 730     gather(array, member1, member2, indexes);
 731 }
 732 template<typename T> template<typename S1, typename S2, typename IT> inline ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, const IT indexes, MaskArg mask)
 733     : d(HT::zero())
 734 {
 735     gather(array, member1, member2, indexes, mask);
 736 }
 737 template<typename T> template<typename S1, typename IT1, typename IT2> inline ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const EntryType *const S1::* ptrMember1, const IT1 outerIndexes, const IT2 innerIndexes)
 738 {
 739     gather(array, ptrMember1, outerIndexes, innerIndexes);
 740 }
 741 template<typename T> template<typename S1, typename IT1, typename IT2> inline ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const EntryType *const S1::* ptrMember1, const IT1 outerIndexes, const IT2 innerIndexes, MaskArg mask)
 742     : d(HT::zero())
 743 {
 744     gather(array, ptrMember1, outerIndexes, innerIndexes, mask);
 745 }
 746
 747 template<typename T, size_t Size> struct IndexSizeChecker { static void check() {} };
 748 template<typename T, size_t Size> struct IndexSizeChecker<Vector<T>, Size>
 749 {
 750     static void check() {
 751         VC_STATIC_ASSERT(Vector<T>::Size >= Size, IndexVector_must_have_greater_or_equal_number_of_entries);
 752     }
 753 };
 754 template<> template<typename Index> inline void ALWAYS_INLINE FLATTEN Vector<double>::gather(const EntryType *mem, const Index indexes)
 755 {
 756     IndexSizeChecker<Index, Size>::check();
 757     d.v() = _mm_setr_pd(mem[indexes[0]], mem[indexes[1]]);
 758 }
 759 template<> template<typename Index> inline void ALWAYS_INLINE FLATTEN Vector<float>::gather(const EntryType *mem, const Index indexes)
 760 {
 761     IndexSizeChecker<Index, Size>::check();
 762     d.v() = _mm_setr_ps(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]);
 763 }
 764 template<> template<typename Index> inline void ALWAYS_INLINE FLATTEN Vector<float8>::gather(const EntryType *mem, const Index indexes)
 765 {
 766     IndexSizeChecker<Index, Size>::check();
 767     d.v()[0] = _mm_setr_ps(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]);
 768     d.v()[1] = _mm_setr_ps(mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
 769 }
 770 template<> template<typename Index> inline void ALWAYS_INLINE FLATTEN Vector<int>::gather(const EntryType *mem, const Index indexes)
 771 {
 772     IndexSizeChecker<Index, Size>::check();
 773     d.v() = _mm_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]);
 774 }
 775 template<> template<typename Index> inline void ALWAYS_INLINE FLATTEN Vector<unsigned int>::gather(const EntryType *mem, const Index indexes)
 776 {
 777     IndexSizeChecker<Index, Size>::check();
 778     d.v() = _mm_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]);
 779 }
 780 template<> template<typename Index> inline void ALWAYS_INLINE FLATTEN Vector<short>::gather(const EntryType *mem, const Index indexes)
 781 {
 782     IndexSizeChecker<Index, Size>::check();
 783     d.v() = _mm_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]],
 784             mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
 785 }
 786 template<> template<typename Index> inline void ALWAYS_INLINE FLATTEN Vector<unsigned short>::gather(const EntryType *mem, const Index indexes)
 787 {
 788     IndexSizeChecker<Index, Size>::check();
 789     d.v() = _mm_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]],
 790                 mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
 791 }
 792
 793 #ifdef VC_USE_SET_GATHERS
 794 template<typename T> template<typename IT> inline void ALWAYS_INLINE Vector<T>::gather(const EntryType *mem, Vector<IT> indexes, MaskArg mask)
 795 {
 796     IndexSizeChecker<Vector<IT>, Size>::check();
 797     indexes.setZero(!static_cast<typename Vector<IT>::Mask>(mask));
 798     (*this)(mask) = Vector<T>(mem, indexes);
 799 }
 800 #endif
 801
 802 #ifdef VC_USE_BSF_GATHERS
 803 #define VC_MASKED_GATHER                        \
 804     int bits = mask.toInt();                    \
 805     while (bits) {                              \
 806         const int i = _bit_scan_forward(bits);  \
 807         bits &= ~(1 << i); /* btr? */           \
 808         d.m(i) = ith_value(i);                  \
 809     }
 810 #elif defined(VC_USE_POPCNT_BSF_GATHERS)
 811 #define VC_MASKED_GATHER                        \
 812     unsigned int bits = mask.toInt();           \
 813     unsigned int low, high = 0;                 \
 814     switch (mask.count()) {             \
 815     case 8:                                     \
 816         high = _bit_scan_reverse(bits);         \
 817         d.m(high) = ith_value(high);            \
 818         high = (1 << high);                     \
 819     case 7:                                     \
 820         low = _bit_scan_forward(bits);          \
 821         bits ^= high | (1 << low);              \
 822         d.m(low) = ith_value(low);              \
 823     case 6:                                     \
 824         high = _bit_scan_reverse(bits);         \
 825         d.m(high) = ith_value(high);            \
 826         high = (1 << high);                     \
 827     case 5:                                     \
 828         low = _bit_scan_forward(bits);          \
 829         bits ^= high | (1 << low);              \
 830         d.m(low) = ith_value(low);              \
 831     case 4:                                     \
 832         high = _bit_scan_reverse(bits);         \
 833         d.m(high) = ith_value(high);            \
 834         high = (1 << high);                     \
 835     case 3:                                     \
 836         low = _bit_scan_forward(bits);          \
 837         bits ^= high | (1 << low);              \
 838         d.m(low) = ith_value(low);              \
 839     case 2:                                     \
 840         high = _bit_scan_reverse(bits);         \
 841         d.m(high) = ith_value(high);            \
 842     case 1:                                     \
 843         low = _bit_scan_forward(bits);          \
 844         d.m(low) = ith_value(low);              \
 845     case 0:                                     \
 846         break;                                  \
 847     }
 848 #else
 849 #define VC_MASKED_GATHER                        \
 850     if (mask.isEmpty()) {                       \
 851         return;                                 \
 852     }                                           \
 853     for_all_vector_entries(i,                   \
 854             if (mask[i]) d.m(i) = ith_value(i); \
 855             );
 856 #endif
 857
 858 template<typename T> template<typename Index>
 859 inline void INTRINSIC Vector<T>::gather(const EntryType *mem, Index indexes, MaskArg mask)
 860 {
 861     IndexSizeChecker<Index, Size>::check();
 862 #define ith_value(_i_) (mem[indexes[_i_]])
 863     VC_MASKED_GATHER
 864 #undef ith_value
 865 }
 866
 867 template<> template<typename S1, typename IT>
 868 inline void ALWAYS_INLINE FLATTEN Vector<double>::gather(const S1 *array, const EntryType S1::* member1, const IT indexes)
 869 {
 870     IndexSizeChecker<IT, Size>::check();
 871     d.v() = _mm_setr_pd(array[indexes[0]].*(member1), array[indexes[1]].*(member1));
 872 }
 873 template<> template<typename S1, typename IT>
 874 inline void ALWAYS_INLINE FLATTEN Vector<float>::gather(const S1 *array, const EntryType S1::* member1, const IT indexes)
 875 {
 876     IndexSizeChecker<IT, Size>::check();
 877     d.v() = _mm_setr_ps(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
 878             array[indexes[3]].*(member1));
 879 }
 880 template<> template<typename S1, typename IT>
 881 inline void ALWAYS_INLINE FLATTEN Vector<float8>::gather(const S1 *array, const EntryType S1::* member1, const IT indexes)
 882 {
 883     IndexSizeChecker<IT, Size>::check();
 884     d.v()[0] = _mm_setr_ps(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
 885             array[indexes[3]].*(member1));
 886     d.v()[1] = _mm_setr_ps(array[indexes[4]].*(member1), array[indexes[5]].*(member1), array[indexes[6]].*(member1),
 887             array[indexes[7]].*(member1));
 888 }
 889 template<> template<typename S1, typename IT>
 890 inline void ALWAYS_INLINE FLATTEN Vector<int>::gather(const S1 *array, const EntryType S1::* member1, const IT indexes)
 891 {
 892     IndexSizeChecker<IT, Size>::check();
 893     d.v() = _mm_setr_epi32(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
 894             array[indexes[3]].*(member1));
 895 }
 896 template<> template<typename S1, typename IT>
 897 inline void ALWAYS_INLINE FLATTEN Vector<unsigned int>::gather(const S1 *array, const EntryType S1::* member1, const IT indexes)
 898 {
 899     IndexSizeChecker<IT, Size>::check();
 900     d.v() = _mm_setr_epi32(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
 901             array[indexes[3]].*(member1));
 902 }
 903 template<> template<typename S1, typename IT>
 904 inline void ALWAYS_INLINE FLATTEN Vector<short>::gather(const S1 *array, const EntryType S1::* member1, const IT indexes)
 905 {
 906     IndexSizeChecker<IT, Size>::check();
 907     d.v() = _mm_setr_epi16(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
 908             array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1),
 909             array[indexes[6]].*(member1), array[indexes[7]].*(member1));
 910 }
 911 template<> template<typename S1, typename IT>
 912 inline void ALWAYS_INLINE FLATTEN Vector<unsigned short>::gather(const S1 *array, const EntryType S1::* member1, const IT indexes)
 913 {
 914     IndexSizeChecker<IT, Size>::check();
 915     d.v() = _mm_setr_epi16(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
 916             array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1),
 917             array[indexes[6]].*(member1), array[indexes[7]].*(member1));
 918 }
 919 template<typename T> template<typename S1, typename IT>
 920 inline void ALWAYS_INLINE FLATTEN Vector<T>::gather(const S1 *array, const EntryType S1::* member1, const IT indexes, MaskArg mask)
 921 {
 922     IndexSizeChecker<IT, Size>::check();
 923 #define ith_value(_i_) (array[indexes[_i_]].*(member1))
 924     VC_MASKED_GATHER
 925 #undef ith_value
 926 }
 927 template<> template<typename S1, typename S2, typename IT>
 928 inline void ALWAYS_INLINE FLATTEN Vector<double>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, const IT indexes)
 929 {
 930     IndexSizeChecker<IT, Size>::check();
 931     d.v() = _mm_setr_pd(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2));
 932 }
 933 template<> template<typename S1, typename S2, typename IT>
 934 inline void ALWAYS_INLINE FLATTEN Vector<float>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, const IT indexes)
 935 {
 936     IndexSizeChecker<IT, Size>::check();
 937     d.v() = _mm_setr_ps(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
 938             array[indexes[3]].*(member1).*(member2));
 939 }
 940 template<> template<typename S1, typename S2, typename IT>
 941 inline void ALWAYS_INLINE FLATTEN Vector<float8>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, const IT indexes)
 942 {
 943     IndexSizeChecker<IT, Size>::check();
 944     d.v()[0] = _mm_setr_ps(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2),
 945             array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2));
 946     d.v()[1] = _mm_setr_ps(array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
 947             array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
 948 }
 949 template<> template<typename S1, typename S2, typename IT>
 950 inline void ALWAYS_INLINE FLATTEN Vector<int>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, const IT indexes)
 951 {
 952     IndexSizeChecker<IT, Size>::check();
 953     d.v() = _mm_setr_epi32(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2),
 954             array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2));
 955 }
 956 template<> template<typename S1, typename S2, typename IT>
 957 inline void ALWAYS_INLINE FLATTEN Vector<unsigned int>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, const IT indexes)
 958 {
 959     IndexSizeChecker<IT, Size>::check();
 960     d.v() = _mm_setr_epi32(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2),
 961             array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2));
 962 }
 963 template<> template<typename S1, typename S2, typename IT>
 964 inline void ALWAYS_INLINE FLATTEN Vector<short>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, const IT indexes)
 965 {
 966     IndexSizeChecker<IT, Size>::check();
 967     d.v() = _mm_setr_epi16(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
 968             array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
 969             array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
 970 }
 971 template<> template<typename S1, typename S2, typename IT>
 972 inline void ALWAYS_INLINE FLATTEN Vector<unsigned short>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, const IT indexes)
 973 {
 974     IndexSizeChecker<IT, Size>::check();
 975     d.v() = _mm_setr_epi16(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
 976             array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
 977             array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
 978 }
 979 template<typename T> template<typename S1, typename S2, typename IT>
 980 inline void ALWAYS_INLINE FLATTEN Vector<T>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, const IT indexes, MaskArg mask)
 981 {
 982     IndexSizeChecker<IT, Size>::check();
 983 #define ith_value(_i_) (array[indexes[_i_]].*(member1).*(member2))
 984     VC_MASKED_GATHER
 985 #undef ith_value
 986 }
 987 template<> template<typename S1, typename IT1, typename IT2>
 988 inline void ALWAYS_INLINE FLATTEN Vector<double>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, const IT1 outerIndexes, const IT2 innerIndexes)
 989 {
 990     IndexSizeChecker<IT1, Size>::check();
 991     IndexSizeChecker<IT2, Size>::check();
 992     d.v() = _mm_setr_pd((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]],
 993             (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]]);
 994 }
 995 template<> template<typename S1, typename IT1, typename IT2>
 996 inline void ALWAYS_INLINE FLATTEN Vector<float>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, const IT1 outerIndexes, const IT2 innerIndexes)
 997 {
 998     IndexSizeChecker<IT1, Size>::check();
 999     IndexSizeChecker<IT2, Size>::check();
1000     d.v() = _mm_setr_ps((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
1001             (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]);
1002 }
1003 template<> template<typename S1, typename IT1, typename IT2>
1004 inline void ALWAYS_INLINE FLATTEN Vector<float8>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, const IT1 outerIndexes, const IT2 innerIndexes)
1005 {
1006     IndexSizeChecker<IT1, Size>::check();
1007     IndexSizeChecker<IT2, Size>::check();
1008     d.v()[0] = _mm_setr_ps((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
1009             (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]);
1010     d.v()[1] = _mm_setr_ps((array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
1011             (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
1012 }
1013 template<> template<typename S1, typename IT1, typename IT2>
1014 inline void ALWAYS_INLINE FLATTEN Vector<int>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, const IT1 outerIndexes, const IT2 innerIndexes)
1015 {
1016     IndexSizeChecker<IT1, Size>::check();
1017     IndexSizeChecker<IT2, Size>::check();
1018     d.v() = _mm_setr_epi32((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
1019             (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]);
1020 }
1021 template<> template<typename S1, typename IT1, typename IT2>
1022 inline void ALWAYS_INLINE FLATTEN Vector<unsigned int>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, const IT1 outerIndexes, const IT2 innerIndexes)
1023 {
1024     IndexSizeChecker<IT1, Size>::check();
1025     IndexSizeChecker<IT2, Size>::check();
1026     d.v() = _mm_setr_epi32((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
1027             (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]);
1028 }
1029 template<> template<typename S1, typename IT1, typename IT2>
1030 inline void ALWAYS_INLINE FLATTEN Vector<short>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, const IT1 outerIndexes, const IT2 innerIndexes)
1031 {
1032     IndexSizeChecker<IT1, Size>::check();
1033     IndexSizeChecker<IT2, Size>::check();
1034     d.v() = _mm_setr_epi16((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
1035             (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]],
1036             (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
1037             (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
1038 }
1039 template<> template<typename S1, typename IT1, typename IT2>
1040 inline void ALWAYS_INLINE FLATTEN Vector<unsigned short>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, const IT1 outerIndexes, const IT2 innerIndexes)
1041 {
1042     IndexSizeChecker<IT1, Size>::check();
1043     IndexSizeChecker<IT2, Size>::check();
1044     d.v() = _mm_setr_epi16((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
1045             (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]],
1046             (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
1047             (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
1048 }
1049 template<typename T> template<typename S1, typename IT1, typename IT2>
1050 inline void ALWAYS_INLINE FLATTEN Vector<T>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, const IT1 outerIndexes, const IT2 innerIndexes, MaskArg mask)
1051 {
1052     IndexSizeChecker<IT1, Size>::check();
1053     IndexSizeChecker<IT2, Size>::check();
1054 #define ith_value(_i_) (array[outerIndexes[_i_]].*(ptrMember1))[innerIndexes[_i_]]
1055     VC_MASKED_GATHER
1056 #undef ith_value
1057 }
1058 // scatters {{{1
1059 #undef VC_MASKED_GATHER
1060 #ifdef VC_USE_BSF_SCATTERS
1061 #define VC_MASKED_SCATTER                       \
1062     int bits = mask.toInt();                    \
1063     while (bits) {                              \
1064         const int i = _bit_scan_forward(bits);  \
1065         bits ^= (1 << i); /* btr? */            \
1066         ith_value(i) = d.m(i);                  \
1067     }
1068 #elif defined(VC_USE_POPCNT_BSF_SCATTERS)
1069 #define VC_MASKED_SCATTER                       \
1070     unsigned int bits = mask.toInt();           \
1071     unsigned int low, high = 0;                 \
1072     switch (mask.count()) {             \
1073     case 8:                                     \
1074         high = _bit_scan_reverse(bits);         \
1075         ith_value(high) = d.m(high);            \
1076         high = (1 << high);                     \
1077     case 7:                                     \
1078         low = _bit_scan_forward(bits);          \
1079         bits ^= high | (1 << low);              \
1080         ith_value(low) = d.m(low);              \
1081     case 6:                                     \
1082         high = _bit_scan_reverse(bits);         \
1083         ith_value(high) = d.m(high);            \
1084         high = (1 << high);                     \
1085     case 5:                                     \
1086         low = _bit_scan_forward(bits);          \
1087         bits ^= high | (1 << low);              \
1088         ith_value(low) = d.m(low);              \
1089     case 4:                                     \
1090         high = _bit_scan_reverse(bits);         \
1091         ith_value(high) = d.m(high);            \
1092         high = (1 << high);                     \
1093     case 3:                                     \
1094         low = _bit_scan_forward(bits);          \
1095         bits ^= high | (1 << low);              \
1096         ith_value(low) = d.m(low);              \
1097     case 2:                                     \
1098         high = _bit_scan_reverse(bits);         \
1099         ith_value(high) = d.m(high);            \
1100     case 1:                                     \
1101         low = _bit_scan_forward(bits);          \
1102         ith_value(low) = d.m(low);              \
1103     case 0:                                     \
1104         break;                                  \
1105     }
1106 #else
1107 #define VC_MASKED_SCATTER                       \
1108     if (mask.isEmpty()) {                       \
1109         return;                                 \
1110     }                                           \
1111     for_all_vector_entries(i,                   \
1112             if (mask[i]) ith_value(i) = d.m(i); \
1113             );
1114 #endif
1115
1116 template<typename T> template<typename Index> inline void ALWAYS_INLINE FLATTEN Vector<T>::scatter(EntryType *mem, const Index indexes) const
1117 {
1118     for_all_vector_entries(i,
1119             mem[indexes[i]] = d.m(i);
1120             );
1121 }
1122 template<typename T> template<typename Index> inline void ALWAYS_INLINE FLATTEN Vector<T>::scatter(EntryType *mem, const Index indexes, MaskArg mask) const
1123 {
1124 #define ith_value(_i_) mem[indexes[_i_]]
1125     VC_MASKED_SCATTER
1126 #undef ith_value
1127 }
1128 template<typename T> template<typename S1, typename IT> inline void ALWAYS_INLINE FLATTEN Vector<T>::scatter(S1 *array, EntryType S1::* member1, const IT indexes) const
1129 {
1130     for_all_vector_entries(i,
1131             array[indexes[i]].*(member1) = d.m(i);
1132             );
1133 }
1134 template<typename T> template<typename S1, typename IT> inline void ALWAYS_INLINE FLATTEN Vector<T>::scatter(S1 *array, EntryType S1::* member1, const IT indexes, MaskArg mask) const
1135 {
1136 #define ith_value(_i_) array[indexes[_i_]].*(member1)
1137     VC_MASKED_SCATTER
1138 #undef ith_value
1139 }
1140 template<typename T> template<typename S1, typename S2, typename IT> inline void ALWAYS_INLINE FLATTEN Vector<T>::scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, const IT indexes) const
1141 {
1142     for_all_vector_entries(i,
1143             array[indexes[i]].*(member1).*(member2) = d.m(i);
1144             );
1145 }
1146 template<typename T> template<typename S1, typename S2, typename IT> inline void ALWAYS_INLINE FLATTEN Vector<T>::scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, const IT indexes, MaskArg mask) const
1147 {
1148 #define ith_value(_i_) array[indexes[_i_]].*(member1).*(member2)
1149     VC_MASKED_SCATTER
1150 #undef ith_value
1151 }
1152 template<typename T> template<typename S1, typename IT1, typename IT2> inline void ALWAYS_INLINE FLATTEN Vector<T>::scatter(S1 *array, EntryType *S1::* ptrMember1, const IT1 outerIndexes, const IT2 innerIndexes) const
1153 {
1154     for_all_vector_entries(i,
1155             (array[innerIndexes[i]].*(ptrMember1))[outerIndexes[i]] = d.m(i);
1156             );
1157 }
1158 template<typename T> template<typename S1, typename IT1, typename IT2> inline void ALWAYS_INLINE FLATTEN Vector<T>::scatter(S1 *array, EntryType *S1::* ptrMember1, const IT1 outerIndexes, const IT2 innerIndexes, MaskArg mask) const
1159 {
1160 #define ith_value(_i_) (array[outerIndexes[_i_]].*(ptrMember1))[innerIndexes[_i_]]
1161     VC_MASKED_SCATTER
1162 #undef ith_value
1163 }
1164
1165 ///////////////////////////////////////////////////////////////////////////////////////////
1166 // operator[] {{{1
1167 template<typename T> inline typename Vector<T>::EntryType PURE INTRINSIC Vector<T>::operator[](size_t index) const
1168 {
1169     return d.m(index);
1170 }
1171 #ifdef VC_GCC
1172 template<> inline double PURE INTRINSIC Vector<double>::operator[](size_t index) const
1173 {
1174     if (__builtin_constant_p(index)) {
1175         return extract_double_imm(d.v(), index);
1176     }
1177     return d.m(index);
1178 }
1179 template<> inline float PURE INTRINSIC Vector<float>::operator[](size_t index) const
1180 {
1181     return extract_float(d.v(), index);
1182 }
1183 template<> inline float PURE INTRINSIC Vector<float8>::operator[](size_t index) const
1184 {
1185     if (__builtin_constant_p(index)) {
1186         if (index < 4) {
1187             return extract_float_imm(d.v()[0], index);
1188         }
1189         return extract_float_imm(d.v()[1], index - 4);
1190     }
1191     return d.m(index);
1192 }
1193 template<> inline int PURE INTRINSIC Vector<int>::operator[](size_t index) const
1194 {
1195     if (__builtin_constant_p(index)) {
1196 #if VC_GCC >= 0x40601 || !defined(VC_USE_VEX_CODING) // GCC < 4.6.1 incorrectly uses vmovq instead of movq for the following
1197 #ifdef __x86_64__
1198         if (index == 0) return _mm_cvtsi128_si64(d.v()) & 0xFFFFFFFFull;
1199         if (index == 1) return _mm_cvtsi128_si64(d.v()) >> 32;
1200 #else
1201         if (index == 0) return _mm_cvtsi128_si32(d.v());
1202 #endif
1203 #endif
1204 #ifdef VC_IMPL_SSE4_1
1205         return _mm_extract_epi32(d.v(), index);
1206 #else
1207         return _mm_cvtsi128_si32(_mm_srli_si128(d.v(), index * 4));
1208 #endif
1209     }
1210     return d.m(index);
1211 }
1212 template<> inline unsigned int PURE INTRINSIC Vector<unsigned int>::operator[](size_t index) const
1213 {
1214     if (__builtin_constant_p(index)) {
1215 #if VC_GCC >= 0x40601 || !defined(VC_USE_VEX_CODING) // GCC < 4.6.1 incorrectly uses vmovq instead of movq for the following
1216 #ifdef __x86_64__
1217         if (index == 0) return _mm_cvtsi128_si64(d.v()) & 0xFFFFFFFFull;
1218         if (index == 1) return _mm_cvtsi128_si64(d.v()) >> 32;
1219 #else
1220         if (index == 0) return _mm_cvtsi128_si32(d.v());
1221 #endif
1222 #endif
1223 #ifdef VC_IMPL_SSE4_1
1224         return _mm_extract_epi32(d.v(), index);
1225 #else
1226         return _mm_cvtsi128_si32(_mm_srli_si128(d.v(), index * 4));
1227 #endif
1228     }
1229     return d.m(index);
1230 }
1231 template<> inline short PURE INTRINSIC Vector<short>::operator[](size_t index) const
1232 {
1233     if (__builtin_constant_p(index)) {
1234         return _mm_extract_epi16(d.v(), index);
1235     }
1236     return d.m(index);
1237 }
1238 template<> inline unsigned short PURE INTRINSIC Vector<unsigned short>::operator[](size_t index) const
1239 {
1240     if (__builtin_constant_p(index)) {
1241         return _mm_extract_epi16(d.v(), index);
1242     }
1243     return d.m(index);
1244 }
1245 #endif // GCC
1246 ///////////////////////////////////////////////////////////////////////////////////////////
1247 // horizontal ops {{{1
1248 #ifndef VC_IMPL_SSE4_1
1249 // without SSE4.1 integer multiplication is slow and we rather multiply the scalars
1250 template<> inline int INTRINSIC Vector<int>::product() const
1251 {
1252     return (d.m(0) * d.m(1)) * (d.m(2) * d.m(3));
1253 }
1254 template<> inline unsigned int INTRINSIC Vector<unsigned int>::product() const
1255 {
1256     return (d.m(0) * d.m(1)) * (d.m(2) * d.m(3));
1257 }
1258 #endif
1259 template<typename T> inline typename Vector<T>::EntryType Vector<T>::min(MaskArg m) const
1260 {
1261     Vector<T> tmp = std::numeric_limits<Vector<T> >::max();
1262     tmp(m) = *this;
1263     return tmp.min();
1264 }
1265 template<typename T> inline typename Vector<T>::EntryType Vector<T>::max(MaskArg m) const
1266 {
1267     Vector<T> tmp = std::numeric_limits<Vector<T> >::min();
1268     tmp(m) = *this;
1269     return tmp.max();
1270 }
1271 template<typename T> inline typename Vector<T>::EntryType Vector<T>::product(MaskArg m) const
1272 {
1273     Vector<T> tmp(VectorSpecialInitializerOne::One);
1274     tmp(m) = *this;
1275     return tmp.product();
1276 }
1277 template<typename T> inline typename Vector<T>::EntryType Vector<T>::sum(MaskArg m) const
1278 {
1279     Vector<T> tmp(VectorSpecialInitializerZero::Zero);
1280     tmp(m) = *this;
1281     return tmp.sum();
1282 }
1283
1284 ///////////////////////////////////////////////////////////////////////////////////////////
1285 // copySign {{{1
1286 template<> inline Vector<float> INTRINSIC Vector<float>::copySign(Vector<float>::AsArg reference) const
1287 {
1288     return _mm_or_ps(
1289             _mm_and_ps(reference.d.v(), _mm_setsignmask_ps()),
1290             _mm_and_ps(d.v(), _mm_setabsmask_ps())
1291             );
1292 }
1293 template<> inline Vector<float8> INTRINSIC Vector<float8>::copySign(Vector<float8>::AsArg reference) const
1294 {
1295     return M256::create( _mm_or_ps(
1296                 _mm_and_ps(reference.d.v()[0], _mm_setsignmask_ps()),
1297                 _mm_and_ps(d.v()[0], _mm_setabsmask_ps())
1298                 ), _mm_or_ps(
1299                 _mm_and_ps(reference.d.v()[1], _mm_setsignmask_ps()),
1300                 _mm_and_ps(d.v()[1], _mm_setabsmask_ps())
1301                 )
1302             );
1303 }
1304 template<> inline Vector<double> INTRINSIC Vector<double>::copySign(Vector<double>::AsArg reference) const
1305 {
1306     return _mm_or_pd(
1307             _mm_and_pd(reference.d.v(), _mm_setsignmask_pd()),
1308             _mm_and_pd(d.v(), _mm_setabsmask_pd())
1309             );
1310 }//}}}1
1311 // exponent {{{1
1312 template<> inline Vector<float> INTRINSIC Vector<float>::exponent() const
1313 {
1314     VC_ASSERT((*this > 0.f).isFull());
1315     __m128i tmp = _mm_srli_epi32(_mm_castps_si128(d.v()), 23);
1316     tmp = _mm_sub_epi32(tmp, _mm_set1_epi32(0x7f));
1317     return _mm_cvtepi32_ps(tmp);
1318 }
1319 template<> inline Vector<float8> INTRINSIC Vector<float8>::exponent() const
1320 {
1321     VC_ASSERT((*this > 0.f).isFull());
1322     __m128i tmp0 = _mm_srli_epi32(_mm_castps_si128(d.v()[0]), 23);
1323     __m128i tmp1 = _mm_srli_epi32(_mm_castps_si128(d.v()[1]), 23);
1324     tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x7f));
1325     tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x7f));
1326     return M256::create( _mm_cvtepi32_ps(tmp0), _mm_cvtepi32_ps(tmp1));
1327 }
1328 template<> inline Vector<double> INTRINSIC Vector<double>::exponent() const
1329 {
1330     VC_ASSERT((*this > 0.).isFull());
1331     __m128i tmp = _mm_srli_epi64(_mm_castpd_si128(d.v()), 52);
1332     tmp = _mm_sub_epi32(tmp, _mm_set1_epi32(0x3ff));
1333     return _mm_cvtepi32_pd(_mm_shuffle_epi32(tmp, 0x08));
1334 }
1335 // }}}1
1336 // Random {{{1
1337 static inline ALWAYS_INLINE void _doRandomStep(Vector<unsigned int> &state0,
1338         Vector<unsigned int> &state1)
1339 {
1340     state0.load(&Vc::RandomState[0]);
1341     state1.load(&Vc::RandomState[uint_v::Size]);
1342     (state1 * 0xdeece66du + 11).store(&Vc::RandomState[uint_v::Size]);
1343     uint_v(_mm_xor_si128((state0 * 0xdeece66du + 11).data(), _mm_srli_epi32(state1.data(), 16))).store(&Vc::RandomState[0]);
1344 }
1345
1346 template<typename T> inline ALWAYS_INLINE Vector<T> Vector<T>::Random()
1347 {
1348     Vector<unsigned int> state0, state1;
1349     _doRandomStep(state0, state1);
1350     return state0.reinterpretCast<Vector<T> >();
1351 }
1352
1353 template<> inline ALWAYS_INLINE Vector<float> Vector<float>::Random()
1354 {
1355     Vector<unsigned int> state0, state1;
1356     _doRandomStep(state0, state1);
1357     return _mm_sub_ps(_mm_or_ps(_mm_castsi128_ps(_mm_srli_epi32(state0.data(), 2)), HT::one()), HT::one());
1358 }
1359
1360 template<> inline ALWAYS_INLINE Vector<float8> Vector<float8>::Random()
1361 {
1362     Vector<unsigned int> state0, state1;
1363     _doRandomStep(state0, state1);
1364     state1 ^= state0 >> 16;
1365     return M256::create(
1366             _mm_sub_ps(_mm_or_ps(_mm_castsi128_ps(_mm_srli_epi32(state0.data(), 2)), VectorHelper<float>::one()), VectorHelper<float>::one()),
1367             _mm_sub_ps(_mm_or_ps(_mm_castsi128_ps(_mm_srli_epi32(state1.data(), 2)), VectorHelper<float>::one()), VectorHelper<float>::one())
1368             );
1369 }
1370
1371 template<> inline ALWAYS_INLINE Vector<double> Vector<double>::Random()
1372 {
1373     typedef unsigned long long uint64 MAY_ALIAS;
1374     uint64 state0 = *reinterpret_cast<const uint64 *>(&Vc::RandomState[8]);
1375     uint64 state1 = *reinterpret_cast<const uint64 *>(&Vc::RandomState[10]);
1376     const __m128i state = _mm_load_si128(reinterpret_cast<const __m128i *>(&Vc::RandomState[8]));
1377     *reinterpret_cast<uint64 *>(&Vc::RandomState[ 8]) = (state0 * 0x5deece66dull + 11);
1378     *reinterpret_cast<uint64 *>(&Vc::RandomState[10]) = (state1 * 0x5deece66dull + 11);
1379     return (Vector<double>(_mm_castsi128_pd(_mm_srli_epi64(state, 12))) | One()) - One();
1380 }
1381 // }}}1
1382 } // namespace SSE
1383 } // namespace Vc
1384
1385 #include "undomacros.h"
1386
1387 // vim: foldmethod=marker