Vc/include/Vc/sse/vector.tcc

   1 /*  This file is part of the Vc library.
   2
   3     Copyright (C) 2010-2012 Matthias Kretz <kretz@kde.org>
   4
   5     Vc is free software: you can redistribute it and/or modify
   6     it under the terms of the GNU Lesser General Public License as
   7     published by the Free Software Foundation, either version 3 of
   8     the License, or (at your option) any later version.
   9
  10     Vc is distributed in the hope that it will be useful, but
  11     WITHOUT ANY WARRANTY; without even the implied warranty of
  12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13     GNU Lesser General Public License for more details.
  14
  15     You should have received a copy of the GNU Lesser General Public
  16     License along with Vc.  If not, see <http://www.gnu.org/licenses/>.
  17
  18 */
  19
#include <cstring>

#include "limits.h"
#include "../common/bitscanintrinsics.h"
#include "macros.h"
  23
  24 namespace AliRoot {
  25 namespace Vc
  26 {
     // Declaration only (extern): an array of 16 unsigned ints carrying the
     // alignment requested through ALIGN(64). The definition lives elsewhere.
     // NOTE(review): judging by the name this is presumably the shared state of
     // the library's random number generator - confirm at the definition.
  27 ALIGN(64) extern unsigned int RandomState[16];
  28
  29 namespace SSE
  30 {
  31
  32 template<typename T, int Size> static Vc_ALWAYS_INLINE Vc_CONST const T *_IndexesFromZero() {
  33     if (Size == 4) {
  34         return reinterpret_cast<const T *>(_IndexesFromZero4);
  35     } else if (Size == 8) {
  36         return reinterpret_cast<const T *>(_IndexesFromZero8);
  37     } else if (Size == 16) {
  38         return reinterpret_cast<const T *>(_IndexesFromZero16);
  39     }
  40     return 0;
  41 }
  42
  43 ///////////////////////////////////////////////////////////////////////////////////////////
  44 // constants {{{1
  45 template<typename T> Vc_INTRINSIC Vector<T>::Vector(VectorSpecialInitializerZero::ZEnum)
  46     : d(VectorHelper<VectorType>::zero())
  47 {
  48 }
  49
  50 template<typename T> Vc_INTRINSIC Vector<T>::Vector(VectorSpecialInitializerOne::OEnum)
  51     : d(VectorHelper<T>::one())
  52 {
  53 }
  54
  55 template<typename T> Vc_INTRINSIC Vector<T>::Vector(VectorSpecialInitializerIndexesFromZero::IEnum)
  56     : d(VectorHelper<VectorType>::load(_IndexesFromZero<EntryType, Size>(), Aligned))
  57 {
  58 }
  59
  60 template<typename T> Vc_INTRINSIC Vc_CONST Vector<T> Vector<T>::Zero()
  61 {
  62     return VectorHelper<VectorType>::zero();
  63 }
  64
  65 template<typename T> Vc_INTRINSIC Vc_CONST Vector<T> Vector<T>::One()
  66 {
  67     return VectorHelper<T>::one();
  68 }
  69
  70 template<typename T> Vc_INTRINSIC Vc_CONST Vector<T> Vector<T>::IndexesFromZero()
  71 {
  72     return VectorHelper<VectorType>::load(_IndexesFromZero<EntryType, Size>(), Aligned);
  73 }
  74
  75 // conversion/casts {{{1
  76 template<typename T> template<typename OtherT> Vc_INTRINSIC Vector<T>::Vector(const Vector<OtherT> &x)
  77     : d(StaticCastHelper<OtherT, T>::cast(x.data()))
  78 {
  79 }
  80
  81 template<> template<> Vc_INTRINSIC short_v &Vector<short>::operator=(const ushort_v &x) {
  82     data() = StaticCastHelper<unsigned short, short>::cast(x.data()); return *this;
  83 }
  84 template<> template<> Vc_INTRINSIC ushort_v &Vector<unsigned short>::operator=(const short_v &x) {
  85     data() = StaticCastHelper<short, unsigned short>::cast(x.data()); return *this;
  86 }
  87 template<> template<> Vc_INTRINSIC int_v &Vector<int>::operator=(const uint_v &x) {
  88     data() = StaticCastHelper<unsigned int, int>::cast(x.data()); return *this;
  89 }
  90 template<> template<> Vc_INTRINSIC uint_v &Vector<unsigned int>::operator=(const int_v &x) {
  91     data() = StaticCastHelper<int, unsigned int>::cast(x.data()); return *this;
  92 }
  93
  94 // broadcasts {{{1
  95 template<typename T> Vc_INTRINSIC Vector<T>::Vector(EntryType a)
  96     : d(VectorHelper<T>::set(a))
  97 {
  98 }
  99
 100 ///////////////////////////////////////////////////////////////////////////////////////////
 101 // load ctors {{{1
 102 template<typename T> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *x) { load(x); }
 103 template<typename T> template<typename A> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *x, A a) { load(x, a); }
 104 template<typename T> template<typename OtherT> Vc_ALWAYS_INLINE Vector<T>::Vector(const OtherT *x) { load(x); }
 105 template<typename T> template<typename OtherT, typename A> Vc_ALWAYS_INLINE Vector<T>::Vector(const OtherT *x, A a) { load(x, a); }
 106
 107 ///////////////////////////////////////////////////////////////////////////////////////////
 108 // load member functions {{{1
 109 template<typename T> Vc_INTRINSIC void Vector<T>::load(const EntryType *mem)
 110 {
 111     load(mem, Aligned);
 112 }
 113
 114 template<typename T> template<typename A> Vc_INTRINSIC void Vector<T>::load(const EntryType *mem, A align)
 115 {
 116     d.v() = VectorHelper<VectorType>::load(mem, align);
 117 }
 118
 119 template<typename T> template<typename OtherT> Vc_INTRINSIC void Vector<T>::load(const OtherT *mem)
 120 {
 121     load(mem, Aligned);
 122 }
 123
 124 // float8: simply use the float implementation twice {{{2
 125 template<> template<typename OtherT, typename A> Vc_INTRINSIC void Vector<float8>::load(const OtherT *x, A a)
 126 {
 127     d.v() = M256::create(
 128             Vector<float>(&x[0], a).data(),
 129             Vector<float>(&x[4], a).data()
 130             );
 131 }
 132
 133 // LoadHelper {{{2
 134 template<typename DstT, typename SrcT, typename Flags> struct LoadHelper;
 135
 136 // float {{{2
 137 template<typename Flags> struct LoadHelper<float, double, Flags> {
 138     static Vc_ALWAYS_INLINE Vc_PURE __m128 load(const double *mem, Flags f)
 139     {
 140         return _mm_movelh_ps(_mm_cvtpd_ps(VectorHelper<__m128d>::load(&mem[0], f)),
 141                              _mm_cvtpd_ps(VectorHelper<__m128d>::load(&mem[2], f)));
 142     }
 143 };
 144 template<typename Flags> struct LoadHelper<float, unsigned int, Flags> {
 145     static Vc_ALWAYS_INLINE Vc_PURE __m128 load(const unsigned int *mem, Flags f)
 146     {
 147         return StaticCastHelper<unsigned int, float>::cast(VectorHelper<__m128i>::load(mem, f));
 148     }
 149 };
 150 template<typename Flags> struct LoadHelper<float, int, Flags> {
 151     static Vc_ALWAYS_INLINE Vc_PURE __m128 load(const int *mem, Flags f)
 152     {
 153         return StaticCastHelper<int, float>::cast(VectorHelper<__m128i>::load(mem, f));
 154     }
 155 };
 156 template<typename Flags> struct LoadHelper<float, unsigned short, Flags> {
 157     static Vc_ALWAYS_INLINE Vc_PURE __m128 load(const unsigned short *mem, Flags f)
 158     {
 159         return _mm_cvtepi32_ps(LoadHelper<int, unsigned short, Flags>::load(mem, f));
 160     }
 161 };
 162 template<typename Flags> struct LoadHelper<float, short, Flags> {
 163     static Vc_ALWAYS_INLINE Vc_PURE __m128 load(const short *mem, Flags f)
 164     {
 165         return _mm_cvtepi32_ps(LoadHelper<int, short, Flags>::load(mem, f));
 166     }
 167 };
 168 template<typename Flags> struct LoadHelper<float, unsigned char, Flags> {
 169     static Vc_ALWAYS_INLINE Vc_PURE __m128 load(const unsigned char *mem, Flags f)
 170     {
 171         return _mm_cvtepi32_ps(LoadHelper<int, unsigned char, Flags>::load(mem, f));
 172     }
 173 };
 174 template<typename Flags> struct LoadHelper<float, signed char, Flags> {
 175     static Vc_ALWAYS_INLINE Vc_PURE __m128 load(const signed char *mem, Flags f)
 176     {
 177         return _mm_cvtepi32_ps(LoadHelper<int, signed char, Flags>::load(mem, f));
 178     }
 179 };
 180
 181 // int {{{2
 182 template<typename Flags> struct LoadHelper<int, unsigned int, Flags> {
 183     static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned int *mem, Flags f)
 184     {
 185         return VectorHelper<__m128i>::load(mem, f);
 186     }
 187 };
 188 // no difference between streaming and alignment, because the
 189 // 32/64 bit loads are not available as streaming loads, and can always be unaligned
 190 template<typename Flags> struct LoadHelper<int, unsigned short, Flags> {
 191     static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned short *mem, Flags)
 192     {
 193         return _mm_cvtepu16_epi32( _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
 194     }
 195 };
 196 template<typename Flags> struct LoadHelper<int, short, Flags> {
 197     static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const short *mem, Flags)
 198     {
 199         return _mm_cvtepi16_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
 200     }
 201 };
 202 template<typename Flags> struct LoadHelper<int, unsigned char, Flags> {
 203     static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned char *mem, Flags)
 204     {
 205         return _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*reinterpret_cast<const int *>(mem)));
 206     }
 207 };
 208 template<typename Flags> struct LoadHelper<int, signed char, Flags> {
 209     static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const signed char *mem, Flags)
 210     {
 211         return _mm_cvtepi8_epi32(_mm_cvtsi32_si128(*reinterpret_cast<const int *>(mem)));
 212     }
 213 };
 214
 215 // unsigned int {{{2
 216 template<typename Flags> struct LoadHelper<unsigned int, unsigned short, Flags> {
 217     static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned short *mem, Flags)
 218     {
 219         return _mm_cvtepu16_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
 220     }
 221 };
 222 template<typename Flags> struct LoadHelper<unsigned int, unsigned char, Flags> {
 223     static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned char *mem, Flags)
 224     {
 225         return _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*reinterpret_cast<const int *>(mem)));
 226     }
 227 };
 228
 229 // short {{{2
 230 template<typename Flags> struct LoadHelper<short, unsigned short, Flags> {
 231     static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned short *mem, Flags f)
 232     {
 233         return VectorHelper<__m128i>::load(mem, f);
 234     }
 235 };
 236 template<typename Flags> struct LoadHelper<short, unsigned char, Flags> {
 237     static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned char *mem, Flags)
 238     {
 239         return _mm_cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
 240     }
 241 };
 242 template<typename Flags> struct LoadHelper<short, signed char, Flags> {
 243     static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const signed char *mem, Flags)
 244     {
 245         return _mm_cvtepi8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
 246     }
 247 };
 248
 249 // unsigned short {{{2
 250 template<typename Flags> struct LoadHelper<unsigned short, unsigned char, Flags> {
 251     static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned char *mem, Flags)
 252     {
 253         return _mm_cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
 254     }
 255 };
 256
 257 // general load, implemented via LoadHelper {{{2
 258 template<typename DstT> template<typename SrcT, typename Flags> Vc_INTRINSIC void Vector<DstT>::load(const SrcT *x, Flags f)
 259 {
 260     d.v() = LoadHelper<DstT, SrcT, Flags>::load(x, f);
 261 }
 262
 263 ///////////////////////////////////////////////////////////////////////////////////////////
 264 // expand/combine {{{1
 265 template<typename T> Vc_INTRINSIC Vector<T>::Vector(const Vector<typename CtorTypeHelper<T>::Type> *a)
 266     : d(VectorHelper<T>::concat(a[0].data(), a[1].data()))
 267 {
 268 }
 269
 270 template<typename T> inline void Vector<T>::expand(Vector<typename ExpandTypeHelper<T>::Type> *x) const
 271 {
 272     if (Size == 8u) {
 273         x[0].data() = VectorHelper<T>::expand0(data());
 274         x[1].data() = VectorHelper<T>::expand1(data());
 275     }
 276 }
 277
 278 ///////////////////////////////////////////////////////////////////////////////////////////
 279 // zeroing {{{1
 280 template<typename T> Vc_INTRINSIC void Vector<T>::setZero()
 281 {
 282     data() = VectorHelper<VectorType>::zero();
 283 }
 284
 285 template<typename T> Vc_INTRINSIC void Vector<T>::setZero(const Mask &k)
 286 {
 287     data() = VectorHelper<VectorType>::andnot_(mm128_reinterpret_cast<VectorType>(k.data()), data());
 288 }
 289
 290 template<> Vc_INTRINSIC void Vector<double>::setQnan()
 291 {
 292     data() = _mm_setallone_pd();
 293 }
 294 template<> Vc_INTRINSIC void Vector<double>::setQnan(Mask::Argument k)
 295 {
 296     data() = _mm_or_pd(data(), k.dataD());
 297 }
 298 template<> Vc_INTRINSIC void Vector<float>::setQnan()
 299 {
 300     data() = _mm_setallone_ps();
 301 }
 302 template<> Vc_INTRINSIC void Vector<float>::setQnan(Mask::Argument k)
 303 {
 304     data() = _mm_or_ps(data(), k.data());
 305 }
 306 template<> Vc_INTRINSIC void Vector<float8>::setQnan()
 307 {
 308     d.v()[0] = _mm_setallone_ps();
 309     d.v()[1] = _mm_setallone_ps();
 310 }
 311 template<> Vc_INTRINSIC void Vector<float8>::setQnan(Mask::Argument k)
 312 {
 313     d.v()[0] = _mm_or_ps(d.v()[0], k.data()[0]);
 314     d.v()[1] = _mm_or_ps(d.v()[1], k.data()[1]);
 315 }
 316
 317 ///////////////////////////////////////////////////////////////////////////////////////////
 318 // stores {{{1
 319 template<typename T> Vc_INTRINSIC void Vector<T>::store(EntryType *mem) const
 320 {
 321     VectorHelper<VectorType>::store(mem, data(), Aligned);
 322 }
 323
 324 template<typename T> Vc_INTRINSIC void Vector<T>::store(EntryType *mem, const Mask &mask) const
 325 {
 326     VectorHelper<VectorType>::store(mem, data(), mm128_reinterpret_cast<VectorType>(mask.data()), Aligned);
 327 }
 328
 329 template<typename T> template<typename A> Vc_INTRINSIC void Vector<T>::store(EntryType *mem, A align) const
 330 {
 331     VectorHelper<VectorType>::store(mem, data(), align);
 332 }
 333
 334 template<typename T> template<typename A> Vc_INTRINSIC void Vector<T>::store(EntryType *mem, const Mask &mask, A align) const
 335 {
 336     HV::store(mem, data(), mm128_reinterpret_cast<VectorType>(mask.data()), align);
 337 }
 338
 339 ///////////////////////////////////////////////////////////////////////////////////////////
 340 // division {{{1
 341 template<typename T> Vc_INTRINSIC Vector<T> &WriteMaskedVector<T>::operator/=(const Vector<T> &x)
 342 {
 343     return operator=(*vec / x);
 344 }
 345 template<> Vc_INTRINSIC int_v &WriteMaskedVector<int>::operator/=(const int_v &x)
 346 {
 347     Vc_foreach_bit (int i, mask) {
 348         vec->d.m(i) /= x.d.m(i);
 349     }
 350     return *vec;
 351 }
 352 template<> Vc_INTRINSIC uint_v &WriteMaskedVector<unsigned int>::operator/=(const uint_v &x)
 353 {
 354     Vc_foreach_bit (int i, mask) {
 355         vec->d.m(i) /= x.d.m(i);
 356     }
 357     return *vec;
 358 }
 359 template<> Vc_INTRINSIC short_v &WriteMaskedVector<short>::operator/=(const short_v &x)
 360 {
 361     Vc_foreach_bit (int i, mask) {
 362         vec->d.m(i) /= x.d.m(i);
 363     }
 364     return *vec;
 365 }
 366 template<> Vc_INTRINSIC ushort_v &WriteMaskedVector<unsigned short>::operator/=(const ushort_v &x)
 367 {
 368     Vc_foreach_bit (int i, mask) {
 369         vec->d.m(i) /= x.d.m(i);
 370     }
 371     return *vec;
 372 }
 373
 374 template<typename T> inline Vector<T> &Vector<T>::operator/=(EntryType x)
 375 {
 376     if (VectorTraits<T>::HasVectorDivision) {
 377         return operator/=(Vector<T>(x));
 378     }
 379     for_all_vector_entries(i,
 380             d.m(i) /= x;
 381             );
 382     return *this;
 383 }
 384
 385 template<typename T> template<typename TT> Vc_INTRINSIC Vc_PURE VC_EXACT_TYPE(TT, typename DetermineEntryType<T>::Type, Vector<T>) Vector<T>::operator/(TT x) const
 386 {
 387     if (VectorTraits<T>::HasVectorDivision) {
 388         return operator/(Vector<T>(x));
 389     }
 390     Vector<T> r;
 391     for_all_vector_entries(i,
 392             r.d.m(i) = d.m(i) / x;
 393             );
 394     return r;
 395 }
 396
 397 template<typename T> inline Vector<T> &Vector<T>::operator/=(const Vector<T> &x)
 398 {
 399     for_all_vector_entries(i,
 400             d.m(i) /= x.d.m(i);
 401             );
 402     return *this;
 403 }
 404
 405 template<typename T> inline Vc_PURE Vector<T> Vector<T>::operator/(const Vector<T> &x) const
 406 {
 407     Vector<T> r;
 408     for_all_vector_entries(i,
 409             r.d.m(i) = d.m(i) / x.d.m(i);
 410             );
 411     return r;
 412 }
 413
 414 template<> inline Vector<short> &Vector<short>::operator/=(const Vector<short> &x)
 415 {
 416     __m128 lo = _mm_cvtepi32_ps(VectorHelper<short>::expand0(d.v()));
 417     __m128 hi = _mm_cvtepi32_ps(VectorHelper<short>::expand1(d.v()));
 418     lo = _mm_div_ps(lo, _mm_cvtepi32_ps(VectorHelper<short>::expand0(x.d.v())));
 419     hi = _mm_div_ps(hi, _mm_cvtepi32_ps(VectorHelper<short>::expand1(x.d.v())));
 420     d.v() = _mm_packs_epi32(_mm_cvtps_epi32(lo), _mm_cvtps_epi32(hi));
 421     return *this;
 422 }
 423
 424 template<> inline Vc_PURE Vector<short> Vector<short>::operator/(const Vector<short> &x) const
 425 {
 426     __m128 lo = _mm_cvtepi32_ps(VectorHelper<short>::expand0(d.v()));
 427     __m128 hi = _mm_cvtepi32_ps(VectorHelper<short>::expand1(d.v()));
 428     lo = _mm_div_ps(lo, _mm_cvtepi32_ps(VectorHelper<short>::expand0(x.d.v())));
 429     hi = _mm_div_ps(hi, _mm_cvtepi32_ps(VectorHelper<short>::expand1(x.d.v())));
 430     return _mm_packs_epi32(_mm_cvtps_epi32(lo), _mm_cvtps_epi32(hi));
 431 }
 432
 433 template<> inline Vector<unsigned short> &Vector<unsigned short>::operator/=(const Vector<unsigned short> &x)
 434 {
 435     __m128 lo = _mm_cvtepi32_ps(VectorHelper<short>::expand0(d.v()));
 436     __m128 hi = _mm_cvtepi32_ps(VectorHelper<short>::expand1(d.v()));
 437     lo = _mm_div_ps(lo, _mm_cvtepi32_ps(VectorHelper<short>::expand0(x.d.v())));
 438     hi = _mm_div_ps(hi, _mm_cvtepi32_ps(VectorHelper<short>::expand1(x.d.v())));
 439     d.v() = _mm_packs_epi32(_mm_cvtps_epi32(lo), _mm_cvtps_epi32(hi));
 440     return *this;
 441 }
 442
 443 template<> Vc_ALWAYS_INLINE Vc_PURE Vector<unsigned short> Vector<unsigned short>::operator/(const Vector<unsigned short> &x) const
 444 {
 445     __m128 lo = _mm_cvtepi32_ps(VectorHelper<short>::expand0(d.v()));
 446     __m128 hi = _mm_cvtepi32_ps(VectorHelper<short>::expand1(d.v()));
 447     lo = _mm_div_ps(lo, _mm_cvtepi32_ps(VectorHelper<short>::expand0(x.d.v())));
 448     hi = _mm_div_ps(hi, _mm_cvtepi32_ps(VectorHelper<short>::expand1(x.d.v())));
 449     return _mm_packs_epi32(_mm_cvtps_epi32(lo), _mm_cvtps_epi32(hi));
 450 }
 451
 452 template<> Vc_ALWAYS_INLINE Vector<float> &Vector<float>::operator/=(const Vector<float> &x)
 453 {
 454     d.v() = _mm_div_ps(d.v(), x.d.v());
 455     return *this;
 456 }
 457
 458 template<> Vc_ALWAYS_INLINE Vc_PURE Vector<float> Vector<float>::operator/(const Vector<float> &x) const
 459 {
 460     return _mm_div_ps(d.v(), x.d.v());
 461 }
 462
 463 template<> Vc_ALWAYS_INLINE Vector<float8> &Vector<float8>::operator/=(const Vector<float8> &x)
 464 {
 465     d.v()[0] = _mm_div_ps(d.v()[0], x.d.v()[0]);
 466     d.v()[1] = _mm_div_ps(d.v()[1], x.d.v()[1]);
 467     return *this;
 468 }
 469
 470 template<> Vc_ALWAYS_INLINE Vc_PURE Vector<float8> Vector<float8>::operator/(const Vector<float8> &x) const
 471 {
 472     Vector<float8> r;
 473     r.d.v()[0] = _mm_div_ps(d.v()[0], x.d.v()[0]);
 474     r.d.v()[1] = _mm_div_ps(d.v()[1], x.d.v()[1]);
 475     return r;
 476 }
 477
 478 template<> Vc_ALWAYS_INLINE Vector<double> &Vector<double>::operator/=(const Vector<double> &x)
 479 {
 480     d.v() = _mm_div_pd(d.v(), x.d.v());
 481     return *this;
 482 }
 483
 484 template<> Vc_ALWAYS_INLINE Vc_PURE Vector<double> Vector<double>::operator/(const Vector<double> &x) const
 485 {
 486     return _mm_div_pd(d.v(), x.d.v());
 487 }
 488
 489 ///////////////////////////////////////////////////////////////////////////////////////////
 490 // operator- {{{1
 491 template<> Vc_ALWAYS_INLINE Vector<double> Vc_PURE Vc_FLATTEN Vector<double>::operator-() const
 492 {
 493     return _mm_xor_pd(d.v(), _mm_setsignmask_pd());
 494 }
 495 template<> Vc_ALWAYS_INLINE Vector<float> Vc_PURE Vc_FLATTEN Vector<float>::operator-() const
 496 {
 497     return _mm_xor_ps(d.v(), _mm_setsignmask_ps());
 498 }
 499 template<> Vc_ALWAYS_INLINE Vector<float8> Vc_PURE Vc_FLATTEN Vector<float8>::operator-() const
 500 {
 501     return M256::create(
 502             _mm_xor_ps(d.v()[0], _mm_setsignmask_ps()),
 503             _mm_xor_ps(d.v()[1], _mm_setsignmask_ps()));
 504 }
 505 template<> Vc_ALWAYS_INLINE Vector<int> Vc_PURE Vc_FLATTEN Vector<int>::operator-() const
 506 {
 507 #ifdef VC_IMPL_SSSE3
 508     return _mm_sign_epi32(d.v(), _mm_setallone_si128());
 509 #else
 510     return _mm_add_epi32(_mm_xor_si128(d.v(), _mm_setallone_si128()), _mm_setone_epi32());
 511 #endif
 512 }
 513 template<> Vc_ALWAYS_INLINE Vector<int> Vc_PURE Vc_FLATTEN Vector<unsigned int>::operator-() const
 514 {
 515 #ifdef VC_IMPL_SSSE3
 516     return _mm_sign_epi32(d.v(), _mm_setallone_si128());
 517 #else
 518     return _mm_add_epi32(_mm_xor_si128(d.v(), _mm_setallone_si128()), _mm_setone_epi32());
 519 #endif
 520 }
 521 template<> Vc_ALWAYS_INLINE Vector<short> Vc_PURE Vc_FLATTEN Vector<short>::operator-() const
 522 {
 523 #ifdef VC_IMPL_SSSE3
 524     return _mm_sign_epi16(d.v(), _mm_setallone_si128());
 525 #else
 526     return _mm_mullo_epi16(d.v(), _mm_setallone_si128());
 527 #endif
 528 }
 529 template<> Vc_ALWAYS_INLINE Vector<short> Vc_PURE Vc_FLATTEN Vector<unsigned short>::operator-() const
 530 {
 531 #ifdef VC_IMPL_SSSE3
 532     return _mm_sign_epi16(d.v(), _mm_setallone_si128());
 533 #else
 534     return _mm_mullo_epi16(d.v(), _mm_setallone_si128());
 535 #endif
 536 }
 537
 538 ///////////////////////////////////////////////////////////////////////////////////////////
 539 // integer ops {{{1
 540 #define OP_IMPL(T, symbol, fun) \
 541 template<> Vc_ALWAYS_INLINE Vector<T> &Vector<T>::operator symbol##=(const Vector<T> &x) \
 542 { \
 543     d.v() = VectorHelper<T>::fun(d.v(), x.d.v()); \
 544     return *this; \
 545 } \
 546 template<> Vc_ALWAYS_INLINE Vc_PURE Vector<T>  Vector<T>::operator symbol(const Vector<T> &x) const \
 547 { \
 548     return VectorHelper<T>::fun(d.v(), x.d.v()); \
 549 }
 550 OP_IMPL(int, &, and_)
 551 OP_IMPL(int, |, or_)
 552 OP_IMPL(int, ^, xor_)
 553 OP_IMPL(unsigned int, &, and_)
 554 OP_IMPL(unsigned int, |, or_)
 555 OP_IMPL(unsigned int, ^, xor_)
 556 OP_IMPL(short, &, and_)
 557 OP_IMPL(short, |, or_)
 558 OP_IMPL(short, ^, xor_)
 559 OP_IMPL(unsigned short, &, and_)
 560 OP_IMPL(unsigned short, |, or_)
 561 OP_IMPL(unsigned short, ^, xor_)
 562 OP_IMPL(float, &, and_)
 563 OP_IMPL(float, |, or_)
 564 OP_IMPL(float, ^, xor_)
 565 OP_IMPL(float8, &, and_)
 566 OP_IMPL(float8, |, or_)
 567 OP_IMPL(float8, ^, xor_)
 568 OP_IMPL(double, &, and_)
 569 OP_IMPL(double, |, or_)
 570 OP_IMPL(double, ^, xor_)
 571 #undef OP_IMPL
 572
 573 #ifdef VC_IMPL_XOP
 574 static Vc_INTRINSIC Vc_CONST __m128i shiftLeft (const    int_v &value, const    int_v &count) { return _mm_sha_epi32(value.data(), count.data()); }
 575 static Vc_INTRINSIC Vc_CONST __m128i shiftLeft (const   uint_v &value, const   uint_v &count) { return _mm_shl_epi32(value.data(), count.data()); }
 576 static Vc_INTRINSIC Vc_CONST __m128i shiftLeft (const  short_v &value, const  short_v &count) { return _mm_sha_epi16(value.data(), count.data()); }
 577 static Vc_INTRINSIC Vc_CONST __m128i shiftLeft (const ushort_v &value, const ushort_v &count) { return _mm_shl_epi16(value.data(), count.data()); }
 578 static Vc_INTRINSIC Vc_CONST __m128i shiftRight(const    int_v &value, const    int_v &count) { return shiftLeft(value,          -count ); }
 579 static Vc_INTRINSIC Vc_CONST __m128i shiftRight(const   uint_v &value, const   uint_v &count) { return shiftLeft(value,   uint_v(-count)); }
 580 static Vc_INTRINSIC Vc_CONST __m128i shiftRight(const  short_v &value, const  short_v &count) { return shiftLeft(value,          -count ); }
 581 static Vc_INTRINSIC Vc_CONST __m128i shiftRight(const ushort_v &value, const ushort_v &count) { return shiftLeft(value, ushort_v(-count)); }
 582
 583 #define _VC_OP(T, symbol, impl) \
 584 template<> Vc_INTRINSIC T &T::operator symbol##=(T::AsArg shift) \
 585 { \
 586     d.v() = impl(*this, shift); \
 587     return *this; \
 588 } \
 589 template<> Vc_INTRINSIC Vc_PURE T  T::operator symbol   (T::AsArg shift) const \
 590 { \
 591     return impl(*this, shift); \
 592 }
 593 VC_APPLY_2(VC_LIST_INT_VECTOR_TYPES, _VC_OP, <<, shiftLeft)
 594 VC_APPLY_2(VC_LIST_INT_VECTOR_TYPES, _VC_OP, >>, shiftRight)
 595 #undef _VC_OP
 596 #else
 597 #if defined(VC_GCC) && VC_GCC == 0x40600 && defined(VC_IMPL_XOP)
 598 #define VC_WORKAROUND __attribute__((optimize("no-tree-vectorize"),weak))
 599 #else
 600 #define VC_WORKAROUND Vc_INTRINSIC
 601 #endif
 602
 603 #define OP_IMPL(T, symbol) \
 604 template<> VC_WORKAROUND Vector<T> &Vector<T>::operator symbol##=(Vector<T>::AsArg x) \
 605 { \
 606     for_all_vector_entries(i, \
 607             d.m(i) symbol##= x.d.m(i); \
 608             ); \
 609     return *this; \
 610 } \
 611 template<> inline Vc_PURE Vector<T>  Vector<T>::operator symbol(Vector<T>::AsArg x) const \
 612 { \
 613     Vector<T> r; \
 614     for_all_vector_entries(i, \
 615             r.d.m(i) = d.m(i) symbol x.d.m(i); \
 616             ); \
 617     return r; \
 618 }
 619 OP_IMPL(int, <<)
 620 OP_IMPL(int, >>)
 621 OP_IMPL(unsigned int, <<)
 622 OP_IMPL(unsigned int, >>)
 623 OP_IMPL(short, <<)
 624 OP_IMPL(short, >>)
 625 OP_IMPL(unsigned short, <<)
 626 OP_IMPL(unsigned short, >>)
 627 #undef OP_IMPL
 628 #undef VC_WORKAROUND
 629 #endif
 630
 631 template<typename T> Vc_ALWAYS_INLINE Vector<T> &Vector<T>::operator>>=(int shift) {
 632     d.v() = VectorHelper<T>::shiftRight(d.v(), shift);
 633     return *this;
 634 }
 635 template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T> Vector<T>::operator>>(int shift) const {
 636     return VectorHelper<T>::shiftRight(d.v(), shift);
 637 }
 638 template<typename T> Vc_ALWAYS_INLINE Vector<T> &Vector<T>::operator<<=(int shift) {
 639     d.v() = VectorHelper<T>::shiftLeft(d.v(), shift);
 640     return *this;
 641 }
 642 template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T> Vector<T>::operator<<(int shift) const {
 643     return VectorHelper<T>::shiftLeft(d.v(), shift);
 644 }
 645
 646 ///////////////////////////////////////////////////////////////////////////////////////////
 647 // swizzles {{{1
 648 template<typename T> Vc_INTRINSIC Vc_PURE const Vector<T> &Vector<T>::abcd() const { return *this; }
 649 template<typename T> Vc_INTRINSIC Vc_PURE const Vector<T>  Vector<T>::cdab() const { return Mem::permute<X2, X3, X0, X1>(data()); }
 650 template<typename T> Vc_INTRINSIC Vc_PURE const Vector<T>  Vector<T>::badc() const { return Mem::permute<X1, X0, X3, X2>(data()); }
 651 template<typename T> Vc_INTRINSIC Vc_PURE const Vector<T>  Vector<T>::aaaa() const { return Mem::permute<X0, X0, X0, X0>(data()); }
 652 template<typename T> Vc_INTRINSIC Vc_PURE const Vector<T>  Vector<T>::bbbb() const { return Mem::permute<X1, X1, X1, X1>(data()); }
 653 template<typename T> Vc_INTRINSIC Vc_PURE const Vector<T>  Vector<T>::cccc() const { return Mem::permute<X2, X2, X2, X2>(data()); }
 654 template<typename T> Vc_INTRINSIC Vc_PURE const Vector<T>  Vector<T>::dddd() const { return Mem::permute<X3, X3, X3, X3>(data()); }
 655 template<typename T> Vc_INTRINSIC Vc_PURE const Vector<T>  Vector<T>::bcad() const { return Mem::permute<X1, X2, X0, X3>(data()); }
 656 template<typename T> Vc_INTRINSIC Vc_PURE const Vector<T>  Vector<T>::bcda() const { return Mem::permute<X1, X2, X3, X0>(data()); }
 657 template<typename T> Vc_INTRINSIC Vc_PURE const Vector<T>  Vector<T>::dabc() const { return Mem::permute<X3, X0, X1, X2>(data()); }
 658 template<typename T> Vc_INTRINSIC Vc_PURE const Vector<T>  Vector<T>::acbd() const { return Mem::permute<X0, X2, X1, X3>(data()); }
 659 template<typename T> Vc_INTRINSIC Vc_PURE const Vector<T>  Vector<T>::dbca() const { return Mem::permute<X3, X1, X2, X0>(data()); }
 660 template<typename T> Vc_INTRINSIC Vc_PURE const Vector<T>  Vector<T>::dcba() const { return Mem::permute<X3, X2, X1, X0>(data()); }
 661
 662 template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector<sfloat>::cdab() const { return M256::create(Mem::permute<X2, X3, X0, X1>(d.v()[0]), Mem::permute<X2, X3, X0, X1>(d.v()[1])); }
 663 template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector<sfloat>::badc() const { return M256::create(Mem::permute<X1, X0, X3, X2>(d.v()[0]), Mem::permute<X1, X0, X3, X2>(d.v()[1])); }
 664 template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector<sfloat>::aaaa() const { return M256::create(Mem::permute<X0, X0, X0, X0>(d.v()[0]), Mem::permute<X0, X0, X0, X0>(d.v()[1])); }
 665 template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector<sfloat>::bbbb() const { return M256::create(Mem::permute<X1, X1, X1, X1>(d.v()[0]), Mem::permute<X1, X1, X1, X1>(d.v()[1])); }
 666 template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector<sfloat>::cccc() const { return M256::create(Mem::permute<X2, X2, X2, X2>(d.v()[0]), Mem::permute<X2, X2, X2, X2>(d.v()[1])); }
 667 template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector<sfloat>::dddd() const { return M256::create(Mem::permute<X3, X3, X3, X3>(d.v()[0]), Mem::permute<X3, X3, X3, X3>(d.v()[1])); }
 668 template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector<sfloat>::bcad() const { return M256::create(Mem::permute<X1, X2, X0, X3>(d.v()[0]), Mem::permute<X1, X2, X0, X3>(d.v()[1])); }
 669 template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector<sfloat>::bcda() const { return M256::create(Mem::permute<X1, X2, X3, X0>(d.v()[0]), Mem::permute<X1, X2, X3, X0>(d.v()[1])); }
 670 template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector<sfloat>::dabc() const { return M256::create(Mem::permute<X3, X0, X1, X2>(d.v()[0]), Mem::permute<X3, X0, X1, X2>(d.v()[1])); }
 671 template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector<sfloat>::acbd() const { return M256::create(Mem::permute<X0, X2, X1, X3>(d.v()[0]), Mem::permute<X0, X2, X1, X3>(d.v()[1])); }
 672 template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector<sfloat>::dbca() const { return M256::create(Mem::permute<X3, X1, X2, X0>(d.v()[0]), Mem::permute<X3, X1, X2, X0>(d.v()[1])); }
 673 template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector<sfloat>::dcba() const { return M256::create(Mem::permute<X3, X2, X1, X0>(d.v()[0]), Mem::permute<X3, X2, X1, X0>(d.v()[1])); }
 674
 675 #define VC_SWIZZLES_16BIT_IMPL(T) \
 676 template<> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::cdab() const { return Mem::permute<X2, X3, X0, X1, X6, X7, X4, X5>(data()); } \
 677 template<> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::badc() const { return Mem::permute<X1, X0, X3, X2, X5, X4, X7, X6>(data()); } \
 678 template<> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::aaaa() const { return Mem::permute<X0, X0, X0, X0, X4, X4, X4, X4>(data()); } \
 679 template<> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::bbbb() const { return Mem::permute<X1, X1, X1, X1, X5, X5, X5, X5>(data()); } \
 680 template<> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::cccc() const { return Mem::permute<X2, X2, X2, X2, X6, X6, X6, X6>(data()); } \
 681 template<> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::dddd() const { return Mem::permute<X3, X3, X3, X3, X7, X7, X7, X7>(data()); } \
 682 template<> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::bcad() const { return Mem::permute<X1, X2, X0, X3, X5, X6, X4, X7>(data()); } \
 683 template<> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::bcda() const { return Mem::permute<X1, X2, X3, X0, X5, X6, X7, X4>(data()); } \
 684 template<> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::dabc() const { return Mem::permute<X3, X0, X1, X2, X7, X4, X5, X6>(data()); } \
 685 template<> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::acbd() const { return Mem::permute<X0, X2, X1, X3, X4, X6, X5, X7>(data()); } \
 686 template<> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::dbca() const { return Mem::permute<X3, X1, X2, X0, X7, X5, X6, X4>(data()); } \
 687 template<> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::dcba() const { return Mem::permute<X3, X2, X1, X0, X7, X6, X5, X4>(data()); }
 688 VC_SWIZZLES_16BIT_IMPL(short)
 689 VC_SWIZZLES_16BIT_IMPL(unsigned short)
 690 #undef VC_SWIZZLES_16BIT_IMPL
 691
 692 // operators {{{1
 693 #include "../common/operators.h"
 694 // isNegative {{{1
 695 template<> Vc_INTRINSIC Vc_PURE float_m float_v::isNegative() const
 696 {
 697     return sse_cast<__m128>(_mm_srai_epi32(sse_cast<__m128i>(_mm_and_ps(_mm_setsignmask_ps(), d.v())), 31));
 698 }
 699 template<> Vc_INTRINSIC Vc_PURE sfloat_m sfloat_v::isNegative() const
 700 {
 701     return M256::create(
 702             sse_cast<__m128>(_mm_srai_epi32(sse_cast<__m128i>(_mm_and_ps(_mm_setsignmask_ps(), d.v()[0])), 31)),
 703             sse_cast<__m128>(_mm_srai_epi32(sse_cast<__m128i>(_mm_and_ps(_mm_setsignmask_ps(), d.v()[1])), 31))
 704             );
 705 }
 706 template<> Vc_INTRINSIC Vc_PURE double_m double_v::isNegative() const
 707 {
 708     return Mem::permute<X1, X1, X3, X3>(sse_cast<__m128>(
 709                 _mm_srai_epi32(sse_cast<__m128i>(_mm_and_pd(_mm_setsignmask_pd(), d.v())), 31)
 710                 ));
 711 }
 712 // gathers {{{1
 713 template<typename T> template<typename IndexT> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *mem, const IndexT *indexes)
 714 {
 715     gather(mem, indexes);
 716 }
 717 template<typename T> template<typename IndexT> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector<IndexT>) indexes)
 718 {
 719     gather(mem, indexes);
 720 }
 721
 722 template<typename T> template<typename IndexT> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *mem, const IndexT *indexes, MaskArg mask)
 723     : d(HT::zero())
 724 {
 725     gather(mem, indexes, mask);
 726 }
 727
 728 template<typename T> template<typename IndexT> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector<IndexT>) indexes, MaskArg mask)
 729     : d(HT::zero())
 730 {
 731     gather(mem, indexes, mask);
 732 }
 733
 734 template<typename T> template<typename S1, typename IT> Vc_ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
 735 {
 736     gather(array, member1, indexes);
 737 }
 738 template<typename T> template<typename S1, typename IT> Vc_ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask)
 739     : d(HT::zero())
 740 {
 741     gather(array, member1, indexes, mask);
 742 }
 743 template<typename T> template<typename S1, typename S2, typename IT> Vc_ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
 744 {
 745     gather(array, member1, member2, indexes);
 746 }
 747 template<typename T> template<typename S1, typename S2, typename IT> Vc_ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask)
 748     : d(HT::zero())
 749 {
 750     gather(array, member1, member2, indexes, mask);
 751 }
 752 template<typename T> template<typename S1, typename IT1, typename IT2> Vc_ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
 753 {
 754     gather(array, ptrMember1, outerIndexes, innerIndexes);
 755 }
 756 template<typename T> template<typename S1, typename IT1, typename IT2> Vc_ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask)
 757     : d(HT::zero())
 758 {
 759     gather(array, ptrMember1, outerIndexes, innerIndexes, mask);
 760 }
 761
 762 template<typename T, size_t Size> struct IndexSizeChecker { static void check() {} };
 763 template<typename T, size_t Size> struct IndexSizeChecker<Vector<T>, Size>
 764 {
 765     static void check() {
 766         VC_STATIC_ASSERT(Vector<T>::Size >= Size, IndexVector_must_have_greater_or_equal_number_of_entries);
 767     }
 768 };
 769 template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<double>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
 770 {
 771     IndexSizeChecker<Index, Size>::check();
 772     d.v() = _mm_setr_pd(mem[indexes[0]], mem[indexes[1]]);
 773 }
 774 template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<float>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
 775 {
 776     IndexSizeChecker<Index, Size>::check();
 777     d.v() = _mm_setr_ps(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]);
 778 }
 779 template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<float8>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
 780 {
 781     IndexSizeChecker<Index, Size>::check();
 782     d.v()[0] = _mm_setr_ps(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]);
 783     d.v()[1] = _mm_setr_ps(mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
 784 }
 785 template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<int>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
 786 {
 787     IndexSizeChecker<Index, Size>::check();
 788     d.v() = _mm_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]);
 789 }
 790 template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned int>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
 791 {
 792     IndexSizeChecker<Index, Size>::check();
 793     d.v() = _mm_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]);
 794 }
 795 template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<short>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
 796 {
 797     IndexSizeChecker<Index, Size>::check();
 798     d.v() = _mm_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]],
 799             mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
 800 }
 801 template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned short>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
 802 {
 803     IndexSizeChecker<Index, Size>::check();
 804     d.v() = _mm_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]],
 805                 mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
 806 }
 807
 808 #ifdef VC_USE_SET_GATHERS
 809 template<typename T> template<typename IT> Vc_ALWAYS_INLINE void Vector<T>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector<IT>) indexes, MaskArg mask)
 810 {
 811     IndexSizeChecker<Vector<IT>, Size>::check();
 812     Vector<IT> indexesTmp = indexes;
 813     indexesTmp.setZero(!static_cast<typename Vector<IT>::Mask>(mask));
 814     (*this)(mask) = Vector<T>(mem, indexesTmp);
 815 }
 816 #endif
 817
 818 #ifdef VC_USE_BSF_GATHERS
 819 #define VC_MASKED_GATHER                        \
 820     int bits = mask.toInt();                    \
 821     while (bits) {                              \
 822         const int i = _bit_scan_forward(bits);  \
 823         bits &= ~(1 << i); /* btr? */           \
 824         d.m(i) = ith_value(i);                  \
 825     }
 826 #elif defined(VC_USE_POPCNT_BSF_GATHERS)
 827 #define VC_MASKED_GATHER                        \
 828     unsigned int bits = mask.toInt();           \
 829     unsigned int low, high = 0;                 \
 830     switch (mask.count()) {             \
 831     case 8:                                     \
 832         high = _bit_scan_reverse(bits);         \
 833         d.m(high) = ith_value(high);            \
 834         high = (1 << high);                     \
 835     case 7:                                     \
 836         low = _bit_scan_forward(bits);          \
 837         bits ^= high | (1 << low);              \
 838         d.m(low) = ith_value(low);              \
 839     case 6:                                     \
 840         high = _bit_scan_reverse(bits);         \
 841         d.m(high) = ith_value(high);            \
 842         high = (1 << high);                     \
 843     case 5:                                     \
 844         low = _bit_scan_forward(bits);          \
 845         bits ^= high | (1 << low);              \
 846         d.m(low) = ith_value(low);              \
 847     case 4:                                     \
 848         high = _bit_scan_reverse(bits);         \
 849         d.m(high) = ith_value(high);            \
 850         high = (1 << high);                     \
 851     case 3:                                     \
 852         low = _bit_scan_forward(bits);          \
 853         bits ^= high | (1 << low);              \
 854         d.m(low) = ith_value(low);              \
 855     case 2:                                     \
 856         high = _bit_scan_reverse(bits);         \
 857         d.m(high) = ith_value(high);            \
 858     case 1:                                     \
 859         low = _bit_scan_forward(bits);          \
 860         d.m(low) = ith_value(low);              \
 861     case 0:                                     \
 862         break;                                  \
 863     }
 864 #else
 865 #define VC_MASKED_GATHER                        \
 866     if (mask.isEmpty()) {                       \
 867         return;                                 \
 868     }                                           \
 869     for_all_vector_entries(i,                   \
 870             if (mask[i]) d.m(i) = ith_value(i); \
 871             );
 872 #endif
 873
 874 template<typename T> template<typename Index>
 875 Vc_INTRINSIC void Vector<T>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes, MaskArg mask)
 876 {
 877     IndexSizeChecker<Index, Size>::check();
 878 #define ith_value(_i_) (mem[indexes[_i_]])
 879     VC_MASKED_GATHER
 880 #undef ith_value
 881 }
 882
 883 template<> template<typename S1, typename IT>
 884 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<double>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
 885 {
 886     IndexSizeChecker<IT, Size>::check();
 887     d.v() = _mm_setr_pd(array[indexes[0]].*(member1), array[indexes[1]].*(member1));
 888 }
 889 template<> template<typename S1, typename IT>
 890 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<float>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
 891 {
 892     IndexSizeChecker<IT, Size>::check();
 893     d.v() = _mm_setr_ps(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
 894             array[indexes[3]].*(member1));
 895 }
 896 template<> template<typename S1, typename IT>
 897 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<float8>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
 898 {
 899     IndexSizeChecker<IT, Size>::check();
 900     d.v()[0] = _mm_setr_ps(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
 901             array[indexes[3]].*(member1));
 902     d.v()[1] = _mm_setr_ps(array[indexes[4]].*(member1), array[indexes[5]].*(member1), array[indexes[6]].*(member1),
 903             array[indexes[7]].*(member1));
 904 }
 905 template<> template<typename S1, typename IT>
 906 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<int>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
 907 {
 908     IndexSizeChecker<IT, Size>::check();
 909     d.v() = _mm_setr_epi32(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
 910             array[indexes[3]].*(member1));
 911 }
 912 template<> template<typename S1, typename IT>
 913 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned int>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
 914 {
 915     IndexSizeChecker<IT, Size>::check();
 916     d.v() = _mm_setr_epi32(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
 917             array[indexes[3]].*(member1));
 918 }
 919 template<> template<typename S1, typename IT>
 920 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<short>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
 921 {
 922     IndexSizeChecker<IT, Size>::check();
 923     d.v() = _mm_setr_epi16(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
 924             array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1),
 925             array[indexes[6]].*(member1), array[indexes[7]].*(member1));
 926 }
 927 template<> template<typename S1, typename IT>
 928 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned short>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
 929 {
 930     IndexSizeChecker<IT, Size>::check();
 931     d.v() = _mm_setr_epi16(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
 932             array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1),
 933             array[indexes[6]].*(member1), array[indexes[7]].*(member1));
 934 }
 935 template<typename T> template<typename S1, typename IT>
 936 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask)
 937 {
 938     IndexSizeChecker<IT, Size>::check();
 939 #define ith_value(_i_) (array[indexes[_i_]].*(member1))
 940     VC_MASKED_GATHER
 941 #undef ith_value
 942 }
 943 template<> template<typename S1, typename S2, typename IT>
 944 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<double>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
 945 {
 946     IndexSizeChecker<IT, Size>::check();
 947     d.v() = _mm_setr_pd(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2));
 948 }
 949 template<> template<typename S1, typename S2, typename IT>
 950 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<float>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
 951 {
 952     IndexSizeChecker<IT, Size>::check();
 953     d.v() = _mm_setr_ps(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
 954             array[indexes[3]].*(member1).*(member2));
 955 }
 956 template<> template<typename S1, typename S2, typename IT>
 957 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<float8>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
 958 {
 959     IndexSizeChecker<IT, Size>::check();
 960     d.v()[0] = _mm_setr_ps(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2),
 961             array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2));
 962     d.v()[1] = _mm_setr_ps(array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
 963             array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
 964 }
 965 template<> template<typename S1, typename S2, typename IT>
 966 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<int>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
 967 {
 968     IndexSizeChecker<IT, Size>::check();
 969     d.v() = _mm_setr_epi32(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2),
 970             array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2));
 971 }
 972 template<> template<typename S1, typename S2, typename IT>
 973 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned int>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
 974 {
 975     IndexSizeChecker<IT, Size>::check();
 976     d.v() = _mm_setr_epi32(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2),
 977             array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2));
 978 }
 979 template<> template<typename S1, typename S2, typename IT>
 980 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<short>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
 981 {
 982     IndexSizeChecker<IT, Size>::check();
 983     d.v() = _mm_setr_epi16(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
 984             array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
 985             array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
 986 }
 987 template<> template<typename S1, typename S2, typename IT>
 988 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned short>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
 989 {
 990     IndexSizeChecker<IT, Size>::check();
 991     d.v() = _mm_setr_epi16(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
 992             array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
 993             array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
 994 }
 995 template<typename T> template<typename S1, typename S2, typename IT>
 996 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask)
 997 {
 998     IndexSizeChecker<IT, Size>::check();
 999 #define ith_value(_i_) (array[indexes[_i_]].*(member1).*(member2))
1000     VC_MASKED_GATHER
1001 #undef ith_value
1002 }
1003 template<> template<typename S1, typename IT1, typename IT2>
1004 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<double>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
1005 {
1006     IndexSizeChecker<IT1, Size>::check();
1007     IndexSizeChecker<IT2, Size>::check();
1008     d.v() = _mm_setr_pd((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]],
1009             (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]]);
1010 }
1011 template<> template<typename S1, typename IT1, typename IT2>
1012 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<float>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
1013 {
1014     IndexSizeChecker<IT1, Size>::check();
1015     IndexSizeChecker<IT2, Size>::check();
1016     d.v() = _mm_setr_ps((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
1017             (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]);
1018 }
1019 template<> template<typename S1, typename IT1, typename IT2>
1020 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<float8>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
1021 {
1022     IndexSizeChecker<IT1, Size>::check();
1023     IndexSizeChecker<IT2, Size>::check();
1024     d.v()[0] = _mm_setr_ps((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
1025             (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]);
1026     d.v()[1] = _mm_setr_ps((array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
1027             (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
1028 }
1029 template<> template<typename S1, typename IT1, typename IT2>
1030 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<int>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
1031 {
1032     IndexSizeChecker<IT1, Size>::check();
1033     IndexSizeChecker<IT2, Size>::check();
1034     d.v() = _mm_setr_epi32((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
1035             (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]);
1036 }
1037 template<> template<typename S1, typename IT1, typename IT2>
1038 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned int>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
1039 {
1040     IndexSizeChecker<IT1, Size>::check();
1041     IndexSizeChecker<IT2, Size>::check();
1042     d.v() = _mm_setr_epi32((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
1043             (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]);
1044 }
1045 template<> template<typename S1, typename IT1, typename IT2>
1046 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<short>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
1047 {
1048     IndexSizeChecker<IT1, Size>::check();
1049     IndexSizeChecker<IT2, Size>::check();
1050     d.v() = _mm_setr_epi16((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
1051             (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]],
1052             (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
1053             (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
1054 }
1055 template<> template<typename S1, typename IT1, typename IT2>
1056 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned short>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
1057 {
1058     IndexSizeChecker<IT1, Size>::check();
1059     IndexSizeChecker<IT2, Size>::check();
1060     d.v() = _mm_setr_epi16((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
1061             (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]],
1062             (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
1063             (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
1064 }
1065 template<typename T> template<typename S1, typename IT1, typename IT2>
1066 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask)
1067 {
1068     IndexSizeChecker<IT1, Size>::check();
1069     IndexSizeChecker<IT2, Size>::check();
1070 #define ith_value(_i_) (array[outerIndexes[_i_]].*(ptrMember1))[innerIndexes[_i_]]
1071     VC_MASKED_GATHER
1072 #undef ith_value
1073 }
1074 // scatters {{{1
#undef VC_MASKED_GATHER
// VC_MASKED_SCATTER: body shared by all masked scatter functions. The
// expansion site must provide `mask` and a function-like macro ith_value(i)
// that yields the destination lvalue for entry i. Only entries selected by the
// mask are stored.
#ifdef VC_USE_BSF_SCATTERS
// Variant 1: iterate over the set bits of the mask with bit-scan-forward.
#define VC_MASKED_SCATTER                                   \
    for (int todo = mask.toInt(); todo != 0; ) {            \
        const int lane = _bit_scan_forward(todo);           \
        ith_value(lane) = d.m(lane);                        \
        todo ^= (1 << lane); /* btr? */                     \
    }
#elif defined(VC_USE_POPCNT_BSF_SCATTERS)
// Variant 2: jump into an unrolled sequence according to the number of set
// bits; every step handles the highest and/or lowest remaining set bit and
// deliberately falls through to the next case.
#define VC_MASKED_SCATTER                                   \
    unsigned int todo = mask.toInt();                       \
    unsigned int lsb, msb = 0;                              \
    switch (mask.count()) {                                 \
    case 8:                                                 \
        msb = _bit_scan_reverse(todo);                      \
        ith_value(msb) = d.m(msb);                          \
        msb = 1 << msb;                                     \
        /* fall through */                                  \
    case 7:                                                 \
        lsb = _bit_scan_forward(todo);                      \
        ith_value(lsb) = d.m(lsb);                          \
        todo ^= msb | (1 << lsb);                           \
        /* fall through */                                  \
    case 6:                                                 \
        msb = _bit_scan_reverse(todo);                      \
        ith_value(msb) = d.m(msb);                          \
        msb = 1 << msb;                                     \
        /* fall through */                                  \
    case 5:                                                 \
        lsb = _bit_scan_forward(todo);                      \
        ith_value(lsb) = d.m(lsb);                          \
        todo ^= msb | (1 << lsb);                           \
        /* fall through */                                  \
    case 4:                                                 \
        msb = _bit_scan_reverse(todo);                      \
        ith_value(msb) = d.m(msb);                          \
        msb = 1 << msb;                                     \
        /* fall through */                                  \
    case 3:                                                 \
        lsb = _bit_scan_forward(todo);                      \
        ith_value(lsb) = d.m(lsb);                          \
        todo ^= msb | (1 << lsb);                           \
        /* fall through */                                  \
    case 2:                                                 \
        msb = _bit_scan_reverse(todo);                      \
        ith_value(msb) = d.m(msb);                          \
        /* fall through */                                  \
    case 1:                                                 \
        lsb = _bit_scan_forward(todo);                      \
        ith_value(lsb) = d.m(lsb);                          \
        /* fall through */                                  \
    case 0:                                                 \
        break;                                              \
    }
#else
// Variant 3 (default): return early on an empty mask, otherwise test every
// entry of the mask in an unrolled loop.
#define VC_MASKED_SCATTER                                   \
    if (mask.isEmpty()) {                                   \
        return;                                             \
    }                                                       \
    for_all_vector_entries(lane,                            \
            if (mask[lane]) {                               \
                ith_value(lane) = d.m(lane);                \
            }                                               \
            );
#endif
1131
1132 template<typename T> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) const
1133 {
1134     for_all_vector_entries(i,
1135             mem[indexes[i]] = d.m(i);
1136             );
1137 }
1138 template<typename T> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes, MaskArg mask) const
1139 {
1140 #define ith_value(_i_) mem[indexes[_i_]]
1141     VC_MASKED_SCATTER
1142 #undef ith_value
1143 }
1144 template<typename T> template<typename S1, typename IT> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(S1 *array, EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) const
1145 {
1146     for_all_vector_entries(i,
1147             array[indexes[i]].*(member1) = d.m(i);
1148             );
1149 }
1150 template<typename T> template<typename S1, typename IT> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(S1 *array, EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) const
1151 {
1152 #define ith_value(_i_) array[indexes[_i_]].*(member1)
1153     VC_MASKED_SCATTER
1154 #undef ith_value
1155 }
1156 template<typename T> template<typename S1, typename S2, typename IT> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) const
1157 {
1158     for_all_vector_entries(i,
1159             array[indexes[i]].*(member1).*(member2) = d.m(i);
1160             );
1161 }
1162 template<typename T> template<typename S1, typename S2, typename IT> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) const
1163 {
1164 #define ith_value(_i_) array[indexes[_i_]].*(member1).*(member2)
1165     VC_MASKED_SCATTER
1166 #undef ith_value
1167 }
1168 template<typename T> template<typename S1, typename IT1, typename IT2> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(S1 *array, EntryType *S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) const
1169 {
1170     for_all_vector_entries(i,
1171             (array[innerIndexes[i]].*(ptrMember1))[outerIndexes[i]] = d.m(i);
1172             );
1173 }
1174 template<typename T> template<typename S1, typename IT1, typename IT2> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(S1 *array, EntryType *S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask) const
1175 {
1176 #define ith_value(_i_) (array[outerIndexes[_i_]].*(ptrMember1))[innerIndexes[_i_]]
1177     VC_MASKED_SCATTER
1178 #undef ith_value
1179 }
1180
1181 ///////////////////////////////////////////////////////////////////////////////////////////
1182 // operator[] {{{1
1183 template<typename T> Vc_INTRINSIC typename Vector<T>::EntryType Vc_PURE Vector<T>::operator[](size_t index) const
1184 {
1185     return d.m(index);
1186 }
1187 #ifdef VC_GCC
1188 template<> Vc_INTRINSIC double Vc_PURE Vector<double>::operator[](size_t index) const
1189 {
1190     if (__builtin_constant_p(index)) {
1191         return extract_double_imm(d.v(), index);
1192     }
1193     return d.m(index);
1194 }
1195 template<> Vc_INTRINSIC float Vc_PURE Vector<float>::operator[](size_t index) const
1196 {
1197     return extract_float(d.v(), index);
1198 }
1199 template<> Vc_INTRINSIC float Vc_PURE Vector<float8>::operator[](size_t index) const
1200 {
1201     if (__builtin_constant_p(index)) {
1202         if (index < 4) {
1203             return extract_float_imm(d.v()[0], index);
1204         }
1205         return extract_float_imm(d.v()[1], index - 4);
1206     }
1207     return d.m(index);
1208 }
1209 template<> Vc_INTRINSIC int Vc_PURE Vector<int>::operator[](size_t index) const
1210 {
1211     if (__builtin_constant_p(index)) {
1212 #if VC_GCC >= 0x40601 || !defined(VC_USE_VEX_CODING) // GCC < 4.6.1 incorrectly uses vmovq instead of movq for the following
1213 #ifdef __x86_64__
1214         if (index == 0) return _mm_cvtsi128_si64(d.v()) & 0xFFFFFFFFull;
1215         if (index == 1) return _mm_cvtsi128_si64(d.v()) >> 32;
1216 #else
1217         if (index == 0) return _mm_cvtsi128_si32(d.v());
1218 #endif
1219 #endif
1220 #ifdef VC_IMPL_SSE4_1
1221         return _mm_extract_epi32(d.v(), index);
1222 #else
1223         return _mm_cvtsi128_si32(_mm_srli_si128(d.v(), index * 4));
1224 #endif
1225     }
1226     return d.m(index);
1227 }
1228 template<> Vc_INTRINSIC unsigned int Vc_PURE Vector<unsigned int>::operator[](size_t index) const
1229 {
1230     if (__builtin_constant_p(index)) {
1231 #if VC_GCC >= 0x40601 || !defined(VC_USE_VEX_CODING) // GCC < 4.6.1 incorrectly uses vmovq instead of movq for the following
1232 #ifdef __x86_64__
1233         if (index == 0) return _mm_cvtsi128_si64(d.v()) & 0xFFFFFFFFull;
1234         if (index == 1) return _mm_cvtsi128_si64(d.v()) >> 32;
1235 #else
1236         if (index == 0) return _mm_cvtsi128_si32(d.v());
1237 #endif
1238 #endif
1239 #ifdef VC_IMPL_SSE4_1
1240         return _mm_extract_epi32(d.v(), index);
1241 #else
1242         return _mm_cvtsi128_si32(_mm_srli_si128(d.v(), index * 4));
1243 #endif
1244     }
1245     return d.m(index);
1246 }
1247 template<> Vc_INTRINSIC short Vc_PURE Vector<short>::operator[](size_t index) const
1248 {
1249     if (__builtin_constant_p(index)) {
1250         return _mm_extract_epi16(d.v(), index);
1251     }
1252     return d.m(index);
1253 }
1254 template<> Vc_INTRINSIC unsigned short Vc_PURE Vector<unsigned short>::operator[](size_t index) const
1255 {
1256     if (__builtin_constant_p(index)) {
1257         return _mm_extract_epi16(d.v(), index);
1258     }
1259     return d.m(index);
1260 }
1261 #endif // GCC
1262 ///////////////////////////////////////////////////////////////////////////////////////////
1263 // horizontal ops {{{1
1264 #ifndef VC_IMPL_SSE4_1
// Without SSE4.1, packed integer multiplication is slow, so the scalar entries are multiplied instead.
1266 template<> Vc_INTRINSIC Vc_PURE int Vector<int>::product() const
1267 {
1268     return (d.m(0) * d.m(1)) * (d.m(2) * d.m(3));
1269 }
1270 template<> Vc_INTRINSIC Vc_PURE unsigned int Vector<unsigned int>::product() const
1271 {
1272     return (d.m(0) * d.m(1)) * (d.m(2) * d.m(3));
1273 }
1274 #endif
1275 template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T>::EntryType Vector<T>::min(MaskArg m) const
1276 {
1277     Vector<T> tmp = std::numeric_limits<Vector<T> >::max();
1278     tmp(m) = *this;
1279     return tmp.min();
1280 }
1281 template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T>::EntryType Vector<T>::max(MaskArg m) const
1282 {
1283     Vector<T> tmp = std::numeric_limits<Vector<T> >::min();
1284     tmp(m) = *this;
1285     return tmp.max();
1286 }
1287 template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T>::EntryType Vector<T>::product(MaskArg m) const
1288 {
1289     Vector<T> tmp(VectorSpecialInitializerOne::One);
1290     tmp(m) = *this;
1291     return tmp.product();
1292 }
1293 template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T>::EntryType Vector<T>::sum(MaskArg m) const
1294 {
1295     Vector<T> tmp(VectorSpecialInitializerZero::Zero);
1296     tmp(m) = *this;
1297     return tmp.sum();
1298 }
1299
1300 ///////////////////////////////////////////////////////////////////////////////////////////
1301 // copySign {{{1
1302 template<> Vc_INTRINSIC Vc_PURE Vector<float> Vector<float>::copySign(Vector<float>::AsArg reference) const
1303 {
1304     return _mm_or_ps(
1305             _mm_and_ps(reference.d.v(), _mm_setsignmask_ps()),
1306             _mm_and_ps(d.v(), _mm_setabsmask_ps())
1307             );
1308 }
1309 template<> Vc_INTRINSIC Vc_PURE Vector<float8> Vector<float8>::copySign(Vector<float8>::AsArg reference) const
1310 {
1311     return M256::create( _mm_or_ps(
1312                 _mm_and_ps(reference.d.v()[0], _mm_setsignmask_ps()),
1313                 _mm_and_ps(d.v()[0], _mm_setabsmask_ps())
1314                 ), _mm_or_ps(
1315                 _mm_and_ps(reference.d.v()[1], _mm_setsignmask_ps()),
1316                 _mm_and_ps(d.v()[1], _mm_setabsmask_ps())
1317                 )
1318             );
1319 }
1320 template<> Vc_INTRINSIC Vc_PURE Vector<double> Vector<double>::copySign(Vector<double>::AsArg reference) const
1321 {
1322     return _mm_or_pd(
1323             _mm_and_pd(reference.d.v(), _mm_setsignmask_pd()),
1324             _mm_and_pd(d.v(), _mm_setabsmask_pd())
1325             );
1326 }//}}}1
1327 // exponent {{{1
1328 template<> Vc_INTRINSIC Vc_PURE Vector<float> Vector<float>::exponent() const
1329 {
1330     VC_ASSERT((*this >= 0.f).isFull());
1331     return Internal::exponent(d.v());
1332 }
1333 template<> Vc_INTRINSIC Vc_PURE Vector<float8> Vector<float8>::exponent() const
1334 {
1335     VC_ASSERT((*this >= 0.f).isFull());
1336     return Internal::exponent(d.v());
1337 }
1338 template<> Vc_INTRINSIC Vc_PURE Vector<double> Vector<double>::exponent() const
1339 {
1340     VC_ASSERT((*this >= 0.).isFull());
1341     return Internal::exponent(d.v());
1342 }
1343 // }}}1
1344 // Random {{{1
1345 static void _doRandomStep(Vector<unsigned int> &state0,
1346         Vector<unsigned int> &state1)
1347 {
1348     state0.load(&Vc::RandomState[0]);
1349     state1.load(&Vc::RandomState[uint_v::Size]);
1350     (state1 * 0xdeece66du + 11).store(&Vc::RandomState[uint_v::Size]);
1351     uint_v(_mm_xor_si128((state0 * 0xdeece66du + 11).data(), _mm_srli_epi32(state1.data(), 16))).store(&Vc::RandomState[0]);
1352 }
1353
1354 template<typename T> Vc_ALWAYS_INLINE Vector<T> Vector<T>::Random()
1355 {
1356     Vector<unsigned int> state0, state1;
1357     _doRandomStep(state0, state1);
1358     return state0.reinterpretCast<Vector<T> >();
1359 }
1360
1361 template<> Vc_ALWAYS_INLINE Vector<float> Vector<float>::Random()
1362 {
1363     Vector<unsigned int> state0, state1;
1364     _doRandomStep(state0, state1);
1365     return _mm_sub_ps(_mm_or_ps(_mm_castsi128_ps(_mm_srli_epi32(state0.data(), 2)), HT::one()), HT::one());
1366 }
1367
1368 template<> Vc_ALWAYS_INLINE Vector<float8> Vector<float8>::Random()
1369 {
1370     Vector<unsigned int> state0, state1;
1371     _doRandomStep(state0, state1);
1372     state1 ^= state0 >> 16;
1373     return M256::create(
1374             _mm_sub_ps(_mm_or_ps(_mm_castsi128_ps(_mm_srli_epi32(state0.data(), 2)), VectorHelper<float>::one()), VectorHelper<float>::one()),
1375             _mm_sub_ps(_mm_or_ps(_mm_castsi128_ps(_mm_srli_epi32(state1.data(), 2)), VectorHelper<float>::one()), VectorHelper<float>::one())
1376             );
1377 }
1378
1379 template<> Vc_ALWAYS_INLINE Vector<double> Vector<double>::Random()
1380 {
1381     typedef unsigned long long uint64 Vc_MAY_ALIAS;
1382     uint64 state0 = *reinterpret_cast<const uint64 *>(&Vc::RandomState[8]);
1383     uint64 state1 = *reinterpret_cast<const uint64 *>(&Vc::RandomState[10]);
1384     const __m128i state = _mm_load_si128(reinterpret_cast<const __m128i *>(&Vc::RandomState[8]));
1385     *reinterpret_cast<uint64 *>(&Vc::RandomState[ 8]) = (state0 * 0x5deece66dull + 11);
1386     *reinterpret_cast<uint64 *>(&Vc::RandomState[10]) = (state1 * 0x5deece66dull + 11);
1387     return (Vector<double>(_mm_castsi128_pd(_mm_srli_epi64(state, 12))) | One()) - One();
1388 }
1389 // shifted / rotated {{{1
1390 template<typename T> Vc_INTRINSIC Vc_PURE Vector<T> Vector<T>::shifted(int amount) const
1391 {
1392     switch (amount) {
1393     case  0: return *this;
1394     case  1: return mm128_reinterpret_cast<VectorType>(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 1 * sizeof(EntryType)));
1395     case  2: return mm128_reinterpret_cast<VectorType>(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 2 * sizeof(EntryType)));
1396     case  3: return mm128_reinterpret_cast<VectorType>(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 3 * sizeof(EntryType)));
1397     case  4: return mm128_reinterpret_cast<VectorType>(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 4 * sizeof(EntryType)));
1398     case  5: return mm128_reinterpret_cast<VectorType>(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 5 * sizeof(EntryType)));
1399     case  6: return mm128_reinterpret_cast<VectorType>(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 6 * sizeof(EntryType)));
1400     case  7: return mm128_reinterpret_cast<VectorType>(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 7 * sizeof(EntryType)));
1401     case  8: return mm128_reinterpret_cast<VectorType>(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 8 * sizeof(EntryType)));
1402     case -1: return mm128_reinterpret_cast<VectorType>(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 1 * sizeof(EntryType)));
1403     case -2: return mm128_reinterpret_cast<VectorType>(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 2 * sizeof(EntryType)));
1404     case -3: return mm128_reinterpret_cast<VectorType>(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 3 * sizeof(EntryType)));
1405     case -4: return mm128_reinterpret_cast<VectorType>(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 4 * sizeof(EntryType)));
1406     case -5: return mm128_reinterpret_cast<VectorType>(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 5 * sizeof(EntryType)));
1407     case -6: return mm128_reinterpret_cast<VectorType>(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 6 * sizeof(EntryType)));
1408     case -7: return mm128_reinterpret_cast<VectorType>(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 7 * sizeof(EntryType)));
1409     case -8: return mm128_reinterpret_cast<VectorType>(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 8 * sizeof(EntryType)));
1410     }
1411     return Zero();
1412 }
1413 template<> Vc_INTRINSIC Vc_PURE sfloat_v sfloat_v::shifted(int amount) const
1414 {
1415     switch (amount) {
1416     case -7: return M256::create(_mm_setzero_ps(), _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 3 * sizeof(EntryType))));
1417     case -6: return M256::create(_mm_setzero_ps(), _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 2 * sizeof(EntryType))));
1418     case -5: return M256::create(_mm_setzero_ps(), _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 1 * sizeof(EntryType))));
1419     case -4: return M256::create(_mm_setzero_ps(), d.v()[0]);
1420     case -3: return M256::create(_mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 3 * sizeof(EntryType))), _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 1 * sizeof(EntryType))));
1421     case -2: return M256::create(_mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 2 * sizeof(EntryType))), _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 2 * sizeof(EntryType))));
1422     case -1: return M256::create(_mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 1 * sizeof(EntryType))), _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 3 * sizeof(EntryType))));
1423     case  0: return *this;
1424     case  1: return M256::create(_mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 1 * sizeof(EntryType))), _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 1 * sizeof(EntryType))));
1425     case  2: return M256::create(_mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 2 * sizeof(EntryType))), _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 2 * sizeof(EntryType))));
1426     case  3: return M256::create(_mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 3 * sizeof(EntryType))), _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 3 * sizeof(EntryType))));
1427     case  4: return M256::create(d.v()[1], _mm_setzero_ps());
1428     case  5: return M256::create(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 1 * sizeof(EntryType))), _mm_setzero_ps());
1429     case  6: return M256::create(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 2 * sizeof(EntryType))), _mm_setzero_ps());
1430     case  7: return M256::create(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 3 * sizeof(EntryType))), _mm_setzero_ps());
1431     }
1432     return Zero();
1433 }
1434 template<typename T> Vc_INTRINSIC Vc_PURE Vector<T> Vector<T>::rotated(int amount) const
1435 {
1436     const __m128i v = mm128_reinterpret_cast<__m128i>(d.v());
1437     switch (static_cast<unsigned int>(amount) % Size) {
1438     case  0: return *this;
1439     case  1: return mm128_reinterpret_cast<VectorType>(_mm_alignr_epi8(v, v, 1 * sizeof(EntryType)));
1440     case  2: return mm128_reinterpret_cast<VectorType>(_mm_alignr_epi8(v, v, 2 * sizeof(EntryType)));
1441     case  3: return mm128_reinterpret_cast<VectorType>(_mm_alignr_epi8(v, v, 3 * sizeof(EntryType)));
1442              // warning "Immediate parameter to intrinsic call too large" disabled in VcMacros.cmake.
1443              // ICC fails to see that the modulo operation (Size == sizeof(VectorType) / sizeof(EntryType))
1444              // disables the following four calls unless sizeof(EntryType) == 2.
1445     case  4: return mm128_reinterpret_cast<VectorType>(_mm_alignr_epi8(v, v, 4 * sizeof(EntryType)));
1446     case  5: return mm128_reinterpret_cast<VectorType>(_mm_alignr_epi8(v, v, 5 * sizeof(EntryType)));
1447     case  6: return mm128_reinterpret_cast<VectorType>(_mm_alignr_epi8(v, v, 6 * sizeof(EntryType)));
1448     case  7: return mm128_reinterpret_cast<VectorType>(_mm_alignr_epi8(v, v, 7 * sizeof(EntryType)));
1449     }
1450     return Zero();
1451 }
1452 template<> Vc_INTRINSIC Vc_PURE sfloat_v sfloat_v::rotated(int amount) const
1453 {
1454     const __m128i v0 = sse_cast<__m128i>(d.v()[0]);
1455     const __m128i v1 = sse_cast<__m128i>(d.v()[1]);
1456     switch (static_cast<unsigned int>(amount) % Size) {
1457     case  0: return *this;
1458     case  1: return M256::create(sse_cast<__m128>(_mm_alignr_epi8(v1, v0, 1 * sizeof(EntryType))), sse_cast<__m128>(_mm_alignr_epi8(v0, v1, 1 * sizeof(EntryType))));
1459     case  2: return M256::create(sse_cast<__m128>(_mm_alignr_epi8(v1, v0, 2 * sizeof(EntryType))), sse_cast<__m128>(_mm_alignr_epi8(v0, v1, 2 * sizeof(EntryType))));
1460     case  3: return M256::create(sse_cast<__m128>(_mm_alignr_epi8(v1, v0, 3 * sizeof(EntryType))), sse_cast<__m128>(_mm_alignr_epi8(v0, v1, 3 * sizeof(EntryType))));
1461     case  4: return M256::create(d.v()[1], d.v()[0]);
1462     case  5: return M256::create(sse_cast<__m128>(_mm_alignr_epi8(v0, v1, 1 * sizeof(EntryType))), sse_cast<__m128>(_mm_alignr_epi8(v1, v0, 1 * sizeof(EntryType))));
1463     case  6: return M256::create(sse_cast<__m128>(_mm_alignr_epi8(v0, v1, 2 * sizeof(EntryType))), sse_cast<__m128>(_mm_alignr_epi8(v1, v0, 2 * sizeof(EntryType))));
1464     case  7: return M256::create(sse_cast<__m128>(_mm_alignr_epi8(v0, v1, 3 * sizeof(EntryType))), sse_cast<__m128>(_mm_alignr_epi8(v1, v0, 3 * sizeof(EntryType))));
1465     }
1466     return Zero();
1467 }
1468 // }}}1
1469 // sorted specializations {{{1
1470 template<> inline Vc_PURE uint_v uint_v::sorted() const
1471 {
1472     __m128i x = data();
1473     __m128i y = _mm_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1));
1474     __m128i l = _mm_min_epu32(x, y);
1475     __m128i h = _mm_max_epu32(x, y);
1476     x = _mm_unpacklo_epi32(l, h);
1477     y = _mm_unpackhi_epi32(h, l);
1478
1479     // sort quads
1480     l = _mm_min_epu32(x, y);
1481     h = _mm_max_epu32(x, y);
1482     x = _mm_unpacklo_epi32(l, h);
1483     y = _mm_unpackhi_epi64(x, x);
1484
1485     l = _mm_min_epu32(x, y);
1486     h = _mm_max_epu32(x, y);
1487     return _mm_unpacklo_epi32(l, h);
1488 }
1489 template<> inline Vc_PURE ushort_v ushort_v::sorted() const
1490 {
1491     __m128i lo, hi, y, x = data();
1492     // sort pairs
1493     y = Mem::permute<X1, X0, X3, X2, X5, X4, X7, X6>(x);
1494     lo = _mm_min_epu16(x, y);
1495     hi = _mm_max_epu16(x, y);
1496     x = _mm_blend_epi16(lo, hi, 0xaa);
1497
1498     // merge left and right quads
1499     y = Mem::permute<X3, X2, X1, X0, X7, X6, X5, X4>(x);
1500     lo = _mm_min_epu16(x, y);
1501     hi = _mm_max_epu16(x, y);
1502     x = _mm_blend_epi16(lo, hi, 0xcc);
1503     y = _mm_srli_si128(x, 2);
1504     lo = _mm_min_epu16(x, y);
1505     hi = _mm_max_epu16(x, y);
1506     x = _mm_blend_epi16(lo, _mm_slli_si128(hi, 2), 0xaa);
1507
1508     // merge quads into octs
1509     y = _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2));
1510     y = _mm_shufflelo_epi16(y, _MM_SHUFFLE(0, 1, 2, 3));
1511     lo = _mm_min_epu16(x, y);
1512     hi = _mm_max_epu16(x, y);
1513
1514     x = _mm_unpacklo_epi16(lo, hi);
1515     y = _mm_srli_si128(x, 8);
1516     lo = _mm_min_epu16(x, y);
1517     hi = _mm_max_epu16(x, y);
1518
1519     x = _mm_unpacklo_epi16(lo, hi);
1520     y = _mm_srli_si128(x, 8);
1521     lo = _mm_min_epu16(x, y);
1522     hi = _mm_max_epu16(x, y);
1523
1524     return _mm_unpacklo_epi16(lo, hi);
1525 }
1526 // }}}1
1527 } // namespace SSE
1528 } // namespace Vc
1529 } // namespace AliRoot
1530
1531 #include "undomacros.h"
1532
1533 // vim: foldmethod=marker