Vc/include/Vc/avx/vector.tcc

   1 /*  This file is part of the Vc library.
   2
   3     Copyright (C) 2011-2012 Matthias Kretz <kretz@kde.org>
   4
   5     Vc is free software: you can redistribute it and/or modify
   6     it under the terms of the GNU Lesser General Public License as
   7     published by the Free Software Foundation, either version 3 of
   8     the License, or (at your option) any later version.
   9
  10     Vc is distributed in the hope that it will be useful, but
  11     WITHOUT ANY WARRANTY; without even the implied warranty of
  12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13     GNU Lesser General Public License for more details.
  14
  15     You should have received a copy of the GNU Lesser General Public
  16     License along with Vc.  If not, see <http://www.gnu.org/licenses/>.
  17
  18 */
  19
  20 #include "limits.h"
  21 #include "const.h"
  22 #include "macros.h"
  23
  24 namespace Vc
  25 {
  26 ALIGN(64) extern unsigned int RandomState[16];
  27
  28 namespace AVX
  29 {
  30
  31 ///////////////////////////////////////////////////////////////////////////////////////////
  32 // constants {{{1
  33 template<typename T> inline ALWAYS_INLINE Vector<T>::Vector(VectorSpecialInitializerZero::ZEnum) : d(HT::zero()) {}
  34 template<typename T> inline ALWAYS_INLINE Vector<T>::Vector(VectorSpecialInitializerOne::OEnum) : d(HT::one()) {}
  35 template<typename T> inline ALWAYS_INLINE Vector<T>::Vector(VectorSpecialInitializerIndexesFromZero::IEnum)
  36     : d(HV::load(IndexesFromZeroData<T>::address(), Aligned)) {}
  37
  38 template<typename T> inline Vector<T> INTRINSIC CONST Vector<T>::Zero() { return HT::zero(); }
  39 template<typename T> inline Vector<T> INTRINSIC CONST Vector<T>::One() { return HT::one(); }
  40 template<typename T> inline Vector<T> INTRINSIC CONST Vector<T>::IndexesFromZero() { return HV::load(IndexesFromZeroData<T>::address(), Aligned); }
  41
  42 template<typename T> template<typename T2> inline ALWAYS_INLINE Vector<T>::Vector(Vector<T2> x)
  43     : d(StaticCastHelper<T2, T>::cast(x.data())) {}
  44
  45 template<typename T> inline ALWAYS_INLINE Vector<T>::Vector(EntryType x) : d(HT::set(x)) {}
  46 template<> inline ALWAYS_INLINE Vector<double>::Vector(EntryType x) : d(_mm256_set1_pd(x)) {}
  47
  48
  49 ///////////////////////////////////////////////////////////////////////////////////////////
  50 // load ctors {{{1
  51 template<typename T> inline ALWAYS_INLINE Vector<T>::Vector(const EntryType *x) { load(x); }
  52 template<typename T> template<typename A> inline ALWAYS_INLINE Vector<T>::Vector(const EntryType *x, A a) { load(x, a); }
  53 template<typename T> template<typename OtherT> inline ALWAYS_INLINE Vector<T>::Vector(const OtherT *x) { load(x); }
  54 template<typename T> template<typename OtherT, typename A> inline ALWAYS_INLINE Vector<T>::Vector(const OtherT *x, A a) { load(x, a); }
  55
  56 ///////////////////////////////////////////////////////////////////////////////////////////
  57 // load member functions {{{1
  58 template<typename T> inline void INTRINSIC Vector<T>::load(const EntryType *mem)
  59 {
  60     load(mem, Aligned);
  61 }
  62
  63 template<typename T> template<typename A> inline void INTRINSIC Vector<T>::load(const EntryType *mem, A align)
  64 {
  65     d.v() = HV::load(mem, align);
  66 }
  67
  68 template<typename T> template<typename OtherT> inline void INTRINSIC Vector<T>::load(const OtherT *mem)
  69 {
  70     load(mem, Aligned);
  71 }
  72
  73 // LoadHelper {{{2
  74 template<typename DstT, typename SrcT, typename Flags> struct LoadHelper;
  75
  76 // float {{{2
  77 template<typename Flags> struct LoadHelper<float, double, Flags> {
  78     static __m256 load(const double *mem, Flags f)
  79     {
  80         return concat(_mm256_cvtpd_ps(VectorHelper<__m256d>::load(&mem[0], f)),
  81                       _mm256_cvtpd_ps(VectorHelper<__m256d>::load(&mem[4], f)));
  82     }
  83 };
  84 template<typename Flags> struct LoadHelper<float, unsigned int, Flags> {
  85     static __m256 load(const unsigned int *mem, Flags f)
  86     {
  87         return StaticCastHelper<unsigned int, float>::cast(VectorHelper<__m256i>::load(mem, f));
  88     }
  89 };
  90 template<typename Flags> struct LoadHelper<float, int, Flags> {
  91     static __m256 load(const int *mem, Flags f)
  92     {
  93         return StaticCastHelper<int, float>::cast(VectorHelper<__m256i>::load(mem, f));
  94     }
  95 };
  96 template<typename Flags> struct LoadHelper<float, unsigned short, Flags> {
  97     static __m256 load(const unsigned short *mem, Flags f)
  98     {
  99         return StaticCastHelper<unsigned short, float>::cast(VectorHelper<__m128i>::load(mem, f));
 100     }
 101 };
 102 template<typename Flags> struct LoadHelper<float, short, Flags> {
 103     static __m256 load(const short *mem, Flags f)
 104     {
 105         return StaticCastHelper<short, float>::cast(VectorHelper<__m128i>::load(mem, f));
 106     }
 107 };
 108 template<typename Flags> struct LoadHelper<float, unsigned char, Flags> {
 109     static __m256 load(const unsigned char *mem, Flags f)
 110     {
 111         return StaticCastHelper<unsigned int, float>::cast(LoadHelper<unsigned int, unsigned char, Flags>::load(mem, f));
 112     }
 113 };
 114 template<typename Flags> struct LoadHelper<float, signed char, Flags> {
 115     static __m256 load(const signed char *mem, Flags f)
 116     {
 117         return StaticCastHelper<int, float>::cast(LoadHelper<int, signed char, Flags>::load(mem, f));
 118     }
 119 };
 120
 121 template<typename SrcT, typename Flags> struct LoadHelper<sfloat, SrcT, Flags> : public LoadHelper<float, SrcT, Flags> {};
 122
 123 // int {{{2
 124 template<typename Flags> struct LoadHelper<int, unsigned int, Flags> {
 125     static __m256i load(const unsigned int *mem, Flags f)
 126     {
 127         return VectorHelper<__m256i>::load(mem, f);
 128     }
 129 };
 130 template<typename Flags> struct LoadHelper<int, unsigned short, Flags> {
 131     static __m256i load(const unsigned short *mem, Flags f)
 132     {
 133         return StaticCastHelper<unsigned short, unsigned int>::cast(VectorHelper<__m128i>::load(mem, f));
 134     }
 135 };
 136 template<typename Flags> struct LoadHelper<int, short, Flags> {
 137     static __m256i load(const short *mem, Flags f)
 138     {
 139         return StaticCastHelper<short, int>::cast(VectorHelper<__m128i>::load(mem, f));
 140     }
 141 };
 142 template<typename Flags> struct LoadHelper<int, unsigned char, Flags> {
 143     static __m256i load(const unsigned char *mem, Flags)
 144     {
 145         // the only available streaming load loads 16 bytes - twice as much as we need => can't use
 146         // it, or we risk an out-of-bounds read and an unaligned load exception
 147         const __m128i epu8 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem));
 148         const __m128i epu16 = _mm_cvtepu8_epi16(epu8);
 149         return StaticCastHelper<unsigned short, unsigned int>::cast(epu16);
 150     }
 151 };
 152 template<typename Flags> struct LoadHelper<int, signed char, Flags> {
 153     static __m256i load(const signed char *mem, Flags)
 154     {
 155         // the only available streaming load loads 16 bytes - twice as much as we need => can't use
 156         // it, or we risk an out-of-bounds read and an unaligned load exception
 157         const __m128i epi8 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem));
 158         const __m128i epi16 = _mm_cvtepi8_epi16(epi8);
 159         return StaticCastHelper<short, int>::cast(epi16);
 160     }
 161 };
 162
 163 // unsigned int {{{2
 164 template<typename Flags> struct LoadHelper<unsigned int, unsigned short, Flags> {
 165     static __m256i load(const unsigned short *mem, Flags f)
 166     {
 167         return StaticCastHelper<unsigned short, unsigned int>::cast(VectorHelper<__m128i>::load(mem, f));
 168     }
 169 };
 170 template<typename Flags> struct LoadHelper<unsigned int, unsigned char, Flags> {
 171     static __m256i load(const unsigned char *mem, Flags)
 172     {
 173         // the only available streaming load loads 16 bytes - twice as much as we need => can't use
 174         // it, or we risk an out-of-bounds read and an unaligned load exception
 175         const __m128i epu8 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem));
 176         const __m128i epu16 = _mm_cvtepu8_epi16(epu8);
 177         return StaticCastHelper<unsigned short, unsigned int>::cast(epu16);
 178     }
 179 };
 180
 181 // short {{{2
 182 template<typename Flags> struct LoadHelper<short, unsigned short, Flags> {
 183     static __m128i load(const unsigned short *mem, Flags f)
 184     {
 185         return StaticCastHelper<unsigned short, short>::cast(VectorHelper<__m128i>::load(mem, f));
 186     }
 187 };
 188 template<typename Flags> struct LoadHelper<short, unsigned char, Flags> {
 189     static __m128i load(const unsigned char *mem, Flags)
 190     {
 191         // the only available streaming load loads 16 bytes - twice as much as we need => can't use
 192         // it, or we risk an out-of-bounds read and an unaligned load exception
 193         const __m128i epu8 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem));
 194         return _mm_cvtepu8_epi16(epu8);
 195     }
 196 };
 197 template<typename Flags> struct LoadHelper<short, signed char, Flags> {
 198     static __m128i load(const signed char *mem, Flags)
 199     {
 200         // the only available streaming load loads 16 bytes - twice as much as we need => can't use
 201         // it, or we risk an out-of-bounds read and an unaligned load exception
 202         const __m128i epi8 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem));
 203         return _mm_cvtepi8_epi16(epi8);
 204     }
 205 };
 206
 207 // unsigned short {{{2
 208 template<typename Flags> struct LoadHelper<unsigned short, unsigned char, Flags> {
 209     static __m128i load(const unsigned char *mem, Flags)
 210     {
 211         // the only available streaming load loads 16 bytes - twice as much as we need => can't use
 212         // it, or we risk an out-of-bounds read and an unaligned load exception
 213         const __m128i epu8 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem));
 214         return _mm_cvtepu8_epi16(epu8);
 215     }
 216 };
 217
 218 // general load, implemented via LoadHelper {{{2
 219 template<typename DstT> template<typename SrcT, typename Flags> inline void INTRINSIC Vector<DstT>::load(const SrcT *x, Flags f)
 220 {
 221     d.v() = LoadHelper<DstT, SrcT, Flags>::load(x, f);
 222 }
 223
 224 ///////////////////////////////////////////////////////////////////////////////////////////
 225 // zeroing {{{1
 226 template<typename T> inline void INTRINSIC Vector<T>::setZero()
 227 {
 228     data() = HV::zero();
 229 }
 230 template<typename T> inline void INTRINSIC Vector<T>::setZero(const Mask &k)
 231 {
 232     data() = HV::andnot_(avx_cast<VectorType>(k.data()), data());
 233 }
 234
 235 template<> inline void INTRINSIC Vector<double>::setQnan()
 236 {
 237     data() = _mm256_setallone_pd();
 238 }
 239 template<> inline void INTRINSIC Vector<double>::setQnan(MaskArg k)
 240 {
 241     data() = _mm256_or_pd(data(), k.dataD());
 242 }
 243 template<> inline void INTRINSIC Vector<float>::setQnan()
 244 {
 245     data() = _mm256_setallone_ps();
 246 }
 247 template<> inline void INTRINSIC Vector<float>::setQnan(MaskArg k)
 248 {
 249     data() = _mm256_or_ps(data(), k.data());
 250 }
 251 template<> inline void INTRINSIC Vector<sfloat>::setQnan()
 252 {
 253     data() = _mm256_setallone_ps();
 254 }
 255 template<> inline void INTRINSIC Vector<sfloat>::setQnan(MaskArg k)
 256 {
 257     data() = _mm256_or_ps(data(), k.data());
 258 }
 259
 260 ///////////////////////////////////////////////////////////////////////////////////////////
 261 // stores {{{1
 262 template<typename T> inline void INTRINSIC Vector<T>::store(EntryType *mem) const
 263 {
 264     HV::store(mem, data(), Aligned);
 265 }
 266 template<typename T> inline void INTRINSIC Vector<T>::store(EntryType *mem, const Mask &mask) const
 267 {
 268     HV::store(mem, data(), avx_cast<VectorType>(mask.data()), Aligned);
 269 }
 270 template<typename T> template<typename A> inline void INTRINSIC Vector<T>::store(EntryType *mem, A align) const
 271 {
 272     HV::store(mem, data(), align);
 273 }
 274 template<typename T> template<typename A> inline void INTRINSIC Vector<T>::store(EntryType *mem, const Mask &mask, A align) const
 275 {
 276     HV::store(mem, data(), avx_cast<VectorType>(mask.data()), align);
 277 }
 278
 279 ///////////////////////////////////////////////////////////////////////////////////////////
 280 // expand/merge 1 float_v <=> 2 double_v          XXX rationale? remove it for release? XXX {{{1
 281 template<typename T> inline ALWAYS_INLINE FLATTEN Vector<T>::Vector(const Vector<typename HT::ConcatType> *a)
 282     : d(a[0])
 283 {
 284 }
 285 template<> inline ALWAYS_INLINE FLATTEN Vector<float>::Vector(const Vector<HT::ConcatType> *a)
 286     : d(concat(_mm256_cvtpd_ps(a[0].data()), _mm256_cvtpd_ps(a[1].data())))
 287 {
 288 }
 289 template<> inline ALWAYS_INLINE FLATTEN Vector<short>::Vector(const Vector<HT::ConcatType> *a)
 290     : d(_mm_packs_epi32(lo128(a->data()), hi128(a->data())))
 291 {
 292 }
 293 template<> inline ALWAYS_INLINE FLATTEN Vector<unsigned short>::Vector(const Vector<HT::ConcatType> *a)
 294     : d(_mm_packus_epi32(lo128(a->data()), hi128(a->data())))
 295 {
 296 }
 297 template<typename T> inline void ALWAYS_INLINE FLATTEN Vector<T>::expand(Vector<typename HT::ConcatType> *x) const
 298 {
 299     x[0] = *this;
 300 }
 301 template<> inline void ALWAYS_INLINE FLATTEN Vector<float>::expand(Vector<HT::ConcatType> *x) const
 302 {
 303     x[0].data() = _mm256_cvtps_pd(lo128(d.v()));
 304     x[1].data() = _mm256_cvtps_pd(hi128(d.v()));
 305 }
 306 template<> inline void ALWAYS_INLINE FLATTEN Vector<short>::expand(Vector<HT::ConcatType> *x) const
 307 {
 308     x[0].data() = concat(_mm_cvtepi16_epi32(d.v()),
 309             _mm_cvtepi16_epi32(_mm_unpackhi_epi64(d.v(), d.v())));
 310 }
 311 template<> inline void ALWAYS_INLINE FLATTEN Vector<unsigned short>::expand(Vector<HT::ConcatType> *x) const
 312 {
 313     x[0].data() = concat(_mm_cvtepu16_epi32(d.v()),
 314             _mm_cvtepu16_epi32(_mm_unpackhi_epi64(d.v(), d.v())));
 315 }
 316
 317 ///////////////////////////////////////////////////////////////////////////////////////////
 318 // swizzles {{{1
 319 template<typename T> inline const Vector<T> INTRINSIC CONST &Vector<T>::abcd() const { return *this; }
 320 template<typename T> inline const Vector<T> INTRINSIC CONST  Vector<T>::cdab() const { return Mem::permute<X2, X3, X0, X1>(data()); }
 321 template<typename T> inline const Vector<T> INTRINSIC CONST  Vector<T>::badc() const { return Mem::permute<X1, X0, X3, X2>(data()); }
 322 template<typename T> inline const Vector<T> INTRINSIC CONST  Vector<T>::aaaa() const { return Mem::permute<X0, X0, X0, X0>(data()); }
 323 template<typename T> inline const Vector<T> INTRINSIC CONST  Vector<T>::bbbb() const { return Mem::permute<X1, X1, X1, X1>(data()); }
 324 template<typename T> inline const Vector<T> INTRINSIC CONST  Vector<T>::cccc() const { return Mem::permute<X2, X2, X2, X2>(data()); }
 325 template<typename T> inline const Vector<T> INTRINSIC CONST  Vector<T>::dddd() const { return Mem::permute<X3, X3, X3, X3>(data()); }
 326 template<typename T> inline const Vector<T> INTRINSIC CONST  Vector<T>::bcad() const { return Mem::permute<X1, X2, X0, X3>(data()); }
 327 template<typename T> inline const Vector<T> INTRINSIC CONST  Vector<T>::bcda() const { return Mem::permute<X1, X2, X3, X0>(data()); }
 328 template<typename T> inline const Vector<T> INTRINSIC CONST  Vector<T>::dabc() const { return Mem::permute<X3, X0, X1, X2>(data()); }
 329 template<typename T> inline const Vector<T> INTRINSIC CONST  Vector<T>::acbd() const { return Mem::permute<X0, X2, X1, X3>(data()); }
 330 template<typename T> inline const Vector<T> INTRINSIC CONST  Vector<T>::dbca() const { return Mem::permute<X3, X1, X2, X0>(data()); }
 331 template<typename T> inline const Vector<T> INTRINSIC CONST  Vector<T>::dcba() const { return Mem::permute<X3, X2, X1, X0>(data()); }
 332
 333 template<> inline const double_v INTRINSIC CONST Vector<double>::cdab() const { return Mem::shuffle128<X1, X0>(data(), data()); }
 334 template<> inline const double_v INTRINSIC CONST Vector<double>::badc() const { return Mem::permute<X1, X0, X3, X2>(data()); }
 335 template<> inline const double_v INTRINSIC CONST Vector<double>::aaaa() const { const double &tmp = d.m(0); return _mm256_broadcast_sd(&tmp); }
 336 template<> inline const double_v INTRINSIC CONST Vector<double>::bbbb() const { const double &tmp = d.m(1); return _mm256_broadcast_sd(&tmp); }
 337 template<> inline const double_v INTRINSIC CONST Vector<double>::cccc() const { const double &tmp = d.m(2); return _mm256_broadcast_sd(&tmp); }
 338 template<> inline const double_v INTRINSIC CONST Vector<double>::dddd() const { const double &tmp = d.m(3); return _mm256_broadcast_sd(&tmp); }
 339 template<> inline const double_v INTRINSIC CONST Vector<double>::bcad() const { return Mem::shuffle<X1, Y0, X2, Y3>(Mem::shuffle128<X0, X0>(data(), data()), Mem::shuffle128<X1, X1>(data(), data())); }
 340 template<> inline const double_v INTRINSIC CONST Vector<double>::bcda() const { return Mem::shuffle<X1, Y0, X3, Y2>(data(), Mem::shuffle128<X1, X0>(data(), data())); }
 341 template<> inline const double_v INTRINSIC CONST Vector<double>::dabc() const { return Mem::shuffle<X1, Y0, X3, Y2>(Mem::shuffle128<X1, X0>(data(), data()), data()); }
 342 template<> inline const double_v INTRINSIC CONST Vector<double>::acbd() const { return Mem::shuffle<X0, Y0, X3, Y3>(Mem::shuffle128<X0, X0>(data(), data()), Mem::shuffle128<X1, X1>(data(), data())); }
 343 template<> inline const double_v INTRINSIC CONST Vector<double>::dbca() const { return Mem::shuffle<X1, Y1, X2, Y2>(Mem::shuffle128<X1, X1>(data(), data()), Mem::shuffle128<X0, X0>(data(), data())); }
 344 template<> inline const double_v INTRINSIC CONST Vector<double>::dcba() const { return cdab().badc(); }
 345
 346 #define VC_SWIZZLES_16BIT_IMPL(T) \
 347 template<> inline const Vector<T> INTRINSIC CONST Vector<T>::cdab() const { return Mem::permute<X2, X3, X0, X1, X6, X7, X4, X5>(data()); } \
 348 template<> inline const Vector<T> INTRINSIC CONST Vector<T>::badc() const { return Mem::permute<X1, X0, X3, X2, X5, X4, X7, X6>(data()); } \
 349 template<> inline const Vector<T> INTRINSIC CONST Vector<T>::aaaa() const { return Mem::permute<X0, X0, X0, X0, X4, X4, X4, X4>(data()); } \
 350 template<> inline const Vector<T> INTRINSIC CONST Vector<T>::bbbb() const { return Mem::permute<X1, X1, X1, X1, X5, X5, X5, X5>(data()); } \
 351 template<> inline const Vector<T> INTRINSIC CONST Vector<T>::cccc() const { return Mem::permute<X2, X2, X2, X2, X6, X6, X6, X6>(data()); } \
 352 template<> inline const Vector<T> INTRINSIC CONST Vector<T>::dddd() const { return Mem::permute<X3, X3, X3, X3, X7, X7, X7, X7>(data()); } \
 353 template<> inline const Vector<T> INTRINSIC CONST Vector<T>::bcad() const { return Mem::permute<X1, X2, X0, X3, X5, X6, X4, X7>(data()); } \
 354 template<> inline const Vector<T> INTRINSIC CONST Vector<T>::bcda() const { return Mem::permute<X1, X2, X3, X0, X5, X6, X7, X4>(data()); } \
 355 template<> inline const Vector<T> INTRINSIC CONST Vector<T>::dabc() const { return Mem::permute<X3, X0, X1, X2, X7, X4, X5, X6>(data()); } \
 356 template<> inline const Vector<T> INTRINSIC CONST Vector<T>::acbd() const { return Mem::permute<X0, X2, X1, X3, X4, X6, X5, X7>(data()); } \
 357 template<> inline const Vector<T> INTRINSIC CONST Vector<T>::dbca() const { return Mem::permute<X3, X1, X2, X0, X7, X5, X6, X4>(data()); } \
 358 template<> inline const Vector<T> INTRINSIC CONST Vector<T>::dcba() const { return Mem::permute<X3, X2, X1, X0, X7, X6, X5, X4>(data()); }
 359 VC_SWIZZLES_16BIT_IMPL(short)
 360 VC_SWIZZLES_16BIT_IMPL(unsigned short)
 361 #undef VC_SWIZZLES_16BIT_IMPL
 362
 363 ///////////////////////////////////////////////////////////////////////////////////////////
 364 // division {{{1
 365 template<typename T> inline Vector<T> &Vector<T>::operator/=(EntryType x)
 366 {
 367     if (HasVectorDivision) {
 368         return operator/=(Vector<T>(x));
 369     }
 370     for_all_vector_entries(i,
 371             d.m(i) /= x;
 372             );
 373     return *this;
 374 }
 375 template<typename T> template<typename TT> inline PURE VC_EXACT_TYPE(TT, typename DetermineEntryType<T>::Type, Vector<T>) Vector<T>::operator/(TT x) const
 376 {
 377     if (HasVectorDivision) {
 378         return operator/(Vector<T>(x));
 379     }
 380     Vector<T> r;
 381     for_all_vector_entries(i,
 382             r.d.m(i) = d.m(i) / x;
 383             );
 384     return r;
 385 }
 386 // per default fall back to scalar division
 387 template<typename T> inline Vector<T> &Vector<T>::operator/=(const Vector<T> &x)
 388 {
 389     for_all_vector_entries(i,
 390             d.m(i) /= x.d.m(i);
 391             );
 392     return *this;
 393 }
 394
 395 template<typename T> inline Vector<T> PURE Vector<T>::operator/(const Vector<T> &x) const
 396 {
 397     Vector<T> r;
 398     for_all_vector_entries(i,
 399             r.d.m(i) = d.m(i) / x.d.m(i);
 400             );
 401     return r;
 402 }
 403 // specialize division on type
 404 static inline __m256i INTRINSIC CONST divInt(__m256i a, __m256i b) {
 405     const __m256d lo1 = _mm256_cvtepi32_pd(lo128(a));
 406     const __m256d lo2 = _mm256_cvtepi32_pd(lo128(b));
 407     const __m256d hi1 = _mm256_cvtepi32_pd(hi128(a));
 408     const __m256d hi2 = _mm256_cvtepi32_pd(hi128(b));
 409     return concat(
 410             _mm256_cvttpd_epi32(_mm256_div_pd(lo1, lo2)),
 411             _mm256_cvttpd_epi32(_mm256_div_pd(hi1, hi2))
 412             );
 413 }
 414 template<> inline Vector<int> &Vector<int>::operator/=(const Vector<int> &x)
 415 {
 416     d.v() = divInt(d.v(), x.d.v());
 417     return *this;
 418 }
 419 template<> inline Vector<int> PURE Vector<int>::operator/(const Vector<int> &x) const
 420 {
 421     return divInt(d.v(), x.d.v());
 422 }
 423 static inline __m256i CONST divUInt(__m256i a, __m256i b) {
 424     __m256d loa = _mm256_cvtepi32_pd(lo128(a));
 425     __m256d hia = _mm256_cvtepi32_pd(hi128(a));
 426     __m256d lob = _mm256_cvtepi32_pd(lo128(b));
 427     __m256d hib = _mm256_cvtepi32_pd(hi128(b));
 428     // if a >= 2^31 then after conversion to double it will contain a negative number (i.e. a-2^32)
 429     // to get the right number back we have to add 2^32 where a >= 2^31
 430     loa = _mm256_add_pd(loa, _mm256_and_pd(_mm256_cmp_pd(loa, _mm256_setzero_pd(), _CMP_LT_OS), _mm256_set1_pd(4294967296.)));
 431     hia = _mm256_add_pd(hia, _mm256_and_pd(_mm256_cmp_pd(hia, _mm256_setzero_pd(), _CMP_LT_OS), _mm256_set1_pd(4294967296.)));
 432     // we don't do the same for b because division by b >= 2^31 should be a seldom corner case and
 433     // we rather want the standard stuff fast
 434     //
 435     // there is one remaining problem: a >= 2^31 and b == 1
 436     // in that case the return value would be 2^31
 437     return avx_cast<__m256i>(_mm256_blendv_ps(avx_cast<__m256>(concat(
 438                         _mm256_cvttpd_epi32(_mm256_div_pd(loa, lob)),
 439                         _mm256_cvttpd_epi32(_mm256_div_pd(hia, hib))
 440                         )), avx_cast<__m256>(a), avx_cast<__m256>(concat(
 441                             _mm_cmpeq_epi32(lo128(b), _mm_setone_epi32()),
 442                             _mm_cmpeq_epi32(hi128(b), _mm_setone_epi32())))));
 443 }
 444 template<> inline Vector<unsigned int> ALWAYS_INLINE &Vector<unsigned int>::operator/=(const Vector<unsigned int> &x)
 445 {
 446     d.v() = divUInt(d.v(), x.d.v());
 447     return *this;
 448 }
 449 template<> inline Vector<unsigned int> ALWAYS_INLINE PURE Vector<unsigned int>::operator/(const Vector<unsigned int> &x) const
 450 {
 451     return divUInt(d.v(), x.d.v());
 452 }
 453 template<typename T> static inline __m128i CONST divShort(__m128i a, __m128i b)
 454 {
 455     const __m256 r = _mm256_div_ps(StaticCastHelper<T, float>::cast(a),
 456             StaticCastHelper<T, float>::cast(b));
 457     return StaticCastHelper<float, T>::cast(r);
 458 }
 459 template<> inline Vector<short> ALWAYS_INLINE &Vector<short>::operator/=(const Vector<short> &x)
 460 {
 461     d.v() = divShort<short>(d.v(), x.d.v());
 462     return *this;
 463 }
 464 template<> inline Vector<short> ALWAYS_INLINE PURE Vector<short>::operator/(const Vector<short> &x) const
 465 {
 466     return divShort<short>(d.v(), x.d.v());
 467 }
 468 template<> inline Vector<unsigned short> ALWAYS_INLINE &Vector<unsigned short>::operator/=(const Vector<unsigned short> &x)
 469 {
 470     d.v() = divShort<unsigned short>(d.v(), x.d.v());
 471     return *this;
 472 }
 473 template<> inline Vector<unsigned short> ALWAYS_INLINE PURE Vector<unsigned short>::operator/(const Vector<unsigned short> &x) const
 474 {
 475     return divShort<unsigned short>(d.v(), x.d.v());
 476 }
 477 template<> inline Vector<float> INTRINSIC &Vector<float>::operator/=(const Vector<float> &x)
 478 {
 479     d.v() = _mm256_div_ps(d.v(), x.d.v());
 480     return *this;
 481 }
 482 template<> inline Vector<float> INTRINSIC PURE Vector<float>::operator/(const Vector<float> &x) const
 483 {
 484     return _mm256_div_ps(d.v(), x.d.v());
 485 }
 486 template<> inline Vector<double> INTRINSIC &Vector<double>::operator/=(const Vector<double> &x)
 487 {
 488     d.v() = _mm256_div_pd(d.v(), x.d.v());
 489     return *this;
 490 }
 491 template<> inline Vector<double> INTRINSIC PURE Vector<double>::operator/(const Vector<double> &x) const
 492 {
 493     return _mm256_div_pd(d.v(), x.d.v());
 494 }
 495
 496 ///////////////////////////////////////////////////////////////////////////////////////////
 497 // integer ops {{{1
 498 #define OP_IMPL(T, symbol) \
 499 template<> inline Vector<T> &Vector<T>::operator symbol##=(AsArg x) \
 500 { \
 501     for_all_vector_entries(i, d.m(i) symbol##= x.d.m(i); ); \
 502     return *this; \
 503 } \
 504 template<> inline Vector<T>  Vector<T>::operator symbol(AsArg x) const \
 505 { \
 506     Vector<T> r; \
 507     for_all_vector_entries(i, r.d.m(i) = d.m(i) symbol x.d.m(i); ); \
 508     return r; \
 509 }
 510 OP_IMPL(int, <<)
 511 OP_IMPL(int, >>)
 512 OP_IMPL(unsigned int, <<)
 513 OP_IMPL(unsigned int, >>)
 514 OP_IMPL(short, <<)
 515 OP_IMPL(short, >>)
 516 OP_IMPL(unsigned short, <<)
 517 OP_IMPL(unsigned short, >>)
 518 #undef OP_IMPL
 519
 520 template<typename T> inline Vector<T> &Vector<T>::operator>>=(int shift) {
 521     d.v() = VectorHelper<T>::shiftRight(d.v(), shift);
 522     return *static_cast<Vector<T> *>(this);
 523 }
 524 template<typename T> inline Vector<T> Vector<T>::operator>>(int shift) const {
 525     return VectorHelper<T>::shiftRight(d.v(), shift);
 526 }
 527 template<typename T> inline Vector<T> &Vector<T>::operator<<=(int shift) {
 528     d.v() = VectorHelper<T>::shiftLeft(d.v(), shift);
 529     return *static_cast<Vector<T> *>(this);
 530 }
 531 template<typename T> inline Vector<T> Vector<T>::operator<<(int shift) const {
 532     return VectorHelper<T>::shiftLeft(d.v(), shift);
 533 }
 534
 535 #define OP_IMPL(T, symbol, fun) \
 536   template<> inline Vector<T> &Vector<T>::operator symbol##=(AsArg x) { d.v() = HV::fun(d.v(), x.d.v()); return *this; } \
 537   template<> inline Vector<T>  Vector<T>::operator symbol(AsArg x) const { return Vector<T>(HV::fun(d.v(), x.d.v())); }
 538   OP_IMPL(int, &, and_)
 539   OP_IMPL(int, |, or_)
 540   OP_IMPL(int, ^, xor_)
 541   OP_IMPL(unsigned int, &, and_)
 542   OP_IMPL(unsigned int, |, or_)
 543   OP_IMPL(unsigned int, ^, xor_)
 544   OP_IMPL(short, &, and_)
 545   OP_IMPL(short, |, or_)
 546   OP_IMPL(short, ^, xor_)
 547   OP_IMPL(unsigned short, &, and_)
 548   OP_IMPL(unsigned short, |, or_)
 549   OP_IMPL(unsigned short, ^, xor_)
 550   OP_IMPL(float, &, and_)
 551   OP_IMPL(float, |, or_)
 552   OP_IMPL(float, ^, xor_)
 553   OP_IMPL(sfloat, &, and_)
 554   OP_IMPL(sfloat, |, or_)
 555   OP_IMPL(sfloat, ^, xor_)
 556   OP_IMPL(double, &, and_)
 557   OP_IMPL(double, |, or_)
 558   OP_IMPL(double, ^, xor_)
 559 #undef OP_IMPL
 560
 561 // operators {{{1
 562 #include "../common/operators.h"
 563 // gathers {{{1
 564 // Better implementation (hopefully) with _mm256_set_
 565 //X template<typename T> template<typename Index> Vector<T>::Vector(const EntryType *mem, const Index *indexes)
 566 //X {
 567 //X     for_all_vector_entries(int i,
 568 //X             d.m(i) = mem[indexes[i]];
 569 //X             );
 570 //X }
 571 template<typename T> template<typename IndexT> inline ALWAYS_INLINE Vector<T>::Vector(const EntryType *mem, const IndexT *indexes)
 572 {
 573     gather(mem, indexes);
 574 }
 575 template<typename T> template<typename IndexT> inline ALWAYS_INLINE Vector<T>::Vector(const EntryType *mem, Vector<IndexT> indexes)
 576 {
 577     gather(mem, indexes);
 578 }
 579
 580 template<typename T> template<typename IndexT> inline ALWAYS_INLINE Vector<T>::Vector(const EntryType *mem, const IndexT *indexes, MaskArg mask)
 581     : d(HT::zero())
 582 {
 583     gather(mem, indexes, mask);
 584 }
 585
 586 template<typename T> template<typename IndexT> inline ALWAYS_INLINE Vector<T>::Vector(const EntryType *mem, Vector<IndexT> indexes, MaskArg mask)
 587     : d(HT::zero())
 588 {
 589     gather(mem, indexes, mask);
 590 }
 591
 592 template<typename T> template<typename S1, typename IT> inline ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const EntryType S1::* member1, IT indexes)
 593 {
 594     gather(array, member1, indexes);
 595 }
 596 template<typename T> template<typename S1, typename IT> inline ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const EntryType S1::* member1, IT indexes, MaskArg mask)
 597     : d(HT::zero())
 598 {
 599     gather(array, member1, indexes, mask);
 600 }
 601 template<typename T> template<typename S1, typename S2, typename IT> inline ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, IT indexes)
 602 {
 603     gather(array, member1, member2, indexes);
 604 }
 605 template<typename T> template<typename S1, typename S2, typename IT> inline ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, IT indexes, MaskArg mask)
 606     : d(HT::zero())
 607 {
 608     gather(array, member1, member2, indexes, mask);
 609 }
 610 template<typename T> template<typename S1, typename IT1, typename IT2> inline ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const EntryType *const S1::* ptrMember1, IT1 outerIndexes, IT2 innerIndexes)
 611 {
 612     gather(array, ptrMember1, outerIndexes, innerIndexes);
 613 }
 614 template<typename T> template<typename S1, typename IT1, typename IT2> inline ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const EntryType *const S1::* ptrMember1, IT1 outerIndexes, IT2 innerIndexes, MaskArg mask)
 615     : d(HT::zero())
 616 {
 617     gather(array, ptrMember1, outerIndexes, innerIndexes, mask);
 618 }
 619
 620 template<typename T, size_t Size> struct IndexSizeChecker { static void check() {} };
 621 template<typename T, size_t Size> struct IndexSizeChecker<Vector<T>, Size>
 622 {
 623     static void check() {
 624         VC_STATIC_ASSERT(Vector<T>::Size >= Size, IndexVector_must_have_greater_or_equal_number_of_entries);
 625     }
 626 };
 627 template<> template<typename Index> inline void ALWAYS_INLINE FLATTEN Vector<double>::gather(const EntryType *mem, Index indexes)
 628 {
 629     IndexSizeChecker<Index, Size>::check();
 630     d.v() = _mm256_setr_pd(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]);
 631 }
 632 template<> template<typename Index> inline void ALWAYS_INLINE FLATTEN Vector<float>::gather(const EntryType *mem, Index indexes)
 633 {
 634     IndexSizeChecker<Index, Size>::check();
 635     d.v() = _mm256_setr_ps(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]],
 636             mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
 637 }
 638 template<> template<typename Index> inline void ALWAYS_INLINE FLATTEN Vector<sfloat>::gather(const EntryType *mem, Index indexes)
 639 {
 640     IndexSizeChecker<Index, Size>::check();
 641     d.v() = _mm256_setr_ps(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]],
 642             mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
 643 }
 644 template<> template<typename Index> inline void ALWAYS_INLINE FLATTEN Vector<int>::gather(const EntryType *mem, Index indexes)
 645 {
 646     IndexSizeChecker<Index, Size>::check();
 647     d.v() = _mm256_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]],
 648             mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
 649 }
 650 template<> template<typename Index> inline void ALWAYS_INLINE FLATTEN Vector<unsigned int>::gather(const EntryType *mem, Index indexes)
 651 {
 652     IndexSizeChecker<Index, Size>::check();
 653     d.v() = _mm256_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]],
 654             mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
 655 }
 656 template<> template<typename Index> inline void ALWAYS_INLINE FLATTEN Vector<short>::gather(const EntryType *mem, Index indexes)
 657 {
 658     IndexSizeChecker<Index, Size>::check();
 659     d.v() = _mm_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]],
 660             mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
 661 }
 662 template<> template<typename Index> inline void ALWAYS_INLINE FLATTEN Vector<unsigned short>::gather(const EntryType *mem, Index indexes)
 663 {
 664     IndexSizeChecker<Index, Size>::check();
 665     d.v() = _mm_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]],
 666                 mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
 667 }
 668
 669 #ifdef VC_USE_SET_GATHERS
 670 template<typename T> template<typename IT> inline void ALWAYS_INLINE Vector<T>::gather(const EntryType *mem, Vector<IT> indexes, MaskArg mask)
 671 {
 672     IndexSizeChecker<Vector<IT>, Size>::check();
 673     indexes.setZero(!mask);
 674     (*this)(mask) = Vector<T>(mem, indexes);
 675 }
 676 #endif
 677
// VC_MASKED_GATHER is the shared body of all masked gather functions below.
// It expects the following names at the point of expansion:
//   mask         - the MaskArg selecting which entries to load
//   d            - the vector storage; d.m(i) is assigned for selected entries
//   ith_value(i) - a macro, defined by the caller, yielding the source element
//                  for entry i
// Entries whose mask bit is not set are never assigned.
// One of three implementations is chosen at compile time.
#ifdef VC_USE_BSF_GATHERS
// Variant 1: walk the set bits of mask.toInt() from lowest to highest with
// _bit_scan_forward, clearing each bit after its entry has been determined.
#define VC_MASKED_GATHER                        \
    int bits = mask.toInt();                    \
    while (bits) {                              \
        const int i = _bit_scan_forward(bits);  \
        bits &= ~(1 << i); /* btr? */           \
        d.m(i) = ith_value(i);                  \
    }
#elif defined(VC_USE_POPCNT_BSF_GATHERS)
// Variant 2: branch once on the number of set bits and then fall through the
// case labels on purpose. Even-numbered cases handle the highest remaining set
// bit (_bit_scan_reverse), odd-numbered cases the lowest (_bit_scan_forward);
// the odd cases also clear both bits just handled (high is 0 when the switch
// was entered at an odd case). A population count above 8 matches no label,
// so nothing is loaded in that situation.
#define VC_MASKED_GATHER                        \
    unsigned int bits = mask.toInt();           \
    unsigned int low, high = 0;                 \
    switch (_mm_popcnt_u32(bits)) {             \
    case 8:                                     \
        high = _bit_scan_reverse(bits);         \
        d.m(high) = ith_value(high);            \
        high = (1 << high);                     \
    case 7:                                     \
        low = _bit_scan_forward(bits);          \
        bits ^= high | (1 << low);              \
        d.m(low) = ith_value(low);              \
    case 6:                                     \
        high = _bit_scan_reverse(bits);         \
        d.m(high) = ith_value(high);            \
        high = (1 << high);                     \
    case 5:                                     \
        low = _bit_scan_forward(bits);          \
        bits ^= high | (1 << low);              \
        d.m(low) = ith_value(low);              \
    case 4:                                     \
        high = _bit_scan_reverse(bits);         \
        d.m(high) = ith_value(high);            \
        high = (1 << high);                     \
    case 3:                                     \
        low = _bit_scan_forward(bits);          \
        bits ^= high | (1 << low);              \
        d.m(low) = ith_value(low);              \
    case 2:                                     \
        high = _bit_scan_reverse(bits);         \
        d.m(high) = ith_value(high);            \
    case 1:                                     \
        low = _bit_scan_forward(bits);          \
        d.m(low) = ith_value(low);              \
    case 0:                                     \
        break;                                  \
    }
#else
// Variant 3 (default): return immediately when the mask is empty, otherwise
// test mask[i] for every vector entry. Because of the embedded `return;` this
// variant may only be expanded in a function returning void.
#define VC_MASKED_GATHER                        \
    if (mask.isEmpty()) {                       \
        return;                                 \
    }                                           \
    for_all_vector_entries(i,                   \
            if (mask[i]) d.m(i) = ith_value(i); \
            );
#endif
 733
 734 template<typename T> template<typename Index>
 735 inline void INTRINSIC Vector<T>::gather(const EntryType *mem, Index indexes, MaskArg mask)
 736 {
 737     IndexSizeChecker<Index, Size>::check();
 738 #define ith_value(_i_) (mem[indexes[_i_]])
 739     VC_MASKED_GATHER
 740 #undef ith_value
 741 }
 742
 743 template<> template<typename S1, typename IT>
 744 inline void ALWAYS_INLINE FLATTEN Vector<double>::gather(const S1 *array, const EntryType S1::* member1, IT indexes)
 745 {
 746     IndexSizeChecker<IT, Size>::check();
 747     d.v() = _mm256_setr_pd(array[indexes[0]].*(member1), array[indexes[1]].*(member1),
 748             array[indexes[2]].*(member1), array[indexes[3]].*(member1));
 749 }
 750 template<> template<typename S1, typename IT>
 751 inline void ALWAYS_INLINE FLATTEN Vector<float>::gather(const S1 *array, const EntryType S1::* member1, IT indexes)
 752 {
 753     IndexSizeChecker<IT, Size>::check();
 754     d.v() = _mm256_setr_ps(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
 755             array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1),
 756             array[indexes[6]].*(member1), array[indexes[7]].*(member1));
 757 }
 758 template<> template<typename S1, typename IT>
 759 inline void ALWAYS_INLINE FLATTEN Vector<sfloat>::gather(const S1 *array, const EntryType S1::* member1, IT indexes)
 760 {
 761     IndexSizeChecker<IT, Size>::check();
 762     d.v() = _mm256_setr_ps(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
 763             array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1),
 764             array[indexes[6]].*(member1), array[indexes[7]].*(member1));
 765 }
 766 template<> template<typename S1, typename IT>
 767 inline void ALWAYS_INLINE FLATTEN Vector<int>::gather(const S1 *array, const EntryType S1::* member1, IT indexes)
 768 {
 769     IndexSizeChecker<IT, Size>::check();
 770     d.v() = _mm256_setr_epi32(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
 771             array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1),
 772             array[indexes[6]].*(member1), array[indexes[7]].*(member1));
 773 }
 774 template<> template<typename S1, typename IT>
 775 inline void ALWAYS_INLINE FLATTEN Vector<unsigned int>::gather(const S1 *array, const EntryType S1::* member1, IT indexes)
 776 {
 777     IndexSizeChecker<IT, Size>::check();
 778     d.v() = _mm256_setr_epi32(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
 779             array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1),
 780             array[indexes[6]].*(member1), array[indexes[7]].*(member1));
 781 }
 782 template<> template<typename S1, typename IT>
 783 inline void ALWAYS_INLINE FLATTEN Vector<short>::gather(const S1 *array, const EntryType S1::* member1, IT indexes)
 784 {
 785     IndexSizeChecker<IT, Size>::check();
 786     d.v() = _mm_setr_epi16(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
 787             array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1),
 788             array[indexes[6]].*(member1), array[indexes[7]].*(member1));
 789 }
 790 template<> template<typename S1, typename IT>
 791 inline void ALWAYS_INLINE FLATTEN Vector<unsigned short>::gather(const S1 *array, const EntryType S1::* member1, IT indexes)
 792 {
 793     IndexSizeChecker<IT, Size>::check();
 794     d.v() = _mm_setr_epi16(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
 795             array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1),
 796             array[indexes[6]].*(member1), array[indexes[7]].*(member1));
 797 }
 798 template<typename T> template<typename S1, typename IT>
 799 inline void ALWAYS_INLINE FLATTEN Vector<T>::gather(const S1 *array, const EntryType S1::* member1, IT indexes, MaskArg mask)
 800 {
 801     IndexSizeChecker<IT, Size>::check();
 802 #define ith_value(_i_) (array[indexes[_i_]].*(member1))
 803     VC_MASKED_GATHER
 804 #undef ith_value
 805 }
 806 template<> template<typename S1, typename S2, typename IT>
 807 inline void ALWAYS_INLINE FLATTEN Vector<double>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, IT indexes)
 808 {
 809     IndexSizeChecker<IT, Size>::check();
 810     d.v() = _mm256_setr_pd(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2),
 811             array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2));
 812 }
 813 template<> template<typename S1, typename S2, typename IT>
 814 inline void ALWAYS_INLINE FLATTEN Vector<float>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, IT indexes)
 815 {
 816     IndexSizeChecker<IT, Size>::check();
 817     d.v() = _mm256_setr_ps(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
 818             array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
 819             array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
 820 }
 821 template<> template<typename S1, typename S2, typename IT>
 822 inline void ALWAYS_INLINE FLATTEN Vector<sfloat>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, IT indexes)
 823 {
 824     IndexSizeChecker<IT, Size>::check();
 825     d.v() = _mm256_setr_ps(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
 826             array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
 827             array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
 828 }
 829 template<> template<typename S1, typename S2, typename IT>
 830 inline void ALWAYS_INLINE FLATTEN Vector<int>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, IT indexes)
 831 {
 832     IndexSizeChecker<IT, Size>::check();
 833     d.v() = _mm256_setr_epi32(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
 834             array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
 835             array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
 836 }
 837 template<> template<typename S1, typename S2, typename IT>
 838 inline void ALWAYS_INLINE FLATTEN Vector<unsigned int>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, IT indexes)
 839 {
 840     IndexSizeChecker<IT, Size>::check();
 841     d.v() = _mm256_setr_epi32(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
 842             array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
 843             array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
 844 }
 845 template<> template<typename S1, typename S2, typename IT>
 846 inline void ALWAYS_INLINE FLATTEN Vector<short>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, IT indexes)
 847 {
 848     IndexSizeChecker<IT, Size>::check();
 849     d.v() = _mm_setr_epi16(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
 850             array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
 851             array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
 852 }
 853 template<> template<typename S1, typename S2, typename IT>
 854 inline void ALWAYS_INLINE FLATTEN Vector<unsigned short>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, IT indexes)
 855 {
 856     IndexSizeChecker<IT, Size>::check();
 857     d.v() = _mm_setr_epi16(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
 858             array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
 859             array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
 860 }
 861 template<typename T> template<typename S1, typename S2, typename IT>
 862 inline void ALWAYS_INLINE FLATTEN Vector<T>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, IT indexes, MaskArg mask)
 863 {
 864     IndexSizeChecker<IT, Size>::check();
 865 #define ith_value(_i_) (array[indexes[_i_]].*(member1).*(member2))
 866     VC_MASKED_GATHER
 867 #undef ith_value
 868 }
 869 template<> template<typename S1, typename IT1, typename IT2>
 870 inline void ALWAYS_INLINE FLATTEN Vector<double>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, IT1 outerIndexes, IT2 innerIndexes)
 871 {
 872     IndexSizeChecker<IT1, Size>::check();
 873     IndexSizeChecker<IT2, Size>::check();
 874     d.v() = _mm256_setr_pd((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
 875             (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]);
 876 }
 877 template<> template<typename S1, typename IT1, typename IT2>
 878 inline void ALWAYS_INLINE FLATTEN Vector<float>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, IT1 outerIndexes, IT2 innerIndexes)
 879 {
 880     IndexSizeChecker<IT1, Size>::check();
 881     IndexSizeChecker<IT2, Size>::check();
 882     d.v() = _mm256_setr_ps((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
 883             (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]],
 884             (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
 885             (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
 886 }
 887 template<> template<typename S1, typename IT1, typename IT2>
 888 inline void ALWAYS_INLINE FLATTEN Vector<sfloat>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, IT1 outerIndexes, IT2 innerIndexes)
 889 {
 890     IndexSizeChecker<IT1, Size>::check();
 891     IndexSizeChecker<IT2, Size>::check();
 892     d.v() = _mm256_setr_ps((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
 893             (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]],
 894             (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
 895             (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
 896 }
 897 template<> template<typename S1, typename IT1, typename IT2>
 898 inline void ALWAYS_INLINE FLATTEN Vector<int>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, IT1 outerIndexes, IT2 innerIndexes)
 899 {
 900     IndexSizeChecker<IT1, Size>::check();
 901     IndexSizeChecker<IT2, Size>::check();
 902     d.v() = _mm256_setr_epi32((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
 903             (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]],
 904             (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
 905             (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
 906 }
 907 template<> template<typename S1, typename IT1, typename IT2>
 908 inline void ALWAYS_INLINE FLATTEN Vector<unsigned int>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, IT1 outerIndexes, IT2 innerIndexes)
 909 {
 910     IndexSizeChecker<IT1, Size>::check();
 911     IndexSizeChecker<IT2, Size>::check();
 912     d.v() = _mm256_setr_epi32((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
 913             (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]],
 914             (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
 915             (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
 916 }
 917 template<> template<typename S1, typename IT1, typename IT2>
 918 inline void ALWAYS_INLINE FLATTEN Vector<short>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, IT1 outerIndexes, IT2 innerIndexes)
 919 {
 920     IndexSizeChecker<IT1, Size>::check();
 921     IndexSizeChecker<IT2, Size>::check();
 922     d.v() = _mm_setr_epi16((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
 923             (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]],
 924             (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
 925             (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
 926 }
 927 template<> template<typename S1, typename IT1, typename IT2>
 928 inline void ALWAYS_INLINE FLATTEN Vector<unsigned short>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, IT1 outerIndexes, IT2 innerIndexes)
 929 {
 930     IndexSizeChecker<IT1, Size>::check();
 931     IndexSizeChecker<IT2, Size>::check();
 932     d.v() = _mm_setr_epi16((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
 933             (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]],
 934             (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
 935             (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
 936 }
 937 template<typename T> template<typename S1, typename IT1, typename IT2>
 938 inline void ALWAYS_INLINE FLATTEN Vector<T>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, IT1 outerIndexes, IT2 innerIndexes, MaskArg mask)
 939 {
 940     IndexSizeChecker<IT1, Size>::check();
 941     IndexSizeChecker<IT2, Size>::check();
 942 #define ith_value(_i_) (array[outerIndexes[_i_]].*(ptrMember1))[innerIndexes[_i_]]
 943     VC_MASKED_GATHER
 944 #undef ith_value
 945 }
 946
// The gather helper macro is no longer needed past this point.
#undef VC_MASKED_GATHER
// VC_MASKED_SCATTER is the shared body of all masked scatter functions below.
// It expects the following names at the point of expansion:
//   mask         - the MaskArg selecting which entries to store
//   d            - the vector storage; d.m(i) is read for selected entries
//   ith_value(i) - a macro, defined by the caller, yielding the destination
//                  lvalue for entry i
// Destinations whose mask bit is not set are never written.
// One of three implementations is chosen at compile time.
#ifdef VC_USE_BSF_SCATTERS
// Variant 1: walk the set bits of mask.toInt() from lowest to highest with
// _bit_scan_forward; the XOR clears the bit that was just found.
#define VC_MASKED_SCATTER                       \
    int bits = mask.toInt();                    \
    while (bits) {                              \
        const int i = _bit_scan_forward(bits);  \
        bits ^= (1 << i); /* btr? */            \
        ith_value(i) = d.m(i);                  \
    }
#elif defined(VC_USE_POPCNT_BSF_SCATTERS)
// Variant 2: branch once on the number of set bits and then fall through the
// case labels on purpose. Even-numbered cases handle the highest remaining set
// bit (_bit_scan_reverse), odd-numbered cases the lowest (_bit_scan_forward);
// the odd cases also clear both bits just handled (high is 0 when the switch
// was entered at an odd case). A population count above 8 matches no label,
// so nothing is stored in that situation.
#define VC_MASKED_SCATTER                       \
    unsigned int bits = mask.toInt();           \
    unsigned int low, high = 0;                 \
    switch (_mm_popcnt_u32(bits)) {             \
    case 8:                                     \
        high = _bit_scan_reverse(bits);         \
        ith_value(high) = d.m(high);            \
        high = (1 << high);                     \
    case 7:                                     \
        low = _bit_scan_forward(bits);          \
        bits ^= high | (1 << low);              \
        ith_value(low) = d.m(low);              \
    case 6:                                     \
        high = _bit_scan_reverse(bits);         \
        ith_value(high) = d.m(high);            \
        high = (1 << high);                     \
    case 5:                                     \
        low = _bit_scan_forward(bits);          \
        bits ^= high | (1 << low);              \
        ith_value(low) = d.m(low);              \
    case 4:                                     \
        high = _bit_scan_reverse(bits);         \
        ith_value(high) = d.m(high);            \
        high = (1 << high);                     \
    case 3:                                     \
        low = _bit_scan_forward(bits);          \
        bits ^= high | (1 << low);              \
        ith_value(low) = d.m(low);              \
    case 2:                                     \
        high = _bit_scan_reverse(bits);         \
        ith_value(high) = d.m(high);            \
    case 1:                                     \
        low = _bit_scan_forward(bits);          \
        ith_value(low) = d.m(low);              \
    case 0:                                     \
        break;                                  \
    }
#else
// Variant 3 (default): return immediately when the mask is empty, otherwise
// test mask[i] for every vector entry. Because of the embedded `return;` this
// variant may only be expanded in a function returning void.
#define VC_MASKED_SCATTER                       \
    if (mask.isEmpty()) {                       \
        return;                                 \
    }                                           \
    for_all_vector_entries(i,                   \
            if (mask[i]) ith_value(i) = d.m(i); \
            );
#endif
1003
1004 template<typename T> template<typename Index> inline void ALWAYS_INLINE FLATTEN Vector<T>::scatter(EntryType *mem, Index indexes) const
1005 {
1006     for_all_vector_entries(i,
1007             mem[indexes[i]] = d.m(i);
1008             );
1009 }
1010 template<typename T> template<typename Index> inline void ALWAYS_INLINE FLATTEN Vector<T>::scatter(EntryType *mem, Index indexes, MaskArg mask) const
1011 {
1012 #define ith_value(_i_) mem[indexes[_i_]]
1013     VC_MASKED_SCATTER
1014 #undef ith_value
1015 }
1016 template<typename T> template<typename S1, typename IT> inline void ALWAYS_INLINE FLATTEN Vector<T>::scatter(S1 *array, EntryType S1::* member1, IT indexes) const
1017 {
1018     for_all_vector_entries(i,
1019             array[indexes[i]].*(member1) = d.m(i);
1020             );
1021 }
1022 template<typename T> template<typename S1, typename IT> inline void ALWAYS_INLINE FLATTEN Vector<T>::scatter(S1 *array, EntryType S1::* member1, IT indexes, MaskArg mask) const
1023 {
1024 #define ith_value(_i_) array[indexes[_i_]].*(member1)
1025     VC_MASKED_SCATTER
1026 #undef ith_value
1027 }
1028 template<typename T> template<typename S1, typename S2, typename IT> inline void ALWAYS_INLINE FLATTEN Vector<T>::scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, IT indexes) const
1029 {
1030     for_all_vector_entries(i,
1031             array[indexes[i]].*(member1).*(member2) = d.m(i);
1032             );
1033 }
1034 template<typename T> template<typename S1, typename S2, typename IT> inline void ALWAYS_INLINE FLATTEN Vector<T>::scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, IT indexes, MaskArg mask) const
1035 {
1036 #define ith_value(_i_) array[indexes[_i_]].*(member1).*(member2)
1037     VC_MASKED_SCATTER
1038 #undef ith_value
1039 }
1040 template<typename T> template<typename S1, typename IT1, typename IT2> inline void ALWAYS_INLINE FLATTEN Vector<T>::scatter(S1 *array, EntryType *S1::* ptrMember1, IT1 outerIndexes, IT2 innerIndexes) const
1041 {
1042     for_all_vector_entries(i,
1043             (array[innerIndexes[i]].*(ptrMember1))[outerIndexes[i]] = d.m(i);
1044             );
1045 }
1046 template<typename T> template<typename S1, typename IT1, typename IT2> inline void ALWAYS_INLINE FLATTEN Vector<T>::scatter(S1 *array, EntryType *S1::* ptrMember1, IT1 outerIndexes, IT2 innerIndexes, MaskArg mask) const
1047 {
1048 #define ith_value(_i_) (array[outerIndexes[_i_]].*(ptrMember1))[innerIndexes[_i_]]
1049     VC_MASKED_SCATTER
1050 #undef ith_value
1051 }
1052
1053 ///////////////////////////////////////////////////////////////////////////////////////////
1054 // operator- {{{1
1055 template<> inline Vector<double> PURE ALWAYS_INLINE FLATTEN Vector<double>::operator-() const
1056 {
1057     return _mm256_xor_pd(d.v(), _mm256_setsignmask_pd());
1058 }
1059 template<> inline Vector<float> PURE ALWAYS_INLINE FLATTEN Vector<float>::operator-() const
1060 {
1061     return _mm256_xor_ps(d.v(), _mm256_setsignmask_ps());
1062 }
1063 template<> inline Vector<sfloat> PURE ALWAYS_INLINE FLATTEN Vector<sfloat>::operator-() const
1064 {
1065     return _mm256_xor_ps(d.v(), _mm256_setsignmask_ps());
1066 }
1067 template<> inline Vector<int> PURE ALWAYS_INLINE FLATTEN Vector<int>::operator-() const
1068 {
1069     return _mm256_sign_epi32(d.v(), _mm256_setallone_si256());
1070 }
1071 template<> inline Vector<int> PURE ALWAYS_INLINE FLATTEN Vector<unsigned int>::operator-() const
1072 {
1073     return _mm256_sign_epi32(d.v(), _mm256_setallone_si256());
1074 }
1075 template<> inline Vector<short> PURE ALWAYS_INLINE FLATTEN Vector<short>::operator-() const
1076 {
1077     return _mm_sign_epi16(d.v(), _mm_setallone_si128());
1078 }
1079 template<> inline Vector<short> PURE ALWAYS_INLINE FLATTEN Vector<unsigned short>::operator-() const
1080 {
1081     return _mm_sign_epi16(d.v(), _mm_setallone_si128());
1082 }
1083
1084 ///////////////////////////////////////////////////////////////////////////////////////////
1085 // horizontal ops {{{1
1086 template<typename T> inline typename Vector<T>::EntryType Vector<T>::min(MaskArg m) const
1087 {
1088     Vector<T> tmp = std::numeric_limits<Vector<T> >::max();
1089     tmp(m) = *this;
1090     return tmp.min();
1091 }
1092 template<typename T> inline typename Vector<T>::EntryType Vector<T>::max(MaskArg m) const
1093 {
1094     Vector<T> tmp = std::numeric_limits<Vector<T> >::min();
1095     tmp(m) = *this;
1096     return tmp.max();
1097 }
1098 template<typename T> inline typename Vector<T>::EntryType Vector<T>::product(MaskArg m) const
1099 {
1100     Vector<T> tmp(VectorSpecialInitializerOne::One);
1101     tmp(m) = *this;
1102     return tmp.product();
1103 }
1104 template<typename T> inline typename Vector<T>::EntryType Vector<T>::sum(MaskArg m) const
1105 {
1106     Vector<T> tmp(VectorSpecialInitializerZero::Zero);
1107     tmp(m) = *this;
1108     return tmp.sum();
1109 }//}}}
1110 // copySign {{{1
1111 template<> inline Vector<float> INTRINSIC Vector<float>::copySign(Vector<float>::AsArg reference) const
1112 {
1113     return _mm256_or_ps(
1114             _mm256_and_ps(reference.d.v(), _mm256_setsignmask_ps()),
1115             _mm256_and_ps(d.v(), _mm256_setabsmask_ps())
1116             );
1117 }
1118 template<> inline Vector<sfloat> INTRINSIC Vector<sfloat>::copySign(Vector<sfloat>::AsArg reference) const
1119 {
1120     return _mm256_or_ps(
1121             _mm256_and_ps(reference.d.v(), _mm256_setsignmask_ps()),
1122             _mm256_and_ps(d.v(), _mm256_setabsmask_ps())
1123             );
1124 }
1125 template<> inline Vector<double> INTRINSIC Vector<double>::copySign(Vector<double>::AsArg reference) const
1126 {
1127     return _mm256_or_pd(
1128             _mm256_and_pd(reference.d.v(), _mm256_setsignmask_pd()),
1129             _mm256_and_pd(d.v(), _mm256_setabsmask_pd())
1130             );
1131 }//}}}1
1132 // exponent {{{1
1133 template<> inline Vector<float> INTRINSIC Vector<float>::exponent() const
1134 {
1135     VC_ASSERT((*this > 0.f).isFull());
1136     __m128i tmp0 = _mm_srli_epi32(avx_cast<__m128i>(d.v()), 23);
1137     __m128i tmp1 = _mm_srli_epi32(avx_cast<__m128i>(hi128(d.v())), 23);
1138     tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x7f));
1139     tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x7f));
1140     return _mm256_cvtepi32_ps(concat(tmp0, tmp1));
1141 }
1142 template<> inline Vector<sfloat> INTRINSIC Vector<sfloat>::exponent() const
1143 {
1144     VC_ASSERT((*this > 0.f).isFull());
1145     __m128i tmp0 = _mm_srli_epi32(avx_cast<__m128i>(d.v()), 23);
1146     __m128i tmp1 = _mm_srli_epi32(avx_cast<__m128i>(hi128(d.v())), 23);
1147     tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x7f));
1148     tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x7f));
1149     return _mm256_cvtepi32_ps(concat(tmp0, tmp1));
1150 }
1151 template<> inline Vector<double> INTRINSIC Vector<double>::exponent() const
1152 {
1153     VC_ASSERT((*this > 0.).isFull());
1154     __m128i tmp0 = _mm_srli_epi64(avx_cast<__m128i>(d.v()), 52);
1155     __m128i tmp1 = _mm_srli_epi64(avx_cast<__m128i>(hi128(d.v())), 52);
1156     tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x3ff));
1157     tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x3ff));
1158     return _mm256_cvtepi32_pd(avx_cast<__m128i>(Mem::shuffle<X0, X2, Y0, Y2>(avx_cast<__m128>(tmp0), avx_cast<__m128>(tmp1))));
1159 }
1160 // }}}1
1161 // Random {{{1
1162 static inline ALWAYS_INLINE void _doRandomStep(Vector<unsigned int> &state0,
1163         Vector<unsigned int> &state1)
1164 {
1165     state0.load(&Vc::RandomState[0]);
1166     state1.load(&Vc::RandomState[uint_v::Size]);
1167     (state1 * 0xdeece66du + 11).store(&Vc::RandomState[uint_v::Size]);
1168     uint_v(_mm256_xor_si256((state0 * 0xdeece66du + 11).data(), _mm256_srli_epi32(state1.data(), 16))).store(&Vc::RandomState[0]);
1169 }
1170
1171 template<typename T> inline ALWAYS_INLINE Vector<T> Vector<T>::Random()
1172 {
1173     Vector<unsigned int> state0, state1;
1174     _doRandomStep(state0, state1);
1175     return state0.reinterpretCast<Vector<T> >();
1176 }
1177
1178 template<> inline ALWAYS_INLINE Vector<float> Vector<float>::Random()
1179 {
1180     Vector<unsigned int> state0, state1;
1181     _doRandomStep(state0, state1);
1182     return HT::sub(HV::or_(_cast(_mm256_srli_epi32(state0.data(), 2)), HT::one()), HT::one());
1183 }
1184
1185 template<> inline ALWAYS_INLINE Vector<sfloat> Vector<sfloat>::Random()
1186 {
1187     Vector<unsigned int> state0, state1;
1188     _doRandomStep(state0, state1);
1189     return HT::sub(HV::or_(_cast(_mm256_srli_epi32(state0.data(), 2)), HT::one()), HT::one());
1190 }
1191
1192 template<> inline ALWAYS_INLINE Vector<double> Vector<double>::Random()
1193 {
1194     const __m256i state = VectorHelper<__m256i>::load(&Vc::RandomState[0], Vc::Aligned);
1195     for (size_t k = 0; k < 8; k += 2) {
1196         typedef unsigned long long uint64 MAY_ALIAS;
1197         const uint64 stateX = *reinterpret_cast<const uint64 *>(&Vc::RandomState[k]);
1198         *reinterpret_cast<uint64 *>(&Vc::RandomState[k]) = (stateX * 0x5deece66dull + 11);
1199     }
1200     return (Vector<double>(_cast(_mm256_srli_epi64(state, 12))) | One()) - One();
1201 }
1202 // }}}1
1203 } // namespace AVX
1204 } // namespace Vc
1205
1206 #include "undomacros.h"
1207
1208 // vim: foldmethod=marker