/*  This file is part of the Vc library.

    Copyright (C) 2011-2012 Matthias Kretz <kretz@kde.org>

    Vc is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as
    published by the Free Software Foundation, either version 3 of
    the License, or (at your option) any later version.

    Vc is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with Vc. If not, see <http://www.gnu.org/licenses/>.

*/
ALIGN(64) extern unsigned int RandomState[16];

///////////////////////////////////////////////////////////////////////////////////////////
template<typename T> inline ALWAYS_INLINE Vector<T>::Vector(VectorSpecialInitializerZero::ZEnum) : d(HT::zero()) {}
template<typename T> inline ALWAYS_INLINE Vector<T>::Vector(VectorSpecialInitializerOne::OEnum) : d(HT::one()) {}
template<typename T> inline ALWAYS_INLINE Vector<T>::Vector(VectorSpecialInitializerIndexesFromZero::IEnum)
    : d(HV::load(IndexesFromZeroData<T>::address(), Aligned)) {}

template<typename T> inline Vector<T> INTRINSIC CONST Vector<T>::Zero() { return HT::zero(); }
template<typename T> inline Vector<T> INTRINSIC CONST Vector<T>::One() { return HT::one(); }
template<typename T> inline Vector<T> INTRINSIC CONST Vector<T>::IndexesFromZero() { return HV::load(IndexesFromZeroData<T>::address(), Aligned); }

template<typename T> template<typename T2> inline ALWAYS_INLINE Vector<T>::Vector(Vector<T2> x)
    : d(StaticCastHelper<T2, T>::cast(x.data())) {}

template<typename T> inline ALWAYS_INLINE Vector<T>::Vector(EntryType x) : d(HT::set(x)) {}
template<> inline ALWAYS_INLINE Vector<double>::Vector(EntryType x) : d(_mm256_set1_pd(x)) {}

///////////////////////////////////////////////////////////////////////////////////////////
template<typename T> inline ALWAYS_INLINE Vector<T>::Vector(const EntryType *x) { load(x); }
template<typename T> template<typename A> inline ALWAYS_INLINE Vector<T>::Vector(const EntryType *x, A a) { load(x, a); }
template<typename T> template<typename OtherT> inline ALWAYS_INLINE Vector<T>::Vector(const OtherT *x) { load(x); }
template<typename T> template<typename OtherT, typename A> inline ALWAYS_INLINE Vector<T>::Vector(const OtherT *x, A a) { load(x, a); }

///////////////////////////////////////////////////////////////////////////////////////////
// load member functions {{{1
template<typename T> inline void INTRINSIC Vector<T>::load(const EntryType *mem)
template<typename T> template<typename A> inline void INTRINSIC Vector<T>::load(const EntryType *mem, A align)
    d.v() = HV::load(mem, align);
template<typename T> template<typename OtherT> inline void INTRINSIC Vector<T>::load(const OtherT *mem)

template<typename DstT, typename SrcT, typename Flags> struct LoadHelper;

template<typename Flags> struct LoadHelper<float, double, Flags> {
    static __m256 load(const double *mem, Flags f)
        return concat(_mm256_cvtpd_ps(VectorHelper<__m256d>::load(&mem[0], f)),
                      _mm256_cvtpd_ps(VectorHelper<__m256d>::load(&mem[4], f)));
template<typename Flags> struct LoadHelper<float, unsigned int, Flags> {
    static __m256 load(const unsigned int *mem, Flags f)
        return StaticCastHelper<unsigned int, float>::cast(VectorHelper<__m256i>::load(mem, f));
template<typename Flags> struct LoadHelper<float, int, Flags> {
    static __m256 load(const int *mem, Flags f)
        return StaticCastHelper<int, float>::cast(VectorHelper<__m256i>::load(mem, f));
template<typename Flags> struct LoadHelper<float, unsigned short, Flags> {
    static __m256 load(const unsigned short *mem, Flags f)
        return StaticCastHelper<unsigned short, float>::cast(VectorHelper<__m128i>::load(mem, f));
template<typename Flags> struct LoadHelper<float, short, Flags> {
    static __m256 load(const short *mem, Flags f)
        return StaticCastHelper<short, float>::cast(VectorHelper<__m128i>::load(mem, f));
template<typename Flags> struct LoadHelper<float, unsigned char, Flags> {
    static __m256 load(const unsigned char *mem, Flags f)
        return StaticCastHelper<unsigned int, float>::cast(LoadHelper<unsigned int, unsigned char, Flags>::load(mem, f));
template<typename Flags> struct LoadHelper<float, signed char, Flags> {
    static __m256 load(const signed char *mem, Flags f)
        return StaticCastHelper<int, float>::cast(LoadHelper<int, signed char, Flags>::load(mem, f));

template<typename SrcT, typename Flags> struct LoadHelper<sfloat, SrcT, Flags> : public LoadHelper<float, SrcT, Flags> {};

template<typename Flags> struct LoadHelper<int, unsigned int, Flags> {
    static __m256i load(const unsigned int *mem, Flags f)
        return VectorHelper<__m256i>::load(mem, f);
template<typename Flags> struct LoadHelper<int, unsigned short, Flags> {
    static __m256i load(const unsigned short *mem, Flags f)
        return StaticCastHelper<unsigned short, unsigned int>::cast(VectorHelper<__m128i>::load(mem, f));
template<typename Flags> struct LoadHelper<int, short, Flags> {
    static __m256i load(const short *mem, Flags f)
        return StaticCastHelper<short, int>::cast(VectorHelper<__m128i>::load(mem, f));
template<typename Flags> struct LoadHelper<int, unsigned char, Flags> {
    static __m256i load(const unsigned char *mem, Flags)
        // the only available streaming load loads 16 bytes - twice as much as we need => can't use
        // it, or we risk an out-of-bounds read and an unaligned load exception
        const __m128i epu8 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem));
        const __m128i epu16 = _mm_cvtepu8_epi16(epu8);
        return StaticCastHelper<unsigned short, unsigned int>::cast(epu16);
template<typename Flags> struct LoadHelper<int, signed char, Flags> {
    static __m256i load(const signed char *mem, Flags)
        // the only available streaming load loads 16 bytes - twice as much as we need => can't use
        // it, or we risk an out-of-bounds read and an unaligned load exception
        const __m128i epi8 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem));
        const __m128i epi16 = _mm_cvtepi8_epi16(epi8);
        return StaticCastHelper<short, int>::cast(epi16);

template<typename Flags> struct LoadHelper<unsigned int, unsigned short, Flags> {
    static __m256i load(const unsigned short *mem, Flags f)
        return StaticCastHelper<unsigned short, unsigned int>::cast(VectorHelper<__m128i>::load(mem, f));
template<typename Flags> struct LoadHelper<unsigned int, unsigned char, Flags> {
    static __m256i load(const unsigned char *mem, Flags)
        // the only available streaming load loads 16 bytes - twice as much as we need => can't use
        // it, or we risk an out-of-bounds read and an unaligned load exception
        const __m128i epu8 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem));
        const __m128i epu16 = _mm_cvtepu8_epi16(epu8);
        return StaticCastHelper<unsigned short, unsigned int>::cast(epu16);

template<typename Flags> struct LoadHelper<short, unsigned short, Flags> {
    static __m128i load(const unsigned short *mem, Flags f)
        return StaticCastHelper<unsigned short, short>::cast(VectorHelper<__m128i>::load(mem, f));
template<typename Flags> struct LoadHelper<short, unsigned char, Flags> {
    static __m128i load(const unsigned char *mem, Flags)
        // the only available streaming load loads 16 bytes - twice as much as we need => can't use
        // it, or we risk an out-of-bounds read and an unaligned load exception
        const __m128i epu8 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem));
        return _mm_cvtepu8_epi16(epu8);
template<typename Flags> struct LoadHelper<short, signed char, Flags> {
    static __m128i load(const signed char *mem, Flags)
        // the only available streaming load loads 16 bytes - twice as much as we need => can't use
        // it, or we risk an out-of-bounds read and an unaligned load exception
        const __m128i epi8 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem));
        return _mm_cvtepi8_epi16(epi8);

// unsigned short {{{2
template<typename Flags> struct LoadHelper<unsigned short, unsigned char, Flags> {
    static __m128i load(const unsigned char *mem, Flags)
        // the only available streaming load loads 16 bytes - twice as much as we need => can't use
        // it, or we risk an out-of-bounds read and an unaligned load exception
        const __m128i epu8 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem));
        return _mm_cvtepu8_epi16(epu8);
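// All of the 8-bit widening loads above follow the same pattern: fetch only the eight bytes that
// are actually needed with _mm_loadl_epi64 and widen them with _mm_cvtepu8_epi16/_mm_cvtepi8_epi16.
// A rough scalar sketch of what, e.g., LoadHelper<int, unsigned char>::load computes (illustration
// only; dst is a hypothetical destination array, mem is the function's byte pointer):
//   for (int i = 0; i < 8; ++i) dst[i] = static_cast<int>(mem[i]);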
// general load, implemented via LoadHelper {{{2
template<typename DstT> template<typename SrcT, typename Flags> inline void INTRINSIC Vector<DstT>::load(const SrcT *x, Flags f)
    d.v() = LoadHelper<DstT, SrcT, Flags>::load(x, f);

///////////////////////////////////////////////////////////////////////////////////////////
template<typename T> inline void INTRINSIC Vector<T>::setZero()
template<typename T> inline void INTRINSIC Vector<T>::setZero(const Mask &k)
    data() = HV::andnot_(avx_cast<VectorType>(k.data()), data());

template<> inline void INTRINSIC Vector<double>::setQnan()
    data() = _mm256_setallone_pd();
template<> inline void INTRINSIC Vector<double>::setQnan(MaskArg k)
    data() = _mm256_or_pd(data(), k.dataD());
template<> inline void INTRINSIC Vector<float>::setQnan()
    data() = _mm256_setallone_ps();
template<> inline void INTRINSIC Vector<float>::setQnan(MaskArg k)
    data() = _mm256_or_ps(data(), k.data());
template<> inline void INTRINSIC Vector<sfloat>::setQnan()
    data() = _mm256_setallone_ps();
template<> inline void INTRINSIC Vector<sfloat>::setQnan(MaskArg k)
    data() = _mm256_or_ps(data(), k.data());

///////////////////////////////////////////////////////////////////////////////////////////
template<typename T> inline void INTRINSIC Vector<T>::store(EntryType *mem) const
    HV::store(mem, data(), Aligned);
template<typename T> inline void INTRINSIC Vector<T>::store(EntryType *mem, const Mask &mask) const
    HV::store(mem, data(), avx_cast<VectorType>(mask.data()), Aligned);
template<typename T> template<typename A> inline void INTRINSIC Vector<T>::store(EntryType *mem, A align) const
    HV::store(mem, data(), align);
template<typename T> template<typename A> inline void INTRINSIC Vector<T>::store(EntryType *mem, const Mask &mask, A align) const
    HV::store(mem, data(), avx_cast<VectorType>(mask.data()), align);

///////////////////////////////////////////////////////////////////////////////////////////
// expand/merge 1 float_v <=> 2 double_v XXX rationale? remove it for release? XXX {{{1
template<typename T> inline ALWAYS_INLINE FLATTEN Vector<T>::Vector(const Vector<typename HT::ConcatType> *a)
template<> inline ALWAYS_INLINE FLATTEN Vector<float>::Vector(const Vector<HT::ConcatType> *a)
    : d(concat(_mm256_cvtpd_ps(a[0].data()), _mm256_cvtpd_ps(a[1].data())))
template<> inline ALWAYS_INLINE FLATTEN Vector<short>::Vector(const Vector<HT::ConcatType> *a)
    : d(_mm_packs_epi32(lo128(a->data()), hi128(a->data())))
template<> inline ALWAYS_INLINE FLATTEN Vector<unsigned short>::Vector(const Vector<HT::ConcatType> *a)
    : d(_mm_packus_epi32(lo128(a->data()), hi128(a->data())))
template<typename T> inline void ALWAYS_INLINE FLATTEN Vector<T>::expand(Vector<typename HT::ConcatType> *x) const
template<> inline void ALWAYS_INLINE FLATTEN Vector<float>::expand(Vector<HT::ConcatType> *x) const
    x[0].data() = _mm256_cvtps_pd(lo128(d.v()));
    x[1].data() = _mm256_cvtps_pd(hi128(d.v()));
template<> inline void ALWAYS_INLINE FLATTEN Vector<short>::expand(Vector<HT::ConcatType> *x) const
    x[0].data() = concat(_mm_cvtepi16_epi32(d.v()),
                         _mm_cvtepi16_epi32(_mm_unpackhi_epi64(d.v(), d.v())));
template<> inline void ALWAYS_INLINE FLATTEN Vector<unsigned short>::expand(Vector<HT::ConcatType> *x) const
    x[0].data() = concat(_mm_cvtepu16_epi32(d.v()),
                         _mm_cvtepu16_epi32(_mm_unpackhi_epi64(d.v(), d.v())));

///////////////////////////////////////////////////////////////////////////////////////////
template<typename T> inline const Vector<T> INTRINSIC CONST &Vector<T>::abcd() const { return *this; }
template<typename T> inline const Vector<T> INTRINSIC CONST Vector<T>::cdab() const { return Mem::permute<X2, X3, X0, X1>(data()); }
template<typename T> inline const Vector<T> INTRINSIC CONST Vector<T>::badc() const { return Mem::permute<X1, X0, X3, X2>(data()); }
template<typename T> inline const Vector<T> INTRINSIC CONST Vector<T>::aaaa() const { return Mem::permute<X0, X0, X0, X0>(data()); }
template<typename T> inline const Vector<T> INTRINSIC CONST Vector<T>::bbbb() const { return Mem::permute<X1, X1, X1, X1>(data()); }
template<typename T> inline const Vector<T> INTRINSIC CONST Vector<T>::cccc() const { return Mem::permute<X2, X2, X2, X2>(data()); }
template<typename T> inline const Vector<T> INTRINSIC CONST Vector<T>::dddd() const { return Mem::permute<X3, X3, X3, X3>(data()); }
template<typename T> inline const Vector<T> INTRINSIC CONST Vector<T>::bcad() const { return Mem::permute<X1, X2, X0, X3>(data()); }
template<typename T> inline const Vector<T> INTRINSIC CONST Vector<T>::bcda() const { return Mem::permute<X1, X2, X3, X0>(data()); }
template<typename T> inline const Vector<T> INTRINSIC CONST Vector<T>::dabc() const { return Mem::permute<X3, X0, X1, X2>(data()); }
template<typename T> inline const Vector<T> INTRINSIC CONST Vector<T>::acbd() const { return Mem::permute<X0, X2, X1, X3>(data()); }
template<typename T> inline const Vector<T> INTRINSIC CONST Vector<T>::dbca() const { return Mem::permute<X3, X1, X2, X0>(data()); }
template<typename T> inline const Vector<T> INTRINSIC CONST Vector<T>::dcba() const { return Mem::permute<X3, X2, X1, X0>(data()); }

template<> inline const double_v INTRINSIC CONST Vector<double>::cdab() const { return Mem::shuffle128<X1, X0>(data(), data()); }
template<> inline const double_v INTRINSIC CONST Vector<double>::badc() const { return Mem::permute<X1, X0, X3, X2>(data()); }
template<> inline const double_v INTRINSIC CONST Vector<double>::aaaa() const { const double &tmp = d.m(0); return _mm256_broadcast_sd(&tmp); }
template<> inline const double_v INTRINSIC CONST Vector<double>::bbbb() const { const double &tmp = d.m(1); return _mm256_broadcast_sd(&tmp); }
template<> inline const double_v INTRINSIC CONST Vector<double>::cccc() const { const double &tmp = d.m(2); return _mm256_broadcast_sd(&tmp); }
template<> inline const double_v INTRINSIC CONST Vector<double>::dddd() const { const double &tmp = d.m(3); return _mm256_broadcast_sd(&tmp); }
template<> inline const double_v INTRINSIC CONST Vector<double>::bcad() const { return Mem::shuffle<X1, Y0, X2, Y3>(Mem::shuffle128<X0, X0>(data(), data()), Mem::shuffle128<X1, X1>(data(), data())); }
template<> inline const double_v INTRINSIC CONST Vector<double>::bcda() const { return Mem::shuffle<X1, Y0, X3, Y2>(data(), Mem::shuffle128<X1, X0>(data(), data())); }
template<> inline const double_v INTRINSIC CONST Vector<double>::dabc() const { return Mem::shuffle<X1, Y0, X3, Y2>(Mem::shuffle128<X1, X0>(data(), data()), data()); }
template<> inline const double_v INTRINSIC CONST Vector<double>::acbd() const { return Mem::shuffle<X0, Y0, X3, Y3>(Mem::shuffle128<X0, X0>(data(), data()), Mem::shuffle128<X1, X1>(data(), data())); }
template<> inline const double_v INTRINSIC CONST Vector<double>::dbca() const { return Mem::shuffle<X1, Y1, X2, Y2>(Mem::shuffle128<X1, X1>(data(), data()), Mem::shuffle128<X0, X0>(data(), data())); }
template<> inline const double_v INTRINSIC CONST Vector<double>::dcba() const { return cdab().badc(); }

#define VC_SWIZZLES_16BIT_IMPL(T) \
template<> inline const Vector<T> INTRINSIC CONST Vector<T>::cdab() const { return Mem::permute<X2, X3, X0, X1, X6, X7, X4, X5>(data()); } \
template<> inline const Vector<T> INTRINSIC CONST Vector<T>::badc() const { return Mem::permute<X1, X0, X3, X2, X5, X4, X7, X6>(data()); } \
template<> inline const Vector<T> INTRINSIC CONST Vector<T>::aaaa() const { return Mem::permute<X0, X0, X0, X0, X4, X4, X4, X4>(data()); } \
template<> inline const Vector<T> INTRINSIC CONST Vector<T>::bbbb() const { return Mem::permute<X1, X1, X1, X1, X5, X5, X5, X5>(data()); } \
template<> inline const Vector<T> INTRINSIC CONST Vector<T>::cccc() const { return Mem::permute<X2, X2, X2, X2, X6, X6, X6, X6>(data()); } \
template<> inline const Vector<T> INTRINSIC CONST Vector<T>::dddd() const { return Mem::permute<X3, X3, X3, X3, X7, X7, X7, X7>(data()); } \
template<> inline const Vector<T> INTRINSIC CONST Vector<T>::bcad() const { return Mem::permute<X1, X2, X0, X3, X5, X6, X4, X7>(data()); } \
template<> inline const Vector<T> INTRINSIC CONST Vector<T>::bcda() const { return Mem::permute<X1, X2, X3, X0, X5, X6, X7, X4>(data()); } \
template<> inline const Vector<T> INTRINSIC CONST Vector<T>::dabc() const { return Mem::permute<X3, X0, X1, X2, X7, X4, X5, X6>(data()); } \
template<> inline const Vector<T> INTRINSIC CONST Vector<T>::acbd() const { return Mem::permute<X0, X2, X1, X3, X4, X6, X5, X7>(data()); } \
template<> inline const Vector<T> INTRINSIC CONST Vector<T>::dbca() const { return Mem::permute<X3, X1, X2, X0, X7, X5, X6, X4>(data()); } \
template<> inline const Vector<T> INTRINSIC CONST Vector<T>::dcba() const { return Mem::permute<X3, X2, X1, X0, X7, X6, X5, X4>(data()); }
VC_SWIZZLES_16BIT_IMPL(short)
VC_SWIZZLES_16BIT_IMPL(unsigned short)
#undef VC_SWIZZLES_16BIT_IMPL

///////////////////////////////////////////////////////////////////////////////////////////
template<typename T> inline Vector<T> &Vector<T>::operator/=(EntryType x)
    if (HasVectorDivision) {
        return operator/=(Vector<T>(x));
    for_all_vector_entries(i,
template<typename T> template<typename TT> inline PURE VC_EXACT_TYPE(TT, typename DetermineEntryType<T>::Type, Vector<T>) Vector<T>::operator/(TT x) const
    if (HasVectorDivision) {
        return operator/(Vector<T>(x));
    for_all_vector_entries(i,
        r.d.m(i) = d.m(i) / x;
// by default, fall back to scalar division
template<typename T> inline Vector<T> &Vector<T>::operator/=(const Vector<T> &x)
    for_all_vector_entries(i,
template<typename T> inline Vector<T> PURE Vector<T>::operator/(const Vector<T> &x) const
    for_all_vector_entries(i,
        r.d.m(i) = d.m(i) / x.d.m(i);
// specialize division on type
static inline __m256i INTRINSIC CONST divInt(__m256i a, __m256i b) {
    const __m256d lo1 = _mm256_cvtepi32_pd(lo128(a));
    const __m256d lo2 = _mm256_cvtepi32_pd(lo128(b));
    const __m256d hi1 = _mm256_cvtepi32_pd(hi128(a));
    const __m256d hi2 = _mm256_cvtepi32_pd(hi128(b));
        _mm256_cvttpd_epi32(_mm256_div_pd(lo1, lo2)),
        _mm256_cvttpd_epi32(_mm256_div_pd(hi1, hi2))
template<> inline Vector<int> &Vector<int>::operator/=(const Vector<int> &x)
    d.v() = divInt(d.v(), x.d.v());
template<> inline Vector<int> PURE Vector<int>::operator/(const Vector<int> &x) const
    return divInt(d.v(), x.d.v());
static inline __m256i CONST divUInt(__m256i a, __m256i b) {
    __m256d loa = _mm256_cvtepi32_pd(lo128(a));
    __m256d hia = _mm256_cvtepi32_pd(hi128(a));
    __m256d lob = _mm256_cvtepi32_pd(lo128(b));
    __m256d hib = _mm256_cvtepi32_pd(hi128(b));
    // if a >= 2^31 then after conversion to double it will contain a negative number (i.e. a-2^32)
    // to get the right number back we have to add 2^32 where a >= 2^31
    loa = _mm256_add_pd(loa, _mm256_and_pd(_mm256_cmp_pd(loa, _mm256_setzero_pd(), _CMP_LT_OS), _mm256_set1_pd(4294967296.)));
    hia = _mm256_add_pd(hia, _mm256_and_pd(_mm256_cmp_pd(hia, _mm256_setzero_pd(), _CMP_LT_OS), _mm256_set1_pd(4294967296.)));
    // we do not apply the same correction to b because b >= 2^31 is a rare corner case and we
    // want the common case to stay fast
    // one remaining problem: if a >= 2^31 and b == 1, the truncating conversion back to int would
    // return 2^31 instead of a, so the blend below keeps the original a wherever b == 1
    return avx_cast<__m256i>(_mm256_blendv_ps(avx_cast<__m256>(concat(
        _mm256_cvttpd_epi32(_mm256_div_pd(loa, lob)),
        _mm256_cvttpd_epi32(_mm256_div_pd(hia, hib))
        )), avx_cast<__m256>(a), avx_cast<__m256>(concat(
        _mm_cmpeq_epi32(lo128(b), _mm_setone_epi32()),
        _mm_cmpeq_epi32(hi128(b), _mm_setone_epi32())))));
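// A worked example for the correction above (illustration only): for a == 3000000000u,
// _mm256_cvtepi32_pd sees the signed value -1294967296 (== a - 2^32); the compare against zero is
// then true, 4294967296.0 is added back, and the lane holds 3000000000.0 again before the division.
// The final blend keeps the original a wherever b == 1, because a result >= 2^31 would not survive
// the truncating conversion back through _mm256_cvttpd_epi32.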
template<> inline Vector<unsigned int> ALWAYS_INLINE &Vector<unsigned int>::operator/=(const Vector<unsigned int> &x)
    d.v() = divUInt(d.v(), x.d.v());
template<> inline Vector<unsigned int> ALWAYS_INLINE PURE Vector<unsigned int>::operator/(const Vector<unsigned int> &x) const
    return divUInt(d.v(), x.d.v());
template<typename T> static inline __m128i CONST divShort(__m128i a, __m128i b)
    const __m256 r = _mm256_div_ps(StaticCastHelper<T, float>::cast(a),
                                   StaticCastHelper<T, float>::cast(b));
    return StaticCastHelper<float, T>::cast(r);
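// divShort can use single precision because every 16-bit integer value is exactly representable in
// a float's 24-bit mantissa, so the two conversions are lossless; only the division itself is
// rounded before converting back.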
template<> inline Vector<short> ALWAYS_INLINE &Vector<short>::operator/=(const Vector<short> &x)
    d.v() = divShort<short>(d.v(), x.d.v());
template<> inline Vector<short> ALWAYS_INLINE PURE Vector<short>::operator/(const Vector<short> &x) const
    return divShort<short>(d.v(), x.d.v());
template<> inline Vector<unsigned short> ALWAYS_INLINE &Vector<unsigned short>::operator/=(const Vector<unsigned short> &x)
    d.v() = divShort<unsigned short>(d.v(), x.d.v());
template<> inline Vector<unsigned short> ALWAYS_INLINE PURE Vector<unsigned short>::operator/(const Vector<unsigned short> &x) const
    return divShort<unsigned short>(d.v(), x.d.v());
template<> inline Vector<float> INTRINSIC &Vector<float>::operator/=(const Vector<float> &x)
    d.v() = _mm256_div_ps(d.v(), x.d.v());
template<> inline Vector<float> INTRINSIC PURE Vector<float>::operator/(const Vector<float> &x) const
    return _mm256_div_ps(d.v(), x.d.v());
template<> inline Vector<double> INTRINSIC &Vector<double>::operator/=(const Vector<double> &x)
    d.v() = _mm256_div_pd(d.v(), x.d.v());
template<> inline Vector<double> INTRINSIC PURE Vector<double>::operator/(const Vector<double> &x) const
    return _mm256_div_pd(d.v(), x.d.v());

///////////////////////////////////////////////////////////////////////////////////////////
#define OP_IMPL(T, symbol) \
template<> inline Vector<T> &Vector<T>::operator symbol##=(AsArg x) \
    for_all_vector_entries(i, d.m(i) symbol##= x.d.m(i); ); \
template<> inline Vector<T> Vector<T>::operator symbol(AsArg x) const \
    for_all_vector_entries(i, r.d.m(i) = d.m(i) symbol x.d.m(i); ); \
OP_IMPL(unsigned int, <<)
OP_IMPL(unsigned int, >>)
OP_IMPL(unsigned short, <<)
OP_IMPL(unsigned short, >>)

template<typename T> inline Vector<T> &Vector<T>::operator>>=(int shift) {
    d.v() = VectorHelper<T>::shiftRight(d.v(), shift);
    return *static_cast<Vector<T> *>(this);
template<typename T> inline Vector<T> Vector<T>::operator>>(int shift) const {
    return VectorHelper<T>::shiftRight(d.v(), shift);
template<typename T> inline Vector<T> &Vector<T>::operator<<=(int shift) {
    d.v() = VectorHelper<T>::shiftLeft(d.v(), shift);
    return *static_cast<Vector<T> *>(this);
template<typename T> inline Vector<T> Vector<T>::operator<<(int shift) const {
    return VectorHelper<T>::shiftLeft(d.v(), shift);

#define OP_IMPL(T, symbol, fun) \
template<> inline Vector<T> &Vector<T>::operator symbol##=(AsArg x) { d.v() = HV::fun(d.v(), x.d.v()); return *this; } \
template<> inline Vector<T> Vector<T>::operator symbol(AsArg x) const { return Vector<T>(HV::fun(d.v(), x.d.v())); }
OP_IMPL(int, &, and_)
OP_IMPL(int, ^, xor_)
OP_IMPL(unsigned int, &, and_)
OP_IMPL(unsigned int, |, or_)
OP_IMPL(unsigned int, ^, xor_)
OP_IMPL(short, &, and_)
OP_IMPL(short, |, or_)
OP_IMPL(short, ^, xor_)
OP_IMPL(unsigned short, &, and_)
OP_IMPL(unsigned short, |, or_)
OP_IMPL(unsigned short, ^, xor_)
OP_IMPL(float, &, and_)
OP_IMPL(float, |, or_)
OP_IMPL(float, ^, xor_)
OP_IMPL(sfloat, &, and_)
OP_IMPL(sfloat, |, or_)
OP_IMPL(sfloat, ^, xor_)
OP_IMPL(double, &, and_)
OP_IMPL(double, |, or_)
OP_IMPL(double, ^, xor_)

#include "../common/operators.h"

// Better implementation (hopefully) with _mm256_set_
//X template<typename T> template<typename Index> Vector<T>::Vector(const EntryType *mem, const Index *indexes)
//X     for_all_vector_entries(int i,
//X             d.m(i) = mem[indexes[i]];
template<typename T> template<typename IndexT> inline ALWAYS_INLINE Vector<T>::Vector(const EntryType *mem, const IndexT *indexes)
    gather(mem, indexes);
template<typename T> template<typename IndexT> inline ALWAYS_INLINE Vector<T>::Vector(const EntryType *mem, Vector<IndexT> indexes)
    gather(mem, indexes);

template<typename T> template<typename IndexT> inline ALWAYS_INLINE Vector<T>::Vector(const EntryType *mem, const IndexT *indexes, MaskArg mask)
    gather(mem, indexes, mask);

template<typename T> template<typename IndexT> inline ALWAYS_INLINE Vector<T>::Vector(const EntryType *mem, Vector<IndexT> indexes, MaskArg mask)
    gather(mem, indexes, mask);

template<typename T> template<typename S1, typename IT> inline ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const EntryType S1::* member1, IT indexes)
    gather(array, member1, indexes);
template<typename T> template<typename S1, typename IT> inline ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const EntryType S1::* member1, IT indexes, MaskArg mask)
    gather(array, member1, indexes, mask);
template<typename T> template<typename S1, typename S2, typename IT> inline ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, IT indexes)
    gather(array, member1, member2, indexes);
template<typename T> template<typename S1, typename S2, typename IT> inline ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, IT indexes, MaskArg mask)
    gather(array, member1, member2, indexes, mask);
template<typename T> template<typename S1, typename IT1, typename IT2> inline ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const EntryType *const S1::* ptrMember1, IT1 outerIndexes, IT2 innerIndexes)
    gather(array, ptrMember1, outerIndexes, innerIndexes);
template<typename T> template<typename S1, typename IT1, typename IT2> inline ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const EntryType *const S1::* ptrMember1, IT1 outerIndexes, IT2 innerIndexes, MaskArg mask)
    gather(array, ptrMember1, outerIndexes, innerIndexes, mask);

template<typename T, size_t Size> struct IndexSizeChecker { static void check() {} };
template<typename T, size_t Size> struct IndexSizeChecker<Vector<T>, Size>
    static void check() {
        VC_STATIC_ASSERT(Vector<T>::Size >= Size, IndexVector_must_have_greater_or_equal_number_of_entries);
template<> template<typename Index> inline void ALWAYS_INLINE FLATTEN Vector<double>::gather(const EntryType *mem, Index indexes)
    IndexSizeChecker<Index, Size>::check();
    d.v() = _mm256_setr_pd(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]);
template<> template<typename Index> inline void ALWAYS_INLINE FLATTEN Vector<float>::gather(const EntryType *mem, Index indexes)
    IndexSizeChecker<Index, Size>::check();
    d.v() = _mm256_setr_ps(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]],
            mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
template<> template<typename Index> inline void ALWAYS_INLINE FLATTEN Vector<sfloat>::gather(const EntryType *mem, Index indexes)
    IndexSizeChecker<Index, Size>::check();
    d.v() = _mm256_setr_ps(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]],
            mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
template<> template<typename Index> inline void ALWAYS_INLINE FLATTEN Vector<int>::gather(const EntryType *mem, Index indexes)
    IndexSizeChecker<Index, Size>::check();
    d.v() = _mm256_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]],
            mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
template<> template<typename Index> inline void ALWAYS_INLINE FLATTEN Vector<unsigned int>::gather(const EntryType *mem, Index indexes)
    IndexSizeChecker<Index, Size>::check();
    d.v() = _mm256_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]],
            mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
template<> template<typename Index> inline void ALWAYS_INLINE FLATTEN Vector<short>::gather(const EntryType *mem, Index indexes)
    IndexSizeChecker<Index, Size>::check();
    d.v() = _mm_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]],
            mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
template<> template<typename Index> inline void ALWAYS_INLINE FLATTEN Vector<unsigned short>::gather(const EntryType *mem, Index indexes)
    IndexSizeChecker<Index, Size>::check();
    d.v() = _mm_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]],
            mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);

#ifdef VC_USE_SET_GATHERS
template<typename T> template<typename IT> inline void ALWAYS_INLINE Vector<T>::gather(const EntryType *mem, Vector<IT> indexes, MaskArg mask)
    IndexSizeChecker<Vector<IT>, Size>::check();
    indexes.setZero(!mask);
    (*this)(mask) = Vector<T>(mem, indexes);

#ifdef VC_USE_BSF_GATHERS
#define VC_MASKED_GATHER \
    int bits = mask.toInt(); \
        const int i = _bit_scan_forward(bits); \
        bits &= ~(1 << i); /* btr? */ \
        d.m(i) = ith_value(i); \
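// Sketch of how the bit-scan loop above walks the mask (hypothetical value): for
// mask.toInt() == 0x26 (0b100110), _bit_scan_forward yields 1, then 2, then 5, so exactly the
// active lanes 1, 2 and 5 are gathered; clearing each bit via "bits &= ~(1 << i)" terminates the
// loop once bits reaches zero.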
#elif defined(VC_USE_POPCNT_BSF_GATHERS)
#define VC_MASKED_GATHER \
    unsigned int bits = mask.toInt(); \
    unsigned int low, high = 0; \
    switch (_mm_popcnt_u32(bits)) { \
        high = _bit_scan_reverse(bits); \
        d.m(high) = ith_value(high); \
        high = (1 << high); \
        low = _bit_scan_forward(bits); \
        bits ^= high | (1 << low); \
        d.m(low) = ith_value(low); \
        high = _bit_scan_reverse(bits); \
        d.m(high) = ith_value(high); \
        high = (1 << high); \
        low = _bit_scan_forward(bits); \
        bits ^= high | (1 << low); \
        d.m(low) = ith_value(low); \
        high = _bit_scan_reverse(bits); \
        d.m(high) = ith_value(high); \
        high = (1 << high); \
        low = _bit_scan_forward(bits); \
        bits ^= high | (1 << low); \
        d.m(low) = ith_value(low); \
        high = _bit_scan_reverse(bits); \
        d.m(high) = ith_value(high); \
        low = _bit_scan_forward(bits); \
        d.m(low) = ith_value(low); \
#else
#define VC_MASKED_GATHER \
    if (mask.isEmpty()) { \
    for_all_vector_entries(i, \
        if (mask[i]) d.m(i) = ith_value(i); \

template<typename T> template<typename Index>
inline void INTRINSIC Vector<T>::gather(const EntryType *mem, Index indexes, MaskArg mask)
    IndexSizeChecker<Index, Size>::check();
#define ith_value(_i_) (mem[indexes[_i_]])

template<> template<typename S1, typename IT>
inline void ALWAYS_INLINE FLATTEN Vector<double>::gather(const S1 *array, const EntryType S1::* member1, IT indexes)
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm256_setr_pd(array[indexes[0]].*(member1), array[indexes[1]].*(member1),
            array[indexes[2]].*(member1), array[indexes[3]].*(member1));
template<> template<typename S1, typename IT>
inline void ALWAYS_INLINE FLATTEN Vector<float>::gather(const S1 *array, const EntryType S1::* member1, IT indexes)
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm256_setr_ps(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
            array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1),
            array[indexes[6]].*(member1), array[indexes[7]].*(member1));
template<> template<typename S1, typename IT>
inline void ALWAYS_INLINE FLATTEN Vector<sfloat>::gather(const S1 *array, const EntryType S1::* member1, IT indexes)
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm256_setr_ps(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
            array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1),
            array[indexes[6]].*(member1), array[indexes[7]].*(member1));
template<> template<typename S1, typename IT>
inline void ALWAYS_INLINE FLATTEN Vector<int>::gather(const S1 *array, const EntryType S1::* member1, IT indexes)
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm256_setr_epi32(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
            array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1),
            array[indexes[6]].*(member1), array[indexes[7]].*(member1));
template<> template<typename S1, typename IT>
inline void ALWAYS_INLINE FLATTEN Vector<unsigned int>::gather(const S1 *array, const EntryType S1::* member1, IT indexes)
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm256_setr_epi32(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
            array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1),
            array[indexes[6]].*(member1), array[indexes[7]].*(member1));
template<> template<typename S1, typename IT>
inline void ALWAYS_INLINE FLATTEN Vector<short>::gather(const S1 *array, const EntryType S1::* member1, IT indexes)
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm_setr_epi16(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
            array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1),
            array[indexes[6]].*(member1), array[indexes[7]].*(member1));
template<> template<typename S1, typename IT>
inline void ALWAYS_INLINE FLATTEN Vector<unsigned short>::gather(const S1 *array, const EntryType S1::* member1, IT indexes)
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm_setr_epi16(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
            array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1),
            array[indexes[6]].*(member1), array[indexes[7]].*(member1));
template<typename T> template<typename S1, typename IT>
inline void ALWAYS_INLINE FLATTEN Vector<T>::gather(const S1 *array, const EntryType S1::* member1, IT indexes, MaskArg mask)
    IndexSizeChecker<IT, Size>::check();
#define ith_value(_i_) (array[indexes[_i_]].*(member1))

template<> template<typename S1, typename S2, typename IT>
inline void ALWAYS_INLINE FLATTEN Vector<double>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, IT indexes)
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm256_setr_pd(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2),
            array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2));
template<> template<typename S1, typename S2, typename IT>
inline void ALWAYS_INLINE FLATTEN Vector<float>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, IT indexes)
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm256_setr_ps(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
            array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
            array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
template<> template<typename S1, typename S2, typename IT>
inline void ALWAYS_INLINE FLATTEN Vector<sfloat>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, IT indexes)
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm256_setr_ps(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
            array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
            array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
template<> template<typename S1, typename S2, typename IT>
inline void ALWAYS_INLINE FLATTEN Vector<int>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, IT indexes)
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm256_setr_epi32(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
            array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
            array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
template<> template<typename S1, typename S2, typename IT>
inline void ALWAYS_INLINE FLATTEN Vector<unsigned int>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, IT indexes)
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm256_setr_epi32(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
            array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
            array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
template<> template<typename S1, typename S2, typename IT>
inline void ALWAYS_INLINE FLATTEN Vector<short>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, IT indexes)
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm_setr_epi16(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
            array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
            array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
template<> template<typename S1, typename S2, typename IT>
inline void ALWAYS_INLINE FLATTEN Vector<unsigned short>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, IT indexes)
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm_setr_epi16(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
            array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
            array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
template<typename T> template<typename S1, typename S2, typename IT>
inline void ALWAYS_INLINE FLATTEN Vector<T>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, IT indexes, MaskArg mask)
    IndexSizeChecker<IT, Size>::check();
#define ith_value(_i_) (array[indexes[_i_]].*(member1).*(member2))

template<> template<typename S1, typename IT1, typename IT2>
inline void ALWAYS_INLINE FLATTEN Vector<double>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, IT1 outerIndexes, IT2 innerIndexes)
    IndexSizeChecker<IT1, Size>::check();
    IndexSizeChecker<IT2, Size>::check();
    d.v() = _mm256_setr_pd((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
            (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]);
template<> template<typename S1, typename IT1, typename IT2>
inline void ALWAYS_INLINE FLATTEN Vector<float>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, IT1 outerIndexes, IT2 innerIndexes)
    IndexSizeChecker<IT1, Size>::check();
    IndexSizeChecker<IT2, Size>::check();
    d.v() = _mm256_setr_ps((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
            (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]],
            (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
            (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
template<> template<typename S1, typename IT1, typename IT2>
inline void ALWAYS_INLINE FLATTEN Vector<sfloat>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, IT1 outerIndexes, IT2 innerIndexes)
    IndexSizeChecker<IT1, Size>::check();
    IndexSizeChecker<IT2, Size>::check();
    d.v() = _mm256_setr_ps((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
            (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]],
            (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
            (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
template<> template<typename S1, typename IT1, typename IT2>
inline void ALWAYS_INLINE FLATTEN Vector<int>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, IT1 outerIndexes, IT2 innerIndexes)
    IndexSizeChecker<IT1, Size>::check();
    IndexSizeChecker<IT2, Size>::check();
    d.v() = _mm256_setr_epi32((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
            (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]],
            (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
            (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
template<> template<typename S1, typename IT1, typename IT2>
inline void ALWAYS_INLINE FLATTEN Vector<unsigned int>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, IT1 outerIndexes, IT2 innerIndexes)
    IndexSizeChecker<IT1, Size>::check();
    IndexSizeChecker<IT2, Size>::check();
    d.v() = _mm256_setr_epi32((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
            (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]],
            (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
            (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
template<> template<typename S1, typename IT1, typename IT2>
inline void ALWAYS_INLINE FLATTEN Vector<short>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, IT1 outerIndexes, IT2 innerIndexes)
    IndexSizeChecker<IT1, Size>::check();
    IndexSizeChecker<IT2, Size>::check();
    d.v() = _mm_setr_epi16((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
            (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]],
            (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
            (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
template<> template<typename S1, typename IT1, typename IT2>
inline void ALWAYS_INLINE FLATTEN Vector<unsigned short>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, IT1 outerIndexes, IT2 innerIndexes)
    IndexSizeChecker<IT1, Size>::check();
    IndexSizeChecker<IT2, Size>::check();
    d.v() = _mm_setr_epi16((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
            (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]],
            (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
            (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
template<typename T> template<typename S1, typename IT1, typename IT2>
inline void ALWAYS_INLINE FLATTEN Vector<T>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, IT1 outerIndexes, IT2 innerIndexes, MaskArg mask)
    IndexSizeChecker<IT1, Size>::check();
    IndexSizeChecker<IT2, Size>::check();
#define ith_value(_i_) (array[outerIndexes[_i_]].*(ptrMember1))[innerIndexes[_i_]]

#undef VC_MASKED_GATHER
#ifdef VC_USE_BSF_SCATTERS
#define VC_MASKED_SCATTER \
    int bits = mask.toInt(); \
        const int i = _bit_scan_forward(bits); \
        bits ^= (1 << i); /* btr? */ \
        ith_value(i) = d.m(i); \
#elif defined(VC_USE_POPCNT_BSF_SCATTERS)
#define VC_MASKED_SCATTER \
    unsigned int bits = mask.toInt(); \
    unsigned int low, high = 0; \
    switch (_mm_popcnt_u32(bits)) { \
        high = _bit_scan_reverse(bits); \
        ith_value(high) = d.m(high); \
        high = (1 << high); \
        low = _bit_scan_forward(bits); \
        bits ^= high | (1 << low); \
        ith_value(low) = d.m(low); \
        high = _bit_scan_reverse(bits); \
        ith_value(high) = d.m(high); \
        high = (1 << high); \
        low = _bit_scan_forward(bits); \
        bits ^= high | (1 << low); \
        ith_value(low) = d.m(low); \
        high = _bit_scan_reverse(bits); \
        ith_value(high) = d.m(high); \
        high = (1 << high); \
        low = _bit_scan_forward(bits); \
        bits ^= high | (1 << low); \
        ith_value(low) = d.m(low); \
        high = _bit_scan_reverse(bits); \
        ith_value(high) = d.m(high); \
        low = _bit_scan_forward(bits); \
        ith_value(low) = d.m(low); \
#else
#define VC_MASKED_SCATTER \
    if (mask.isEmpty()) { \
    for_all_vector_entries(i, \
        if (mask[i]) ith_value(i) = d.m(i); \

template<typename T> template<typename Index> inline void ALWAYS_INLINE FLATTEN Vector<T>::scatter(EntryType *mem, Index indexes) const
    for_all_vector_entries(i,
        mem[indexes[i]] = d.m(i);
template<typename T> template<typename Index> inline void ALWAYS_INLINE FLATTEN Vector<T>::scatter(EntryType *mem, Index indexes, MaskArg mask) const
#define ith_value(_i_) mem[indexes[_i_]]
template<typename T> template<typename S1, typename IT> inline void ALWAYS_INLINE FLATTEN Vector<T>::scatter(S1 *array, EntryType S1::* member1, IT indexes) const
    for_all_vector_entries(i,
        array[indexes[i]].*(member1) = d.m(i);
template<typename T> template<typename S1, typename IT> inline void ALWAYS_INLINE FLATTEN Vector<T>::scatter(S1 *array, EntryType S1::* member1, IT indexes, MaskArg mask) const
#define ith_value(_i_) array[indexes[_i_]].*(member1)
template<typename T> template<typename S1, typename S2, typename IT> inline void ALWAYS_INLINE FLATTEN Vector<T>::scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, IT indexes) const
    for_all_vector_entries(i,
        array[indexes[i]].*(member1).*(member2) = d.m(i);
template<typename T> template<typename S1, typename S2, typename IT> inline void ALWAYS_INLINE FLATTEN Vector<T>::scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, IT indexes, MaskArg mask) const
#define ith_value(_i_) array[indexes[_i_]].*(member1).*(member2)
template<typename T> template<typename S1, typename IT1, typename IT2> inline void ALWAYS_INLINE FLATTEN Vector<T>::scatter(S1 *array, EntryType *S1::* ptrMember1, IT1 outerIndexes, IT2 innerIndexes) const
    for_all_vector_entries(i,
        (array[outerIndexes[i]].*(ptrMember1))[innerIndexes[i]] = d.m(i);
template<typename T> template<typename S1, typename IT1, typename IT2> inline void ALWAYS_INLINE FLATTEN Vector<T>::scatter(S1 *array, EntryType *S1::* ptrMember1, IT1 outerIndexes, IT2 innerIndexes, MaskArg mask) const
#define ith_value(_i_) (array[outerIndexes[_i_]].*(ptrMember1))[innerIndexes[_i_]]

///////////////////////////////////////////////////////////////////////////////////////////
template<> inline Vector<double> PURE ALWAYS_INLINE FLATTEN Vector<double>::operator-() const
    return _mm256_xor_pd(d.v(), _mm256_setsignmask_pd());
template<> inline Vector<float> PURE ALWAYS_INLINE FLATTEN Vector<float>::operator-() const
    return _mm256_xor_ps(d.v(), _mm256_setsignmask_ps());
template<> inline Vector<sfloat> PURE ALWAYS_INLINE FLATTEN Vector<sfloat>::operator-() const
    return _mm256_xor_ps(d.v(), _mm256_setsignmask_ps());
template<> inline Vector<int> PURE ALWAYS_INLINE FLATTEN Vector<int>::operator-() const
    return _mm256_sign_epi32(d.v(), _mm256_setallone_si256());
template<> inline Vector<int> PURE ALWAYS_INLINE FLATTEN Vector<unsigned int>::operator-() const
    return _mm256_sign_epi32(d.v(), _mm256_setallone_si256());
template<> inline Vector<short> PURE ALWAYS_INLINE FLATTEN Vector<short>::operator-() const
    return _mm_sign_epi16(d.v(), _mm_setallone_si128());
template<> inline Vector<short> PURE ALWAYS_INLINE FLATTEN Vector<unsigned short>::operator-() const
    return _mm_sign_epi16(d.v(), _mm_setallone_si128());

///////////////////////////////////////////////////////////////////////////////////////////
// horizontal ops {{{1
template<typename T> inline typename Vector<T>::EntryType Vector<T>::min(MaskArg m) const
    Vector<T> tmp = std::numeric_limits<Vector<T> >::max();
template<typename T> inline typename Vector<T>::EntryType Vector<T>::max(MaskArg m) const
    Vector<T> tmp = std::numeric_limits<Vector<T> >::min();
template<typename T> inline typename Vector<T>::EntryType Vector<T>::product(MaskArg m) const
    Vector<T> tmp(VectorSpecialInitializerOne::One);
    return tmp.product();
template<typename T> inline typename Vector<T>::EntryType Vector<T>::sum(MaskArg m) const
    Vector<T> tmp(VectorSpecialInitializerZero::Zero);

template<> inline Vector<float> INTRINSIC Vector<float>::copySign(Vector<float>::AsArg reference) const
    return _mm256_or_ps(
        _mm256_and_ps(reference.d.v(), _mm256_setsignmask_ps()),
        _mm256_and_ps(d.v(), _mm256_setabsmask_ps())
template<> inline Vector<sfloat> INTRINSIC Vector<sfloat>::copySign(Vector<sfloat>::AsArg reference) const
    return _mm256_or_ps(
        _mm256_and_ps(reference.d.v(), _mm256_setsignmask_ps()),
        _mm256_and_ps(d.v(), _mm256_setabsmask_ps())
template<> inline Vector<double> INTRINSIC Vector<double>::copySign(Vector<double>::AsArg reference) const
    return _mm256_or_pd(
        _mm256_and_pd(reference.d.v(), _mm256_setsignmask_pd()),
        _mm256_and_pd(d.v(), _mm256_setabsmask_pd())
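// copySign combines the sign bit of 'reference' with the magnitude bits of *this. A scalar sketch
// of the same bit manipulation for one float lane (illustration only; bit_cast stands in for a
// hypothetical reinterpretation of the float as its 32-bit pattern):
//   uint32_t r = (bit_cast<uint32_t>(reference) & 0x80000000u) | (bit_cast<uint32_t>(x) & 0x7fffffffu);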
template<> inline Vector<float> INTRINSIC Vector<float>::exponent() const
    VC_ASSERT((*this > 0.f).isFull());
    __m128i tmp0 = _mm_srli_epi32(avx_cast<__m128i>(d.v()), 23);
    __m128i tmp1 = _mm_srli_epi32(avx_cast<__m128i>(hi128(d.v())), 23);
    tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x7f));
    tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x7f));
    return _mm256_cvtepi32_ps(concat(tmp0, tmp1));
template<> inline Vector<sfloat> INTRINSIC Vector<sfloat>::exponent() const
    VC_ASSERT((*this > 0.f).isFull());
    __m128i tmp0 = _mm_srli_epi32(avx_cast<__m128i>(d.v()), 23);
    __m128i tmp1 = _mm_srli_epi32(avx_cast<__m128i>(hi128(d.v())), 23);
    tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x7f));
    tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x7f));
    return _mm256_cvtepi32_ps(concat(tmp0, tmp1));
template<> inline Vector<double> INTRINSIC Vector<double>::exponent() const
    VC_ASSERT((*this > 0.).isFull());
    __m128i tmp0 = _mm_srli_epi64(avx_cast<__m128i>(d.v()), 52);
    __m128i tmp1 = _mm_srli_epi64(avx_cast<__m128i>(hi128(d.v())), 52);
    tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x3ff));
    tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x3ff));
    return _mm256_cvtepi32_pd(avx_cast<__m128i>(Mem::shuffle<X0, X2, Y0, Y2>(avx_cast<__m128>(tmp0), avx_cast<__m128>(tmp1))));
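// The exponent() implementations above extract the biased exponent field and remove the bias
// (127 for float, 1023 for double). A scalar sketch for one float lane (illustration only;
// bit_cast is hypothetical, and the sign bit is known to be zero because of the VC_ASSERT):
//   int e = int(bit_cast<uint32_t>(x) >> 23) - 0x7f;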
static inline ALWAYS_INLINE void _doRandomStep(Vector<unsigned int> &state0,
        Vector<unsigned int> &state1)
    state0.load(&Vc::RandomState[0]);
    state1.load(&Vc::RandomState[uint_v::Size]);
    (state1 * 0xdeece66du + 11).store(&Vc::RandomState[uint_v::Size]);
    uint_v(_mm256_xor_si256((state0 * 0xdeece66du + 11).data(), _mm256_srli_epi32(state1.data(), 16))).store(&Vc::RandomState[0]);
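// _doRandomStep advances both halves of RandomState with the same per-lane linear congruential
// step (x = x * 0xdeece66d + 11) and, before storing the first half back, mixes the upper 16 bits
// of each state1 lane into the updated state0 via the xor.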
template<typename T> inline ALWAYS_INLINE Vector<T> Vector<T>::Random()
    Vector<unsigned int> state0, state1;
    _doRandomStep(state0, state1);
    return state0.reinterpretCast<Vector<T> >();

template<> inline ALWAYS_INLINE Vector<float> Vector<float>::Random()
    Vector<unsigned int> state0, state1;
    _doRandomStep(state0, state1);
    return HT::sub(HV::or_(_cast(_mm256_srli_epi32(state0.data(), 2)), HT::one()), HT::one());

template<> inline ALWAYS_INLINE Vector<sfloat> Vector<sfloat>::Random()
    Vector<unsigned int> state0, state1;
    _doRandomStep(state0, state1);
    return HT::sub(HV::or_(_cast(_mm256_srli_epi32(state0.data(), 2)), HT::one()), HT::one());
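// Why the srli/or/sub sequence yields uniform values in [0, 1) (sketch): after ">> 2" the sign bit
// and the topmost exponent bit of every lane are zero, so OR-ing with 1.0f (0x3f800000) pins the
// exponent at 127 while the 23 mantissa bits come from the random state, i.e. a float in [1, 2);
// subtracting 1.0f then maps that to [0, 1). The double version below uses the same trick with a
// 52-bit mantissa and a shift by 12.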
template<> inline ALWAYS_INLINE Vector<double> Vector<double>::Random()
    const __m256i state = VectorHelper<__m256i>::load(&Vc::RandomState[0], Vc::Aligned);
    for (size_t k = 0; k < 8; k += 2) {
        typedef unsigned long long uint64 MAY_ALIAS;
        const uint64 stateX = *reinterpret_cast<const uint64 *>(&Vc::RandomState[k]);
        *reinterpret_cast<uint64 *>(&Vc::RandomState[k]) = (stateX * 0x5deece66dull + 11);
    return (Vector<double>(_cast(_mm256_srli_epi64(state, 12))) | One()) - One();

#include "undomacros.h"

// vim: foldmethod=marker