1 /* This file is part of the Vc library.
3 Copyright (C) 2010-2012 Matthias Kretz <kretz@kde.org>
5 Vc is free software: you can redistribute it and/or modify
6 it under the terms of the GNU Lesser General Public License as
7 published by the Free Software Foundation, either version 3 of
8 the License, or (at your option) any later version.
10 Vc is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with Vc. If not, see <http://www.gnu.org/licenses/>.
21 #include "../common/bitscanintrinsics.h"
// State array of the vector random number generator; defined in a .cpp file.
// 64-byte alignment permits aligned full-cacheline/vector access to the state.
26 ALIGN(64) extern unsigned int RandomState[16];
// Returns a pointer to a static 0,1,2,... index table matching the vector
// width Size, reinterpreted as entries of type T.
// NOTE(review): the opening "if (Size == 4) {" branch and the trailing
// closing braces are not visible in this listing (lines elided by the
// extraction) -- confirm against the original file before editing.
31 template<typename T, int Size> static inline const T *_IndexesFromZero() {
33 return reinterpret_cast<const T *>(_IndexesFromZero4);
34 } else if (Size == 8) {
35 return reinterpret_cast<const T *>(_IndexesFromZero8);
36 } else if (Size == 16) {
37 return reinterpret_cast<const T *>(_IndexesFromZero16);
42 ///////////////////////////////////////////////////////////////////////////////////////////
// Constructors taking the special-initializer tag enums, plus the matching
// static factory functions (Zero/One/IndexesFromZero).
// All-zero vector.
44 template<typename T> inline Vector<T>::Vector(VectorSpecialInitializerZero::ZEnum)
45 : d(VectorHelper<VectorType>::zero())
// Every entry == 1.  Note: one() is looked up via VectorHelper<T> (entry
// type), whereas zero() above goes through VectorHelper<VectorType>.
49 template<typename T> inline Vector<T>::Vector(VectorSpecialInitializerOne::OEnum)
50 : d(VectorHelper<T>::one())
// Entries 0, 1, 2, ... loaded from the static table returned by
// _IndexesFromZero<EntryType, Size>().
54 template<typename T> inline Vector<T>::Vector(VectorSpecialInitializerIndexesFromZero::IEnum)
55 : d(VectorHelper<VectorType>::load(_IndexesFromZero<EntryType, Size>(), Aligned))
// Static factories mirroring the three tag constructors above.
59 template<typename T> inline Vector<T> Vector<T>::Zero()
61 return VectorHelper<VectorType>::zero();
64 template<typename T> inline Vector<T> Vector<T>::One()
66 return VectorHelper<T>::one();
69 template<typename T> inline Vector<T> Vector<T>::IndexesFromZero()
71 return VectorHelper<VectorType>::load(_IndexesFromZero<EntryType, Size>(), Aligned);
74 // conversion/casts {{{1
// Converting constructor: value-converts each entry of a Vector<OtherT>
// via StaticCastHelper.
75 template<typename T> template<typename OtherT> inline INTRINSIC Vector<T>::Vector(const Vector<OtherT> &x)
76 : d(StaticCastHelper<OtherT, T>::cast(x.data()))
// Cross-signedness assignment specializations (short <-> ushort,
// int <-> uint): reinterpret/convert the payload in place and return *this.
// NOTE(review): the closing "}" of each operator is elided in this listing.
80 template<> template<> inline INTRINSIC short_v &Vector<short>::operator=(const ushort_v &x) {
81 data() = StaticCastHelper<unsigned short, short>::cast(x.data()); return *this;
83 template<> template<> inline INTRINSIC ushort_v &Vector<unsigned short>::operator=(const short_v &x) {
84 data() = StaticCastHelper<short, unsigned short>::cast(x.data()); return *this;
86 template<> template<> inline INTRINSIC int_v &Vector<int>::operator=(const uint_v &x) {
87 data() = StaticCastHelper<unsigned int, int>::cast(x.data()); return *this;
89 template<> template<> inline INTRINSIC uint_v &Vector<unsigned int>::operator=(const int_v &x) {
90 data() = StaticCastHelper<int, unsigned int>::cast(x.data()); return *this;
// Broadcast constructor: replicates the scalar a into every vector entry.
94 template<typename T> inline Vector<T>::Vector(EntryType a)
95 : d(VectorHelper<T>::set(a))
99 ///////////////////////////////////////////////////////////////////////////////////////////
// Constructors from memory: all four forward to the corresponding load()
// member.  The (x) overloads use the default alignment policy; the (x, a)
// overloads take an explicit alignment/streaming flag.  The OtherT variants
// additionally perform an entry-type conversion during the load.
101 template<typename T> inline ALWAYS_INLINE Vector<T>::Vector(const EntryType *x) { load(x); }
102 template<typename T> template<typename A> inline ALWAYS_INLINE Vector<T>::Vector(const EntryType *x, A a) { load(x, a); }
103 template<typename T> template<typename OtherT> inline ALWAYS_INLINE Vector<T>::Vector(const OtherT *x) { load(x); }
104 template<typename T> template<typename OtherT, typename A> inline ALWAYS_INLINE Vector<T>::Vector(const OtherT *x, A a) { load(x, a); }
106 ///////////////////////////////////////////////////////////////////////////////////////////
107 // load member functions {{{1
// Load with the default alignment policy.  NOTE(review): the body of this
// overload is elided in this listing (presumably it forwards to the
// aligned/flagged load below) -- confirm against the original file.
108 template<typename T> inline void INTRINSIC Vector<T>::load(const EntryType *mem)
// Load with an explicit alignment/streaming flag A.
113 template<typename T> template<typename A> inline void INTRINSIC Vector<T>::load(const EntryType *mem, A align)
115 d.v() = VectorHelper<VectorType>::load(mem, align);
// Converting load with default alignment; body elided in this listing.
118 template<typename T> template<typename OtherT> inline void INTRINSIC Vector<T>::load(const OtherT *mem)
123 // float8: simply use the float implementation twice {{{2
// float8 (M256, two __m128 halves): convert-load entries 0..3 and 4..7
// through two Vector<float> loads with the same flag.
124 template<> template<typename OtherT, typename A> inline void INTRINSIC Vector<float8>::load(const OtherT *x, A a)
126 d.v() = M256::create(
127 Vector<float>(&x[0], a).data(),
128 Vector<float>(&x[4], a).data()
// LoadHelper<DstT, SrcT, Flags>: converting loads.  Each specialization
// loads SrcT entries from memory and widens/converts them to the register
// type backing Vector<DstT>.  Primary template is declared only; an
// unsupported (DstT, SrcT) pair fails at compile time.
133 template<typename DstT, typename SrcT, typename Flags> struct LoadHelper;
// float <- double: two 2-wide double loads, each converted to 2 floats,
// merged into one __m128 via movelh.
136 template<typename Flags> struct LoadHelper<float, double, Flags> {
137 static inline __m128 load(const double *mem, Flags f)
139 return _mm_movelh_ps(_mm_cvtpd_ps(VectorHelper<__m128d>::load(&mem[0], f)),
140 _mm_cvtpd_ps(VectorHelper<__m128d>::load(&mem[2], f)));
// float <- (unsigned) int: full-width integer load, then value conversion.
143 template<typename Flags> struct LoadHelper<float, unsigned int, Flags> {
144 static inline __m128 load(const unsigned int *mem, Flags f)
146 return StaticCastHelper<unsigned int, float>::cast(VectorHelper<__m128i>::load(mem, f));
149 template<typename Flags> struct LoadHelper<float, int, Flags> {
150 static inline __m128 load(const int *mem, Flags f)
152 return StaticCastHelper<int, float>::cast(VectorHelper<__m128i>::load(mem, f));
// float <- small integers: reuse the int LoadHelper, then cvtepi32_ps.
155 template<typename Flags> struct LoadHelper<float, unsigned short, Flags> {
156 static inline __m128 load(const unsigned short *mem, Flags f)
158 return _mm_cvtepi32_ps(LoadHelper<int, unsigned short, Flags>::load(mem, f));
161 template<typename Flags> struct LoadHelper<float, short, Flags> {
162 static inline __m128 load(const short *mem, Flags f)
164 return _mm_cvtepi32_ps(LoadHelper<int, short, Flags>::load(mem, f));
167 template<typename Flags> struct LoadHelper<float, unsigned char, Flags> {
168 static inline __m128 load(const unsigned char *mem, Flags f)
170 return _mm_cvtepi32_ps(LoadHelper<int, unsigned char, Flags>::load(mem, f));
173 template<typename Flags> struct LoadHelper<float, signed char, Flags> {
174 static inline __m128 load(const signed char *mem, Flags f)
176 return _mm_cvtepi32_ps(LoadHelper<int, signed char, Flags>::load(mem, f));
// int <- unsigned int: bit pattern is identical, plain load suffices.
181 template<typename Flags> struct LoadHelper<int, unsigned int, Flags> {
182 static inline __m128i load(const unsigned int *mem, Flags f)
184 return VectorHelper<__m128i>::load(mem, f);
187 // no difference between streaming and alignment, because the
188 // 32/64 bit loads are not available as streaming loads, and can always be unaligned
// int <- 16-bit: 64-bit load of 4 entries, zero-/sign-extend (SSE4.1).
189 template<typename Flags> struct LoadHelper<int, unsigned short, Flags> {
190 static inline __m128i load(const unsigned short *mem, Flags)
192 return _mm_cvtepu16_epi32( _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
195 template<typename Flags> struct LoadHelper<int, short, Flags> {
196 static inline __m128i load(const short *mem, Flags)
198 return _mm_cvtepi16_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
// int <- 8-bit: a single 32-bit scalar load grabs exactly the 4 bytes
// needed, then zero-/sign-extend to 4 ints.  NOTE(review): the
// reinterpret_cast<const int *> dereference relies on the target
// tolerating unaligned/aliased access (fine on x86 in practice).
201 template<typename Flags> struct LoadHelper<int, unsigned char, Flags> {
202 static inline __m128i load(const unsigned char *mem, Flags)
204 return _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*reinterpret_cast<const int *>(mem)));
207 template<typename Flags> struct LoadHelper<int, signed char, Flags> {
208 static inline __m128i load(const signed char *mem, Flags)
210 return _mm_cvtepi8_epi32(_mm_cvtsi32_si128(*reinterpret_cast<const int *>(mem)));
// unsigned int <- narrower unsigned: same zero-extension paths as above.
215 template<typename Flags> struct LoadHelper<unsigned int, unsigned short, Flags> {
216 static inline __m128i load(const unsigned short *mem, Flags)
218 return _mm_cvtepu16_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)))
221 template<typename Flags> struct LoadHelper<unsigned int, unsigned char, Flags> {
222 static inline __m128i load(const unsigned char *mem, Flags)
224 return _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*reinterpret_cast<const int *>(mem)));
// short <- unsigned short: identical bit pattern, plain load.
229 template<typename Flags> struct LoadHelper<short, unsigned short, Flags> {
230 static inline __m128i load(const unsigned short *mem, Flags f)
232 return VectorHelper<__m128i>::load(mem, f);
// short <- 8-bit: 64-bit load of 8 entries, extend to 8 shorts.
235 template<typename Flags> struct LoadHelper<short, unsigned char, Flags> {
236 static inline __m128i load(const unsigned char *mem, Flags)
238 return _mm_cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
241 template<typename Flags> struct LoadHelper<short, signed char, Flags> {
242 static inline __m128i load(const signed char *mem, Flags)
244 return _mm_cvtepi8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
248 // unsigned short {{{2
249 template<typename Flags> struct LoadHelper<unsigned short, unsigned char, Flags> {
250 static inline __m128i load(const unsigned char *mem, Flags)
252 return _mm_cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
256 // general load, implemented via LoadHelper {{{2
// The converting-load member simply dispatches to the matching
// LoadHelper specialization.
257 template<typename DstT> template<typename SrcT, typename Flags> inline void INTRINSIC Vector<DstT>::load(const SrcT *x, Flags f)
259 d.v() = LoadHelper<DstT, SrcT, Flags>::load(x, f);
262 ///////////////////////////////////////////////////////////////////////////////////////////
263 // expand/combine {{{1
// Combine: build one vector from two half-width vectors (a[0] = low part,
// a[1] = high part) via VectorHelper<T>::concat.
264 template<typename T> inline Vector<T>::Vector(const Vector<typename CtorTypeHelper<T>::Type> *a)
265 : d(VectorHelper<T>::concat(a[0].data(), a[1].data()))
// Expand: split this vector into two wider-entry vectors, written to
// x[0] (low entries) and x[1] (high entries).
269 template<typename T> inline void Vector<T>::expand(Vector<typename ExpandTypeHelper<T>::Type> *x) const
272 x[0].data() = VectorHelper<T>::expand0(data());
273 x[1].data() = VectorHelper<T>::expand1(data());
277 ///////////////////////////////////////////////////////////////////////////////////////////
// Zero the whole vector.
279 template<typename T> inline void Vector<T>::setZero()
281 data() = VectorHelper<VectorType>::zero();
// Zero only the entries selected by mask k: andnot(mask, data) keeps the
// entries where k is false and clears those where k is true.
284 template<typename T> inline void Vector<T>::setZero(const Mask &k)
286 data() = VectorHelper<VectorType>::andnot_(mm128_reinterpret_cast<VectorType>(k.data()), data());
// setQnan: set entries to an all-ones bit pattern, which for IEEE float /
// double is a (negative) quiet NaN.  The masked overloads OR the mask bits
// into the existing data, so only masked entries become all-ones.
289 template<> inline void INTRINSIC Vector<double>::setQnan()
291 data() = _mm_setallone_pd();
293 template<> inline void INTRINSIC Vector<double>::setQnan(Mask::Argument k)
295 data() = _mm_or_pd(data(), k.dataD());
297 template<> inline void INTRINSIC Vector<float>::setQnan()
299 data() = _mm_setallone_ps();
301 template<> inline void INTRINSIC Vector<float>::setQnan(Mask::Argument k)
303 data() = _mm_or_ps(data(), k.data());
// float8 operates on both __m128 halves independently.
305 template<> inline void INTRINSIC Vector<float8>::setQnan()
307 d.v()[0] = _mm_setallone_ps();
308 d.v()[1] = _mm_setallone_ps();
310 template<> inline void INTRINSIC Vector<float8>::setQnan(Mask::Argument k)
312 d.v()[0] = _mm_or_ps(d.v()[0], k.data()[0]);
313 d.v()[1] = _mm_or_ps(d.v()[1], k.data()[1]);
316 ///////////////////////////////////////////////////////////////////////////////////////////
// Stores.  The flag-less overloads default to Aligned; the masked overloads
// write only the entries selected by mask.
318 template<typename T> inline void Vector<T>::store(EntryType *mem) const
320 VectorHelper<VectorType>::store(mem, data(), Aligned);
323 template<typename T> inline void Vector<T>::store(EntryType *mem, const Mask &mask) const
325 VectorHelper<VectorType>::store(mem, data(), mm128_reinterpret_cast<VectorType>(mask.data()), Aligned);
328 template<typename T> template<typename A> inline void Vector<T>::store(EntryType *mem, A align) const
330 VectorHelper<VectorType>::store(mem, data(), align);
// NOTE(review): this overload goes through HV::store while the ones above
// use VectorHelper<VectorType>::store -- presumably aliases; confirm.
333 template<typename T> template<typename A> inline void Vector<T>::store(EntryType *mem, const Mask &mask, A align) const
335 HV::store(mem, data(), mm128_reinterpret_cast<VectorType>(mask.data()), align);
338 ///////////////////////////////////////////////////////////////////////////////////////////
// Masked division: generic case computes the full quotient and writes it
// back through the write-mask (entries outside the mask keep their value).
340 template<typename T> inline INTRINSIC CONST Vector<T> &WriteMaskedVector<T>::operator/=(const Vector<T> &x)
342 return operator=(*vec / x);
// Integer specializations avoid dividing unmasked lanes (integer division
// by zero in an unmasked lane would be UB): only the set mask bits are
// divided, scalar-wise via Vc_foreach_bit.
344 template<> inline INTRINSIC CONST int_v &WriteMaskedVector<int>::operator/=(const int_v &x)
346 Vc_foreach_bit (int i, mask) {
347 vec->d.m(i) /= x.d.m(i);
351 template<> inline INTRINSIC CONST uint_v &WriteMaskedVector<unsigned int>::operator/=(const uint_v &x)
353 Vc_foreach_bit (int i, mask) {
354 vec->d.m(i) /= x.d.m(i);
358 template<> inline INTRINSIC CONST short_v &WriteMaskedVector<short>::operator/=(const short_v &x)
360 Vc_foreach_bit (int i, mask) {
361 vec->d.m(i) /= x.d.m(i);
365 template<> inline INTRINSIC CONST ushort_v &WriteMaskedVector<unsigned short>::operator/=(const ushort_v &x)
367 Vc_foreach_bit (int i, mask) {
368 vec->d.m(i) /= x.d.m(i);
// Divide every entry by the scalar x.  Types with hardware vector division
// broadcast x and use the vector path; otherwise divide entry-wise.
373 template<typename T> inline Vector<T> &Vector<T>::operator/=(EntryType x)
375 if (VectorTraits<T>::HasVectorDivision) {
376 return operator/=(Vector<T>(x));
378 for_all_vector_entries(i,
// Same split for the non-assigning scalar division.  VC_EXACT_TYPE
// restricts TT to exactly the entry type (no implicit conversions).
384 template<typename T> template<typename TT> inline PURE INTRINSIC VC_EXACT_TYPE(TT, typename DetermineEntryType<T>::Type, Vector<T>) Vector<T>::operator/(TT x) const
386 if (VectorTraits<T>::HasVectorDivision) {
387 return operator/(Vector<T>(x));
390 for_all_vector_entries(i,
391 r.d.m(i) = d.m(i) / x;
// Generic vector/vector division: entry-wise scalar fallback.
396 template<typename T> inline Vector<T> &Vector<T>::operator/=(const Vector<T> &x)
398 for_all_vector_entries(i,
404 template<typename T> inline Vector<T> Vector<T>::operator/(const Vector<T> &x) const
407 for_all_vector_entries(i,
408 r.d.m(i) = d.m(i) / x.d.m(i);
// short division: widen both operands to 2x4 ints, convert to float,
// divide with _mm_div_ps, convert back and re-pack with signed saturation.
// float has 24 mantissa bits, enough to represent any 16-bit quotient
// exactly.
413 template<> inline Vector<short> &Vector<short>::operator/=(const Vector<short> &x)
415 __m128 lo = _mm_cvtepi32_ps(VectorHelper<short>::expand0(d.v()));
416 __m128 hi = _mm_cvtepi32_ps(VectorHelper<short>::expand1(d.v()));
417 lo = _mm_div_ps(lo, _mm_cvtepi32_ps(VectorHelper<short>::expand0(x.d.v())));
418 hi = _mm_div_ps(hi, _mm_cvtepi32_ps(VectorHelper<short>::expand1(x.d.v())));
419 d.v() = _mm_packs_epi32(_mm_cvtps_epi32(lo), _mm_cvtps_epi32(hi));
423 template<> inline Vector<short> ALWAYS_INLINE Vector<short>::operator/(const Vector<short> &x) const
425 __m128 lo = _mm_cvtepi32_ps(VectorHelper<short>::expand0(d.v()));
426 __m128 hi = _mm_cvtepi32_ps(VectorHelper<short>::expand1(d.v()));
427 lo = _mm_div_ps(lo, _mm_cvtepi32_ps(VectorHelper<short>::expand0(x.d.v())));
428 hi = _mm_div_ps(hi, _mm_cvtepi32_ps(VectorHelper<short>::expand1(x.d.v())));
429 return _mm_packs_epi32(_mm_cvtps_epi32(lo), _mm_cvtps_epi32(hi));
// NOTE(review): the unsigned short paths below reuse the *signed*
// VectorHelper<short>::expand and the signed-saturating _mm_packs_epi32,
// so operands/results >= 0x8000 would be mishandled.  This may be a known,
// accepted limitation in the original -- verify before relying on
// unsigned-short division with large values.
432 template<> inline Vector<unsigned short> &Vector<unsigned short>::operator/=(const Vector<unsigned short> &x)
434 __m128 lo = _mm_cvtepi32_ps(VectorHelper<short>::expand0(d.v()));
435 __m128 hi = _mm_cvtepi32_ps(VectorHelper<short>::expand1(d.v()));
436 lo = _mm_div_ps(lo, _mm_cvtepi32_ps(VectorHelper<short>::expand0(x.d.v())));
437 hi = _mm_div_ps(hi, _mm_cvtepi32_ps(VectorHelper<short>::expand1(x.d.v())));
438 d.v() = _mm_packs_epi32(_mm_cvtps_epi32(lo), _mm_cvtps_epi32(hi));
442 template<> inline Vector<unsigned short> ALWAYS_INLINE Vector<unsigned short>::operator/(const Vector<unsigned short> &x) const
444 __m128 lo = _mm_cvtepi32_ps(VectorHelper<short>::expand0(d.v()));
445 __m128 hi = _mm_cvtepi32_ps(VectorHelper<short>::expand1(d.v()));
446 lo = _mm_div_ps(lo, _mm_cvtepi32_ps(VectorHelper<short>::expand0(x.d.v())));
447 hi = _mm_div_ps(hi, _mm_cvtepi32_ps(VectorHelper<short>::expand1(x.d.v())));
448 return _mm_packs_epi32(_mm_cvtps_epi32(lo), _mm_cvtps_epi32(hi));
// float/double/float8 use the hardware division instructions directly.
451 template<> inline Vector<float> &Vector<float>::operator/=(const Vector<float> &x)
453 d.v() = _mm_div_ps(d.v(), x.d.v());
457 template<> inline Vector<float> Vector<float>::operator/(const Vector<float> &x) const
459 return _mm_div_ps(d.v(), x.d.v());
462 template<> inline Vector<float8> &Vector<float8>::operator/=(const Vector<float8> &x)
464 d.v()[0] = _mm_div_ps(d.v()[0], x.d.v()[0]);
465 d.v()[1] = _mm_div_ps(d.v()[1], x.d.v()[1]);
469 template<> inline Vector<float8> Vector<float8>::operator/(const Vector<float8> &x) const
472 r.d.v()[0] = _mm_div_ps(d.v()[0], x.d.v()[0]);
473 r.d.v()[1] = _mm_div_ps(d.v()[1], x.d.v()[1]);
477 template<> inline Vector<double> &Vector<double>::operator/=(const Vector<double> &x)
479 d.v() = _mm_div_pd(d.v(), x.d.v());
483 template<> inline Vector<double> Vector<double>::operator/(const Vector<double> &x) const
485 return _mm_div_pd(d.v(), x.d.v());
488 ///////////////////////////////////////////////////////////////////////////////////////////
// Unary minus.  Floating-point types flip only the IEEE sign bit via XOR
// with a sign-bit mask (cheap, and correctly negates 0.0/-0.0 and NaN).
490 template<> inline Vector<double> PURE ALWAYS_INLINE FLATTEN Vector<double>::operator-() const
492 return _mm_xor_pd(d.v(), _mm_setsignmask_pd())
494 template<> inline Vector<float> PURE ALWAYS_INLINE FLATTEN Vector<float>::operator-() const
496 return _mm_xor_ps(d.v(), _mm_setsignmask_ps());
498 template<> inline Vector<float8> PURE ALWAYS_INLINE FLATTEN Vector<float8>::operator-() const
501 _mm_xor_ps(d.v()[0], _mm_setsignmask_ps()),
502 _mm_xor_ps(d.v()[1], _mm_setsignmask_ps()));
// Integer types: either SSSE3 _mm_sign_* (multiply sign by -1), or the
// two's-complement identity -x == (x XOR ~0) + 1.  NOTE(review): the
// #ifdef/#else lines selecting between the two paths are elided in this
// listing.  Unsigned vectors negate to the corresponding signed type.
504 template<> inline Vector<int> PURE ALWAYS_INLINE FLATTEN Vector<int>::operator-() const
507 return _mm_sign_epi32(d.v(), _mm_setallone_si128());
509 return _mm_add_epi32(_mm_xor_si128(d.v(), _mm_setallone_si128()), _mm_setone_epi32());
512 template<> inline Vector<int> PURE ALWAYS_INLINE FLATTEN Vector<unsigned int>::operator-() const
515 return _mm_sign_epi32(d.v(), _mm_setallone_si128());
517 return _mm_add_epi32(_mm_xor_si128(d.v(), _mm_setallone_si128()), _mm_setone_epi32());
// 16-bit fallback: multiply by -1 (all-ones is -1 in every epi16 lane).
520 template<> inline Vector<short> PURE ALWAYS_INLINE FLATTEN Vector<short>::operator-() const
523 return _mm_sign_epi16(d.v(), _mm_setallone_si128());
525 return _mm_mullo_epi16(d.v(), _mm_setallone_si128());
528 template<> inline Vector<short> PURE ALWAYS_INLINE FLATTEN Vector<unsigned short>::operator-() const
531 return _mm_sign_epi16(d.v(), _mm_setallone_si128());
533 return _mm_mullo_epi16(d.v(), _mm_setallone_si128());
537 ///////////////////////////////////////////////////////////////////////////////////////////
// Bitwise operators (&, |, ^): one macro stamps out both the compound
// assignment and the non-assigning form, forwarding to the matching
// VectorHelper function.  Also instantiated for the floating-point types,
// which operate on the raw bit patterns.
539 #define OP_IMPL(T, symbol, fun) \
540 template<> inline Vector<T> &Vector<T>::operator symbol##=(const Vector<T> &x) \
542 d.v() = VectorHelper<T>::fun(d.v(), x.d.v()); \
545 template<> inline Vector<T> Vector<T>::operator symbol(const Vector<T> &x) const \
547 return VectorHelper<T>::fun(d.v(), x.d.v()); \
// NOTE(review): OP_IMPL(int, |, or_) is not visible between the two lines
// below -- presumably elided by the extraction; confirm.
549 OP_IMPL(int, &, and_)
551 OP_IMPL(int, ^, xor_)
552 OP_IMPL(unsigned int, &, and_)
553 OP_IMPL(unsigned int, |, or_)
554 OP_IMPL(unsigned int, ^, xor_)
555 OP_IMPL(short, &, and_)
556 OP_IMPL(short, |, or_)
557 OP_IMPL(short, ^, xor_)
558 OP_IMPL(unsigned short, &, and_)
559 OP_IMPL(unsigned short, |, or_)
560 OP_IMPL(unsigned short, ^, xor_)
561 OP_IMPL(float, &, and_)
562 OP_IMPL(float, |, or_)
563 OP_IMPL(float, ^, xor_)
564 OP_IMPL(float8, &, and_)
565 OP_IMPL(float8, |, or_)
566 OP_IMPL(float8, ^, xor_)
567 OP_IMPL(double, &, and_)
568 OP_IMPL(double, |, or_)
569 OP_IMPL(double, ^, xor_)
// Per-entry variable shifts.  _mm_sha_* / _mm_shl_* are AMD XOP intrinsics
// (see the VC_IMPL_XOP guard further down): sha = arithmetic shift,
// shl = logical shift; both shift left for positive counts and right for
// negative counts.  Hence shiftRight is implemented as shiftLeft(-count).
573 static inline INTRINSIC CONST __m128i shiftLeft (const int_v &value, const int_v &count) { return _mm_sha_epi32(value.data(), count.data()); }
574 static inline INTRINSIC CONST __m128i shiftLeft (const uint_v &value, const uint_v &count) { return _mm_shl_epi32(value.data(), count.data()); }
575 static inline INTRINSIC CONST __m128i shiftLeft (const short_v &value, const short_v &count) { return _mm_sha_epi16(value.data(), count.data()); }
576 static inline INTRINSIC CONST __m128i shiftLeft (const ushort_v &value, const ushort_v &count) { return _mm_shl_epi16(value.data(), count.data()); }
// Unsigned counts are negated through the signed type (unary minus on the
// unsigned vectors returns the signed vector, see operator- above), then
// converted back so the right shiftLeft overload is selected.
577 static inline INTRINSIC CONST __m128i shiftRight(const int_v &value, const int_v &count) { return shiftLeft(value, -count ); }
578 static inline INTRINSIC CONST __m128i shiftRight(const uint_v &value, const uint_v &count) { return shiftLeft(value, uint_v(-count)); }
579 static inline INTRINSIC CONST __m128i shiftRight(const short_v &value, const short_v &count) { return shiftLeft(value, -count ); }
580 static inline INTRINSIC CONST __m128i shiftRight(const ushort_v &value, const ushort_v &count) { return shiftLeft(value, ushort_v(-count)); }
// Stamp out operator<<=, <<, >>=, >> for all integer vector types,
// forwarding to shiftLeft/shiftRight above.
582 #define _VC_OP(T, symbol, impl) \
583 template<> inline INTRINSIC T &T::operator symbol##=(T::AsArg shift) \
585 d.v() = impl(*this, shift); \
588 template<> inline INTRINSIC T T::operator symbol (T::AsArg shift) const \
590 return impl(*this, shift); \
592 VC_APPLY_2(VC_LIST_INT_VECTOR_TYPES, _VC_OP, <<, shiftLeft)
593 VC_APPLY_2(VC_LIST_INT_VECTOR_TYPES, _VC_OP, >>, shiftRight)
// Scalar fallback for per-entry shifts.  On exactly GCC 4.6.0 with XOP
// enabled the loop must not be auto-vectorized (and is emitted weak),
// otherwise plain inline/INTRINSIC definitions are used.
596 #if defined(VC_GCC) && VC_GCC == 0x40600 && VC_IMPL_XOP
597 #define VC_WORKAROUND_IN
598 #define VC_WORKAROUND __attribute__((optimize("no-tree-vectorize"),weak))
600 #define VC_WORKAROUND_IN inline
601 #define VC_WORKAROUND INTRINSIC
// Entry-wise shift implementation used where no suitable instruction
// exists: compound form mutates d.m(i), plain form fills a result r.
604 #define OP_IMPL(T, symbol) \
605 template<> VC_WORKAROUND_IN Vector<T> VC_WORKAROUND &Vector<T>::operator symbol##=(Vector<T>::AsArg x) \
607 for_all_vector_entries(i, \
608 d.m(i) symbol##= x.d.m(i); \
612 template<> inline Vector<T> Vector<T>::operator symbol(Vector<T>::AsArg x) const \
615 for_all_vector_entries(i, \
616 r.d.m(i) = d.m(i) symbol x.d.m(i); \
// NOTE(review): the signed int/short instantiations expected alongside the
// unsigned ones are not visible in this listing -- presumably elided.
622 OP_IMPL(unsigned int, <<)
623 OP_IMPL(unsigned int, >>)
626 OP_IMPL(unsigned short, <<)
627 OP_IMPL(unsigned short, >>)
630 #undef VC_WORKAROUND_IN
// Uniform shifts: every entry shifted by the same scalar count, forwarded
// to VectorHelper<T>::shiftRight/shiftLeft.
633 template<typename T> inline Vector<T> &Vector<T>::operator>>=(int shift) {
634 d.v() = VectorHelper<T>::shiftRight(d.v(), shift);
637 template<typename T> inline Vector<T> Vector<T>::operator>>(int shift) const {
638 return VectorHelper<T>::shiftRight(d.v(), shift);
640 template<typename T> inline Vector<T> &Vector<T>::operator<<=(int shift) {
641 d.v() = VectorHelper<T>::shiftLeft(d.v(), shift);
644 template<typename T> inline Vector<T> Vector<T>::operator<<(int shift) const {
645 return VectorHelper<T>::shiftLeft(d.v(), shift);
648 ///////////////////////////////////////////////////////////////////////////////////////////
// Swizzles: the four-letter name spells the source entry placed in
// positions a,b,c,d (e.g. cdab = entries 2,3,0,1).  abcd is the identity.
// Generic versions permute a single 4-entry register via Mem::permute.
650 template<typename T> inline const Vector<T> INTRINSIC CONST &Vector<T>::abcd() const { return *this; }
651 template<typename T> inline const Vector<T> INTRINSIC CONST Vector<T>::cdab() const { return Mem::permute<X2, X3, X0, X1>(data()); }
652 template<typename T> inline const Vector<T> INTRINSIC CONST Vector<T>::badc() const { return Mem::permute<X1, X0, X3, X2>(data()); }
653 template<typename T> inline const Vector<T> INTRINSIC CONST Vector<T>::aaaa() const { return Mem::permute<X0, X0, X0, X0>(data()); }
654 template<typename T> inline const Vector<T> INTRINSIC CONST Vector<T>::bbbb() const { return Mem::permute<X1, X1, X1, X1>(data()); }
655 template<typename T> inline const Vector<T> INTRINSIC CONST Vector<T>::cccc() const { return Mem::permute<X2, X2, X2, X2>(data()); }
656 template<typename T> inline const Vector<T> INTRINSIC CONST Vector<T>::dddd() const { return Mem::permute<X3, X3, X3, X3>(data()); }
657 template<typename T> inline const Vector<T> INTRINSIC CONST Vector<T>::bcad() const { return Mem::permute<X1, X2, X0, X3>(data()); }
658 template<typename T> inline const Vector<T> INTRINSIC CONST Vector<T>::bcda() const { return Mem::permute<X1, X2, X3, X0>(data()); }
659 template<typename T> inline const Vector<T> INTRINSIC CONST Vector<T>::dabc() const { return Mem::permute<X3, X0, X1, X2>(data()); }
660 template<typename T> inline const Vector<T> INTRINSIC CONST Vector<T>::acbd() const { return Mem::permute<X0, X2, X1, X3>(data()); }
661 template<typename T> inline const Vector<T> INTRINSIC CONST Vector<T>::dbca() const { return Mem::permute<X3, X1, X2, X0>(data()); }
662 template<typename T> inline const Vector<T> INTRINSIC CONST Vector<T>::dcba() const { return Mem::permute<X3, X2, X1, X0>(data()); }
// sfloat (8 entries in two __m128 halves): apply the same 4-entry permute
// to each half independently.
664 template<> inline const sfloat_v INTRINSIC CONST Vector<sfloat>::cdab() const { return M256::create(Mem::permute<X2, X3, X0, X1>(d.v()[0]), Mem::permute<X2, X3, X0, X1>(d.v()[1])); }
665 template<> inline const sfloat_v INTRINSIC CONST Vector<sfloat>::badc() const { return M256::create(Mem::permute<X1, X0, X3, X2>(d.v()[0]), Mem::permute<X1, X0, X3, X2>(d.v()[1])); }
666 template<> inline const sfloat_v INTRINSIC CONST Vector<sfloat>::aaaa() const { return M256::create(Mem::permute<X0, X0, X0, X0>(d.v()[0]), Mem::permute<X0, X0, X0, X0>(d.v()[1])); }
667 template<> inline const sfloat_v INTRINSIC CONST Vector<sfloat>::bbbb() const { return M256::create(Mem::permute<X1, X1, X1, X1>(d.v()[0]), Mem::permute<X1, X1, X1, X1>(d.v()[1])); }
668 template<> inline const sfloat_v INTRINSIC CONST Vector<sfloat>::cccc() const { return M256::create(Mem::permute<X2, X2, X2, X2>(d.v()[0]), Mem::permute<X2, X2, X2, X2>(d.v()[1])); }
669 template<> inline const sfloat_v INTRINSIC CONST Vector<sfloat>::dddd() const { return M256::create(Mem::permute<X3, X3, X3, X3>(d.v()[0]), Mem::permute<X3, X3, X3, X3>(d.v()[1])); }
670 template<> inline const sfloat_v INTRINSIC CONST Vector<sfloat>::bcad() const { return M256::create(Mem::permute<X1, X2, X0, X3>(d.v()[0]), Mem::permute<X1, X2, X0, X3>(d.v()[1])); }
671 template<> inline const sfloat_v INTRINSIC CONST Vector<sfloat>::bcda() const { return M256::create(Mem::permute<X1, X2, X3, X0>(d.v()[0]), Mem::permute<X1, X2, X3, X0>(d.v()[1])); }
672 template<> inline const sfloat_v INTRINSIC CONST Vector<sfloat>::dabc() const { return M256::create(Mem::permute<X3, X0, X1, X2>(d.v()[0]), Mem::permute<X3, X0, X1, X2>(d.v()[1])); }
673 template<> inline const sfloat_v INTRINSIC CONST Vector<sfloat>::acbd() const { return M256::create(Mem::permute<X0, X2, X1, X3>(d.v()[0]), Mem::permute<X0, X2, X1, X3>(d.v()[1])); }
674 template<> inline const sfloat_v INTRINSIC CONST Vector<sfloat>::dbca() const { return M256::create(Mem::permute<X3, X1, X2, X0>(d.v()[0]), Mem::permute<X3, X1, X2, X0>(d.v()[1])); }
675 template<> inline const sfloat_v INTRINSIC CONST Vector<sfloat>::dcba() const { return M256::create(Mem::permute<X3, X2, X1, X0>(d.v()[0]), Mem::permute<X3, X2, X1, X0>(d.v()[1])); }
// 16-bit types (8 entries in one register): the same pattern is applied
// to the low quad (X0-X3) and the high quad (X4-X7) of the register.
677 #define VC_SWIZZLES_16BIT_IMPL(T) \
678 template<> inline const Vector<T> INTRINSIC CONST Vector<T>::cdab() const { return Mem::permute<X2, X3, X0, X1, X6, X7, X4, X5>(data()); } \
679 template<> inline const Vector<T> INTRINSIC CONST Vector<T>::badc() const { return Mem::permute<X1, X0, X3, X2, X5, X4, X7, X6>(data()); } \
680 template<> inline const Vector<T> INTRINSIC CONST Vector<T>::aaaa() const { return Mem::permute<X0, X0, X0, X0, X4, X4, X4, X4>(data()); } \
681 template<> inline const Vector<T> INTRINSIC CONST Vector<T>::bbbb() const { return Mem::permute<X1, X1, X1, X1, X5, X5, X5, X5>(data()); } \
682 template<> inline const Vector<T> INTRINSIC CONST Vector<T>::cccc() const { return Mem::permute<X2, X2, X2, X2, X6, X6, X6, X6>(data()); } \
683 template<> inline const Vector<T> INTRINSIC CONST Vector<T>::dddd() const { return Mem::permute<X3, X3, X3, X3, X7, X7, X7, X7>(data()); } \
684 template<> inline const Vector<T> INTRINSIC CONST Vector<T>::bcad() const { return Mem::permute<X1, X2, X0, X3, X5, X6, X4, X7>(data()); } \
685 template<> inline const Vector<T> INTRINSIC CONST Vector<T>::bcda() const { return Mem::permute<X1, X2, X3, X0, X5, X6, X7, X4>(data()); } \
686 template<> inline const Vector<T> INTRINSIC CONST Vector<T>::dabc() const { return Mem::permute<X3, X0, X1, X2, X7, X4, X5, X6>(data()); } \
687 template<> inline const Vector<T> INTRINSIC CONST Vector<T>::acbd() const { return Mem::permute<X0, X2, X1, X3, X4, X6, X5, X7>(data()); } \
688 template<> inline const Vector<T> INTRINSIC CONST Vector<T>::dbca() const { return Mem::permute<X3, X1, X2, X0, X7, X5, X6, X4>(data()); } \
689 template<> inline const Vector<T> INTRINSIC CONST Vector<T>::dcba() const { return Mem::permute<X3, X2, X1, X0, X7, X6, X5, X4>(data()); }
690 VC_SWIZZLES_16BIT_IMPL(short)
691 VC_SWIZZLES_16BIT_IMPL(unsigned short)
692 #undef VC_SWIZZLES_16BIT_IMPL
695 #include "../common/operators.h"
// Gather constructors: every overload simply forwards to the matching
// gather() member.  Variants cover plain arrays, struct members
// (array[i].*member1), nested struct members (member1 then member2), and
// pointer members with outer/inner index pairs; each also has a masked
// form that gathers only the entries selected by mask.
698 template<typename T> template<typename IndexT> inline ALWAYS_INLINE Vector<T>::Vector(const EntryType *mem, const IndexT *indexes)
700 gather(mem, indexes);
702 template<typename T> template<typename IndexT> inline ALWAYS_INLINE Vector<T>::Vector(const EntryType *mem, const Vector<IndexT> indexes)
704 gather(mem, indexes);
707 template<typename T> template<typename IndexT> inline ALWAYS_INLINE Vector<T>::Vector(const EntryType *mem, const IndexT *indexes, MaskArg mask)
710 gather(mem, indexes, mask);
713 template<typename T> template<typename IndexT> inline ALWAYS_INLINE Vector<T>::Vector(const EntryType *mem, const Vector<IndexT> indexes, MaskArg mask)
716 gather(mem, indexes, mask);
719 template<typename T> template<typename S1, typename IT> inline ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const EntryType S1::* member1, const IT indexes)
721 gather(array, member1, indexes);
723 template<typename T> template<typename S1, typename IT> inline ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const EntryType S1::* member1, const IT indexes, MaskArg mask)
726 gather(array, member1, indexes, mask);
728 template<typename T> template<typename S1, typename S2, typename IT> inline ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, const IT indexes)
730 gather(array, member1, member2, indexes);
732 template<typename T> template<typename S1, typename S2, typename IT> inline ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, const IT indexes, MaskArg mask)
735 gather(array, member1, member2, indexes, mask);
737 template<typename T> template<typename S1, typename IT1, typename IT2> inline ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const EntryType *const S1::* ptrMember1, const IT1 outerIndexes, const IT2 innerIndexes)
739 gather(array, ptrMember1, outerIndexes, innerIndexes);
741 template<typename T> template<typename S1, typename IT1, typename IT2> inline ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const EntryType *const S1::* ptrMember1, const IT1 outerIndexes, const IT2 innerIndexes, MaskArg mask)
744 gather(array, ptrMember1, outerIndexes, innerIndexes, mask);
// Compile-time guard: when the index argument is itself a Vc vector, it
// must provide at least Size entries; any other index type passes freely
// (the primary template's check() is a no-op).
747 template<typename T, size_t Size> struct IndexSizeChecker { static void check() {} };
748 template<typename T, size_t Size> struct IndexSizeChecker<Vector<T>, Size>
750 static void check() {
751 VC_STATIC_ASSERT(Vector<T>::Size >= Size, IndexVector_must_have_greater_or_equal_number_of_entries);
// Unmasked gathers: each specialization reads Size entries through the
// index object and rebuilds the register with a single _mm_setr_* call.
// Each first validates the index type via IndexSizeChecker.
754 template<> template<typename Index> inline void ALWAYS_INLINE FLATTEN Vector<double>::gather(const EntryType *mem, const Index indexes)
756 IndexSizeChecker<Index, Size>::check();
757 d.v() = _mm_setr_pd(mem[indexes[0]], mem[indexes[1]]);
759 template<> template<typename Index> inline void ALWAYS_INLINE FLATTEN Vector<float>::gather(const EntryType *mem, const Index indexes)
761 IndexSizeChecker<Index, Size>::check();
762 d.v() = _mm_setr_ps(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]);
// float8 fills its two __m128 halves from indexes 0-3 and 4-7.
764 template<> template<typename Index> inline void ALWAYS_INLINE FLATTEN Vector<float8>::gather(const EntryType *mem, const Index indexes)
766 IndexSizeChecker<Index, Size>::check();
767 d.v()[0] = _mm_setr_ps(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]);
768 d.v()[1] = _mm_setr_ps(mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
770 template<> template<typename Index> inline void ALWAYS_INLINE FLATTEN Vector<int>::gather(const EntryType *mem, const Index indexes)
772 IndexSizeChecker<Index, Size>::check();
773 d.v() = _mm_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]);
775 template<> template<typename Index> inline void ALWAYS_INLINE FLATTEN Vector<unsigned int>::gather(const EntryType *mem, const Index indexes)
777 IndexSizeChecker<Index, Size>::check();
778 d.v() = _mm_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]);
780 template<> template<typename Index> inline void ALWAYS_INLINE FLATTEN Vector<short>::gather(const EntryType *mem, const Index indexes)
782 IndexSizeChecker<Index, Size>::check();
783 d.v() = _mm_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]],
784 mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
786 template<> template<typename Index> inline void ALWAYS_INLINE FLATTEN Vector<unsigned short>::gather(const EntryType *mem, const Index indexes)
788 IndexSizeChecker<Index, Size>::check();
789 d.v() = _mm_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]],
790 mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
793 #ifdef VC_USE_SET_GATHERS
// Masked gather via the unmasked path: indexes outside the mask are first
// zeroed (so the unmasked gather reads a valid address, mem[0]), then the
// gathered vector is written back only through the write-mask.
794 template<typename T> template<typename IT> inline void ALWAYS_INLINE Vector<T>::gather(const EntryType *mem, Vector<IT> indexes, MaskArg mask)
796 IndexSizeChecker<Vector<IT>, Size>::check();
797 indexes.setZero(!static_cast<typename Vector<IT>::Mask>(mask));
798 (*this)(mask) = Vector<T>(mem, indexes);
// VC_MASKED_GATHER: shared loop body for all masked gather overloads below.
// Each overload #defines ith_value(i) as its access expression, expands this
// macro, then #undefs ith_value.  Three strategies, chosen at configure time:
//  - VC_USE_BSF_GATHERS: loop over set mask bits via bit-scan-forward,
//    clearing each processed bit.
//  - VC_USE_POPCNT_BSF_GATHERS: switch on popcount(mask) and peel set bits
//    pairwise from both ends (bsr/bsf) — fall-through case labels are part
//    of the full macro definition.
//  - fallback (below): plain per-lane loop guarded by mask[i], with an early
//    return when the mask is empty.
// NOTE(review): the macro bodies use backslash continuations; do not insert
// lines inside them.
803 #define VC_MASKED_GATHER \
804 int bits = mask.toInt(); \
806 const int i = _bit_scan_forward(bits); \
807 bits &= ~(1 << i); /* btr? */ \
808 d.m(i) = ith_value(i); \
810 #elif defined(VC_USE_POPCNT_BSF_GATHERS)
811 #define VC_MASKED_GATHER \
812 unsigned int bits = mask.toInt(); \
813 unsigned int low, high = 0; \
814 switch (mask.count()) { \
816 high = _bit_scan_reverse(bits); \
817 d.m(high) = ith_value(high); \
818 high = (1 << high); \
820 low = _bit_scan_forward(bits); \
821 bits ^= high | (1 << low); \
822 d.m(low) = ith_value(low); \
824 high = _bit_scan_reverse(bits); \
825 d.m(high) = ith_value(high); \
826 high = (1 << high); \
828 low = _bit_scan_forward(bits); \
829 bits ^= high | (1 << low); \
830 d.m(low) = ith_value(low); \
832 high = _bit_scan_reverse(bits); \
833 d.m(high) = ith_value(high); \
834 high = (1 << high); \
836 low = _bit_scan_forward(bits); \
837 bits ^= high | (1 << low); \
838 d.m(low) = ith_value(low); \
840 high = _bit_scan_reverse(bits); \
841 d.m(high) = ith_value(high); \
843 low = _bit_scan_forward(bits); \
844 d.m(low) = ith_value(low); \
849 #define VC_MASKED_GATHER \
850 if (mask.isEmpty()) { \
853 for_all_vector_entries(i, \
854 if (mask[i]) d.m(i) = ith_value(i); \
858 template<typename T> template<typename Index>
859 inline void INTRINSIC Vector<T>::gather(const EntryType *mem, Index indexes, MaskArg mask)
861 IndexSizeChecker<Index, Size>::check();
// Adapt the generic VC_MASKED_GATHER loop (selected above) to this
// overload's flat-array access pattern; the macro is expanded after this
// #define and ith_value is #undef'ed again afterwards.
862 #define ith_value(_i_) (mem[indexes[_i_]])
// Struct-member gathers: lane i is loaded from array[indexes[i]].*member1,
// i.e. one data member out of an array of structs, packed with _mm_setr_*.
867 template<> template<typename S1, typename IT>
868 inline void ALWAYS_INLINE FLATTEN Vector<double>::gather(const S1 *array, const EntryType S1::* member1, const IT indexes)
870 IndexSizeChecker<IT, Size>::check();
871 d.v() = _mm_setr_pd(array[indexes[0]].*(member1), array[indexes[1]].*(member1));
873 template<> template<typename S1, typename IT>
874 inline void ALWAYS_INLINE FLATTEN Vector<float>::gather(const S1 *array, const EntryType S1::* member1, const IT indexes)
876 IndexSizeChecker<IT, Size>::check();
877 d.v() = _mm_setr_ps(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
878 array[indexes[3]].*(member1));
// float8: two __m128 halves, lanes 0-3 and 4-7.
880 template<> template<typename S1, typename IT>
881 inline void ALWAYS_INLINE FLATTEN Vector<float8>::gather(const S1 *array, const EntryType S1::* member1, const IT indexes)
883 IndexSizeChecker<IT, Size>::check();
884 d.v()[0] = _mm_setr_ps(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
885 array[indexes[3]].*(member1));
886 d.v()[1] = _mm_setr_ps(array[indexes[4]].*(member1), array[indexes[5]].*(member1), array[indexes[6]].*(member1),
887 array[indexes[7]].*(member1));
889 template<> template<typename S1, typename IT>
890 inline void ALWAYS_INLINE FLATTEN Vector<int>::gather(const S1 *array, const EntryType S1::* member1, const IT indexes)
892 IndexSizeChecker<IT, Size>::check();
893 d.v() = _mm_setr_epi32(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
894 array[indexes[3]].*(member1));
896 template<> template<typename S1, typename IT>
897 inline void ALWAYS_INLINE FLATTEN Vector<unsigned int>::gather(const S1 *array, const EntryType S1::* member1, const IT indexes)
899 IndexSizeChecker<IT, Size>::check();
900 d.v() = _mm_setr_epi32(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
901 array[indexes[3]].*(member1));
903 template<> template<typename S1, typename IT>
904 inline void ALWAYS_INLINE FLATTEN Vector<short>::gather(const S1 *array, const EntryType S1::* member1, const IT indexes)
906 IndexSizeChecker<IT, Size>::check();
907 d.v() = _mm_setr_epi16(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
908 array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1),
909 array[indexes[6]].*(member1), array[indexes[7]].*(member1));
911 template<> template<typename S1, typename IT>
912 inline void ALWAYS_INLINE FLATTEN Vector<unsigned short>::gather(const S1 *array, const EntryType S1::* member1, const IT indexes)
914 IndexSizeChecker<IT, Size>::check();
915 d.v() = _mm_setr_epi16(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
916 array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1),
917 array[indexes[6]].*(member1), array[indexes[7]].*(member1));
// Masked struct-member gather: plugs the member access into the shared
// VC_MASKED_GATHER loop via the ith_value adapter macro.
919 template<typename T> template<typename S1, typename IT>
920 inline void ALWAYS_INLINE FLATTEN Vector<T>::gather(const S1 *array, const EntryType S1::* member1, const IT indexes, MaskArg mask)
922 IndexSizeChecker<IT, Size>::check();
923 #define ith_value(_i_) (array[indexes[_i_]].*(member1))
// Two-level struct-member gathers: lane i is loaded from
// array[indexes[i]].*member1.*member2 (a member of a nested struct member).
927 template<> template<typename S1, typename S2, typename IT>
928 inline void ALWAYS_INLINE FLATTEN Vector<double>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, const IT indexes)
930 IndexSizeChecker<IT, Size>::check();
931 d.v() = _mm_setr_pd(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2));
933 template<> template<typename S1, typename S2, typename IT>
934 inline void ALWAYS_INLINE FLATTEN Vector<float>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, const IT indexes)
936 IndexSizeChecker<IT, Size>::check();
937 d.v() = _mm_setr_ps(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
938 array[indexes[3]].*(member1).*(member2));
// float8: two __m128 halves, lanes 0-3 and 4-7.
940 template<> template<typename S1, typename S2, typename IT>
941 inline void ALWAYS_INLINE FLATTEN Vector<float8>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, const IT indexes)
943 IndexSizeChecker<IT, Size>::check();
944 d.v()[0] = _mm_setr_ps(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2),
945 array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2));
946 d.v()[1] = _mm_setr_ps(array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
947 array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
949 template<> template<typename S1, typename S2, typename IT>
950 inline void ALWAYS_INLINE FLATTEN Vector<int>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, const IT indexes)
952 IndexSizeChecker<IT, Size>::check();
953 d.v() = _mm_setr_epi32(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2),
954 array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2));
956 template<> template<typename S1, typename S2, typename IT>
957 inline void ALWAYS_INLINE FLATTEN Vector<unsigned int>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, const IT indexes)
959 IndexSizeChecker<IT, Size>::check();
960 d.v() = _mm_setr_epi32(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2),
961 array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2));
963 template<> template<typename S1, typename S2, typename IT>
964 inline void ALWAYS_INLINE FLATTEN Vector<short>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, const IT indexes)
966 IndexSizeChecker<IT, Size>::check();
967 d.v() = _mm_setr_epi16(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
968 array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
969 array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
971 template<> template<typename S1, typename S2, typename IT>
972 inline void ALWAYS_INLINE FLATTEN Vector<unsigned short>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, const IT indexes)
974 IndexSizeChecker<IT, Size>::check();
975 d.v() = _mm_setr_epi16(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
976 array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
977 array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
// Masked two-level member gather: nested member access plugged into the
// shared VC_MASKED_GATHER loop via ith_value.
979 template<typename T> template<typename S1, typename S2, typename IT>
980 inline void ALWAYS_INLINE FLATTEN Vector<T>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, const IT indexes, MaskArg mask)
982 IndexSizeChecker<IT, Size>::check();
983 #define ith_value(_i_) (array[indexes[_i_]].*(member1).*(member2))
// Indirect (pointer-member) gathers: outerIndexes selects the struct in
// array, ptrMember1 is a pointer data member, innerIndexes selects the
// element in the pointed-to buffer:
//   lane i = (array[outerIndexes[i]].*ptrMember1)[innerIndexes[i]]
// Both index vectors must provide at least Size entries.
987 template<> template<typename S1, typename IT1, typename IT2>
988 inline void ALWAYS_INLINE FLATTEN Vector<double>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, const IT1 outerIndexes, const IT2 innerIndexes)
990 IndexSizeChecker<IT1, Size>::check();
991 IndexSizeChecker<IT2, Size>::check();
992 d.v() = _mm_setr_pd((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]],
993 (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]]);
995 template<> template<typename S1, typename IT1, typename IT2>
996 inline void ALWAYS_INLINE FLATTEN Vector<float>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, const IT1 outerIndexes, const IT2 innerIndexes)
998 IndexSizeChecker<IT1, Size>::check();
999 IndexSizeChecker<IT2, Size>::check();
1000 d.v() = _mm_setr_ps((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
1001 (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]);
// float8: two __m128 halves, lanes 0-3 and 4-7.
1003 template<> template<typename S1, typename IT1, typename IT2>
1004 inline void ALWAYS_INLINE FLATTEN Vector<float8>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, const IT1 outerIndexes, const IT2 innerIndexes)
1006 IndexSizeChecker<IT1, Size>::check();
1007 IndexSizeChecker<IT2, Size>::check();
1008 d.v()[0] = _mm_setr_ps((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
1009 (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]);
1010 d.v()[1] = _mm_setr_ps((array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
1011 (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
1013 template<> template<typename S1, typename IT1, typename IT2>
1014 inline void ALWAYS_INLINE FLATTEN Vector<int>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, const IT1 outerIndexes, const IT2 innerIndexes)
1016 IndexSizeChecker<IT1, Size>::check();
1017 IndexSizeChecker<IT2, Size>::check();
1018 d.v() = _mm_setr_epi32((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
1019 (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]);
1021 template<> template<typename S1, typename IT1, typename IT2>
1022 inline void ALWAYS_INLINE FLATTEN Vector<unsigned int>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, const IT1 outerIndexes, const IT2 innerIndexes)
1024 IndexSizeChecker<IT1, Size>::check();
1025 IndexSizeChecker<IT2, Size>::check();
1026 d.v() = _mm_setr_epi32((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
1027 (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]);
1029 template<> template<typename S1, typename IT1, typename IT2>
1030 inline void ALWAYS_INLINE FLATTEN Vector<short>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, const IT1 outerIndexes, const IT2 innerIndexes)
1032 IndexSizeChecker<IT1, Size>::check();
1033 IndexSizeChecker<IT2, Size>::check();
1034 d.v() = _mm_setr_epi16((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
1035 (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]],
1036 (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
1037 (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
1039 template<> template<typename S1, typename IT1, typename IT2>
1040 inline void ALWAYS_INLINE FLATTEN Vector<unsigned short>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, const IT1 outerIndexes, const IT2 innerIndexes)
1042 IndexSizeChecker<IT1, Size>::check();
1043 IndexSizeChecker<IT2, Size>::check();
1044 d.v() = _mm_setr_epi16((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
1045 (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]],
1046 (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
1047 (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
// Masked indirect gather: same outer/inner addressing as above, routed
// through the shared VC_MASKED_GATHER loop via ith_value.
1049 template<typename T> template<typename S1, typename IT1, typename IT2>
1050 inline void ALWAYS_INLINE FLATTEN Vector<T>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, const IT1 outerIndexes, const IT2 innerIndexes, MaskArg mask)
1052 IndexSizeChecker<IT1, Size>::check();
1053 IndexSizeChecker<IT2, Size>::check();
1054 #define ith_value(_i_) (array[outerIndexes[_i_]].*(ptrMember1))[innerIndexes[_i_]]
// VC_MASKED_SCATTER: the scatter-direction twin of VC_MASKED_GATHER (which
// is #undef'ed here).  Same three strategies, but each processed lane does
// ith_value(i) = d.m(i) instead of a load.  Note the BSF variant clears the
// processed bit with ^= (equivalent to &= ~ since the bit is known set).
// NOTE(review): macro bodies use backslash continuations; do not insert
// lines inside them.
1059 #undef VC_MASKED_GATHER
1060 #ifdef VC_USE_BSF_SCATTERS
1061 #define VC_MASKED_SCATTER \
1062 int bits = mask.toInt(); \
1064 const int i = _bit_scan_forward(bits); \
1065 bits ^= (1 << i); /* btr? */ \
1066 ith_value(i) = d.m(i); \
1068 #elif defined(VC_USE_POPCNT_BSF_SCATTERS)
1069 #define VC_MASKED_SCATTER \
1070 unsigned int bits = mask.toInt(); \
1071 unsigned int low, high = 0; \
1072 switch (mask.count()) { \
1074 high = _bit_scan_reverse(bits); \
1075 ith_value(high) = d.m(high); \
1076 high = (1 << high); \
1078 low = _bit_scan_forward(bits); \
1079 bits ^= high | (1 << low); \
1080 ith_value(low) = d.m(low); \
1082 high = _bit_scan_reverse(bits); \
1083 ith_value(high) = d.m(high); \
1084 high = (1 << high); \
1086 low = _bit_scan_forward(bits); \
1087 bits ^= high | (1 << low); \
1088 ith_value(low) = d.m(low); \
1090 high = _bit_scan_reverse(bits); \
1091 ith_value(high) = d.m(high); \
1092 high = (1 << high); \
1094 low = _bit_scan_forward(bits); \
1095 bits ^= high | (1 << low); \
1096 ith_value(low) = d.m(low); \
1098 high = _bit_scan_reverse(bits); \
1099 ith_value(high) = d.m(high); \
1101 low = _bit_scan_forward(bits); \
1102 ith_value(low) = d.m(low); \
1107 #define VC_MASKED_SCATTER \
1108 if (mask.isEmpty()) { \
1111 for_all_vector_entries(i, \
1112 if (mask[i]) ith_value(i) = d.m(i); \
1116 template<typename T> template<typename Index> inline void ALWAYS_INLINE FLATTEN Vector<T>::scatter(EntryType *mem, const Index indexes) const
// Unmasked scatter: store lane i to mem[indexes[i]], one scalar at a time
// (SSE has no scatter instruction).
1118 for_all_vector_entries(i,
1119 mem[indexes[i]] = d.m(i);
// Masked flat-array scatter: routed through VC_MASKED_SCATTER via the
// ith_value adapter macro.
1122 template<typename T> template<typename Index> inline void ALWAYS_INLINE FLATTEN Vector<T>::scatter(EntryType *mem, const Index indexes, MaskArg mask) const
1124 #define ith_value(_i_) mem[indexes[_i_]]
// Unmasked struct-member scatter: store lane i to array[indexes[i]].*member1.
1128 template<typename T> template<typename S1, typename IT> inline void ALWAYS_INLINE FLATTEN Vector<T>::scatter(S1 *array, EntryType S1::* member1, const IT indexes) const
1130 for_all_vector_entries(i,
1131 array[indexes[i]].*(member1) = d.m(i);
// Masked struct-member scatter (via VC_MASKED_SCATTER).
1134 template<typename T> template<typename S1, typename IT> inline void ALWAYS_INLINE FLATTEN Vector<T>::scatter(S1 *array, EntryType S1::* member1, const IT indexes, MaskArg mask) const
1136 #define ith_value(_i_) array[indexes[_i_]].*(member1)
// Unmasked two-level member scatter: store lane i to
// array[indexes[i]].*member1.*member2.
1140 template<typename T> template<typename S1, typename S2, typename IT> inline void ALWAYS_INLINE FLATTEN Vector<T>::scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, const IT indexes) const
1142 for_all_vector_entries(i,
1143 array[indexes[i]].*(member1).*(member2) = d.m(i);
// Masked two-level member scatter (via VC_MASKED_SCATTER).
1146 template<typename T> template<typename S1, typename S2, typename IT> inline void ALWAYS_INLINE FLATTEN Vector<T>::scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, const IT indexes, MaskArg mask) const
1148 #define ith_value(_i_) array[indexes[_i_]].*(member1).*(member2)
// Unmasked indirect (pointer-member) scatter: outerIndexes selects the
// struct in array, innerIndexes the element in the buffer pointed to by
// ptrMember1 — mirroring the corresponding gather and the masked scatter
// overload below.
// FIX: the original wrote (array[innerIndexes[i]].*(ptrMember1))[outerIndexes[i]],
// i.e. the two index vectors were swapped relative to both the gather and
// the masked scatter, scattering to the wrong addresses.
1152 template<typename T> template<typename S1, typename IT1, typename IT2> inline void ALWAYS_INLINE FLATTEN Vector<T>::scatter(S1 *array, EntryType *S1::* ptrMember1, const IT1 outerIndexes, const IT2 innerIndexes) const
1154 for_all_vector_entries(i,
1155 (array[outerIndexes[i]].*(ptrMember1))[innerIndexes[i]] = d.m(i);
// Masked indirect scatter: outer index picks the struct, inner index the
// element of the pointed-to buffer (via VC_MASKED_SCATTER).
1158 template<typename T> template<typename S1, typename IT1, typename IT2> inline void ALWAYS_INLINE FLATTEN Vector<T>::scatter(S1 *array, EntryType *S1::* ptrMember1, const IT1 outerIndexes, const IT2 innerIndexes, MaskArg mask) const
1160 #define ith_value(_i_) (array[outerIndexes[_i_]].*(ptrMember1))[innerIndexes[_i_]]
1165 ///////////////////////////////////////////////////////////////////////////////////////////
// Element access.  The specializations below use __builtin_constant_p to
// pick a cheap immediate-extract sequence when the index is a compile-time
// constant; the runtime-index fallback paths are outside this excerpt.
1167 template<typename T> inline typename Vector<T>::EntryType PURE INTRINSIC Vector<T>::operator[](size_t index) const
1172 template<> inline double PURE INTRINSIC Vector<double>::operator[](size_t index) const
1174 if (__builtin_constant_p(index)) {
1175 return extract_double_imm(d.v(), index);
1179 template<> inline float PURE INTRINSIC Vector<float>::operator[](size_t index) const
1181 return extract_float(d.v(), index);
// float8: index 0-3 comes from the low __m128 half, 4-7 from the high one.
1183 template<> inline float PURE INTRINSIC Vector<float8>::operator[](size_t index) const
1185 if (__builtin_constant_p(index)) {
1187 return extract_float_imm(d.v()[0], index);
1189 return extract_float_imm(d.v()[1], index - 4);
// int/uint: for constant index 0/1 a single movq/movd suffices; otherwise
// SSE4.1 _mm_extract_epi32, or a byte-shift + movd on plain SSE2.
1193 template<> inline int PURE INTRINSIC Vector<int>::operator[](size_t index) const
1195 if (__builtin_constant_p(index)) {
1196 #if VC_GCC >= 0x40601 || !defined(VC_USE_VEX_CODING) // GCC < 4.6.1 incorrectly uses vmovq instead of movq for the following
1198 if (index == 0) return _mm_cvtsi128_si64(d.v()) & 0xFFFFFFFFull;
1199 if (index == 1) return _mm_cvtsi128_si64(d.v()) >> 32;
1201 if (index == 0) return _mm_cvtsi128_si32(d.v());
1204 #ifdef VC_IMPL_SSE4_1
1205 return _mm_extract_epi32(d.v(), index);
1207 return _mm_cvtsi128_si32(_mm_srli_si128(d.v(), index * 4));
1212 template<> inline unsigned int PURE INTRINSIC Vector<unsigned int>::operator[](size_t index) const
1214 if (__builtin_constant_p(index)) {
1215 #if VC_GCC >= 0x40601 || !defined(VC_USE_VEX_CODING) // GCC < 4.6.1 incorrectly uses vmovq instead of movq for the following
1217 if (index == 0) return _mm_cvtsi128_si64(d.v()) & 0xFFFFFFFFull;
1218 if (index == 1) return _mm_cvtsi128_si64(d.v()) >> 32;
1220 if (index == 0) return _mm_cvtsi128_si32(d.v());
1223 #ifdef VC_IMPL_SSE4_1
1224 return _mm_extract_epi32(d.v(), index);
1226 return _mm_cvtsi128_si32(_mm_srli_si128(d.v(), index * 4));
// short/ushort: _mm_extract_epi16 for constant indexes.  NOTE(review):
// _mm_extract_epi16 zero-extends the 16-bit lane; the conversion back to
// (signed) short in the return presumably restores the value — confirm.
1231 template<> inline short PURE INTRINSIC Vector<short>::operator[](size_t index) const
1233 if (__builtin_constant_p(index)) {
1234 return _mm_extract_epi16(d.v(), index);
1238 template<> inline unsigned short PURE INTRINSIC Vector<unsigned short>::operator[](size_t index) const
1240 if (__builtin_constant_p(index)) {
1241 return _mm_extract_epi16(d.v(), index);
1246 ///////////////////////////////////////////////////////////////////////////////////////////
1247 // horizontal ops {{{1
1248 #ifndef VC_IMPL_SSE4_1
1249 // without SSE4.1 integer multiplication is slow and we rather multiply the scalars
// Horizontal product of the four 32-bit lanes, computed scalar-wise in a
// balanced tree: (m0*m1)*(m2*m3).
1250 template<> inline int INTRINSIC Vector<int>::product() const
1252 return (d.m(0) * d.m(1)) * (d.m(2) * d.m(3));
1254 template<> inline unsigned int INTRINSIC Vector<unsigned int>::product() const
1256 return (d.m(0) * d.m(1)) * (d.m(2) * d.m(3));
// Masked horizontal reductions: initialize a temporary with the operation's
// neutral element so that masked-off lanes do not affect the result, then
// (in code outside this excerpt) merge the active lanes and reduce.
// min: neutral element is the type's maximum.
1259 template<typename T> inline typename Vector<T>::EntryType Vector<T>::min(MaskArg m) const
1261 Vector<T> tmp = std::numeric_limits<Vector<T> >::max();
// max: neutral element is the type's minimum.
1265 template<typename T> inline typename Vector<T>::EntryType Vector<T>::max(MaskArg m) const
1267 Vector<T> tmp = std::numeric_limits<Vector<T> >::min();
// product: neutral element is 1.
1271 template<typename T> inline typename Vector<T>::EntryType Vector<T>::product(MaskArg m) const
1273 Vector<T> tmp(VectorSpecialInitializerOne::One);
1275 return tmp.product();
// sum: neutral element is 0.
1277 template<typename T> inline typename Vector<T>::EntryType Vector<T>::sum(MaskArg m) const
1279 Vector<T> tmp(VectorSpecialInitializerZero::Zero);
1284 ///////////////////////////////////////////////////////////////////////////////////////////
// copySign: combine the sign bits of `reference` with the magnitude bits of
// *this using bit masks (sign mask selects bit 31/63, abs mask the rest).
1286 template<> inline Vector<float> INTRINSIC Vector<float>::copySign(Vector<float>::AsArg reference) const
1289 _mm_and_ps(reference.d.v(), _mm_setsignmask_ps()),
1290 _mm_and_ps(d.v(), _mm_setabsmask_ps())
// float8: the same sign/abs combination applied to both __m128 halves.
1293 template<> inline Vector<float8> INTRINSIC Vector<float8>::copySign(Vector<float8>::AsArg reference) const
1295 return M256::create( _mm_or_ps(
1296 _mm_and_ps(reference.d.v()[0], _mm_setsignmask_ps()),
1297 _mm_and_ps(d.v()[0], _mm_setabsmask_ps())
1299 _mm_and_ps(reference.d.v()[1], _mm_setsignmask_ps()),
1300 _mm_and_ps(d.v()[1], _mm_setabsmask_ps())
1304 template<> inline Vector<double> INTRINSIC Vector<double>::copySign(Vector<double>::AsArg reference) const
1307 _mm_and_pd(reference.d.v(), _mm_setsignmask_pd()),
1308 _mm_and_pd(d.v(), _mm_setabsmask_pd())
// exponent(): extract the unbiased binary exponent of each lane by shifting
// out the mantissa bits (23 for float, 52 for double) and subtracting the
// exponent bias (0x7f / 0x3ff).  Only valid for strictly positive, finite,
// normalized inputs — the assertion enforces positivity.
1312 template<> inline Vector<float> INTRINSIC Vector<float>::exponent() const
1314 VC_ASSERT((*this > 0.f).isFull());
1315 __m128i tmp = _mm_srli_epi32(_mm_castps_si128(d.v()), 23);
1316 tmp = _mm_sub_epi32(tmp, _mm_set1_epi32(0x7f));
1317 return _mm_cvtepi32_ps(tmp);
1319 template<> inline Vector<float8> INTRINSIC Vector<float8>::exponent() const
1321 VC_ASSERT((*this > 0.f).isFull());
1322 __m128i tmp0 = _mm_srli_epi32(_mm_castps_si128(d.v()[0]), 23);
1323 __m128i tmp1 = _mm_srli_epi32(_mm_castps_si128(d.v()[1]), 23);
1324 tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x7f));
1325 tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x7f));
1326 return M256::create( _mm_cvtepi32_ps(tmp0), _mm_cvtepi32_ps(tmp1));
1328 template<> inline Vector<double> INTRINSIC Vector<double>::exponent() const
1330 VC_ASSERT((*this > 0.).isFull());
// 64-bit shift leaves the 11 exponent bits in the low dword of each lane;
// the epi32 subtract is safe because the high dwords are zero after srli.
1331 __m128i tmp = _mm_srli_epi64(_mm_castpd_si128(d.v()), 52);
1332 tmp = _mm_sub_epi32(tmp, _mm_set1_epi32(0x3ff));
// 0x08 shuffle packs the two low dwords together for the epi32->pd convert.
1333 return _mm_cvtepi32_pd(_mm_shuffle_epi32(tmp, 0x08));
// Advance the global RNG state: both 4-lane halves of Vc::RandomState are
// stepped with the LCG x' = x * 0xdeece66d + 11; the new state0 is
// additionally XOR-mixed with the high 16 bits of the old state1.
// NOTE(review): not thread-safe — RandomState is a shared global updated
// without synchronization.
1337 static inline ALWAYS_INLINE void _doRandomStep(Vector<unsigned int> &state0,
1338 Vector<unsigned int> &state1)
1340 state0.load(&Vc::RandomState[0]);
1341 state1.load(&Vc::RandomState[uint_v::Size]);
1342 (state1 * 0xdeece66du + 11).store(&Vc::RandomState[uint_v::Size]);
1343 uint_v(_mm_xor_si128((state0 * 0xdeece66du + 11).data(), _mm_srli_epi32(state1.data(), 16))).store(&Vc::RandomState[0]);
// Random(): integer variants reinterpret the stepped 32-bit RNG state
// directly as the requested vector type.
1346 template<typename T> inline ALWAYS_INLINE Vector<T> Vector<T>::Random()
1348 Vector<unsigned int> state0, state1;
1349 _doRandomStep(state0, state1);
1350 return state0.reinterpretCast<Vector<T> >();
// float variant: OR random mantissa bits (state >> 2) into the bit pattern
// of 1.0f, yielding a uniform value in [1, 2), then subtract 1 -> [0, 1).
1353 template<> inline ALWAYS_INLINE Vector<float> Vector<float>::Random()
1355 Vector<unsigned int> state0, state1;
1356 _doRandomStep(state0, state1);
1357 return _mm_sub_ps(_mm_or_ps(_mm_castsi128_ps(_mm_srli_epi32(state0.data(), 2)), HT::one()), HT::one());
// float8: same [1,2)-1 trick on both halves; state1 is remixed with
// state0's high bits so the two halves are decorrelated.
1360 template<> inline ALWAYS_INLINE Vector<float8> Vector<float8>::Random()
1362 Vector<unsigned int> state0, state1;
1363 _doRandomStep(state0, state1);
1364 state1 ^= state0 >> 16;
1365 return M256::create(
1366 _mm_sub_ps(_mm_or_ps(_mm_castsi128_ps(_mm_srli_epi32(state0.data(), 2)), VectorHelper<float>::one()), VectorHelper<float>::one()),
1367 _mm_sub_ps(_mm_or_ps(_mm_castsi128_ps(_mm_srli_epi32(state1.data(), 2)), VectorHelper<float>::one()), VectorHelper<float>::one())
// double variant: steps two 64-bit LCGs (multiplier 0x5deece66d, increment
// 11 — the same constants POSIX drand48 documents) over RandomState[8..11],
// then shifts the old state right by 12 so only mantissa bits remain and
// applies the (value | 1.0) - 1.0 trick for a uniform [0, 1) double.
1371 template<> inline ALWAYS_INLINE Vector<double> Vector<double>::Random()
1373 typedef unsigned long long uint64 MAY_ALIAS;
1374 uint64 state0 = *reinterpret_cast<const uint64 *>(&Vc::RandomState[8]);
1375 uint64 state1 = *reinterpret_cast<const uint64 *>(&Vc::RandomState[10]);
1376 const __m128i state = _mm_load_si128(reinterpret_cast<const __m128i *>(&Vc::RandomState[8]));
1377 *reinterpret_cast<uint64 *>(&Vc::RandomState[ 8]) = (state0 * 0x5deece66dull + 11);
1378 *reinterpret_cast<uint64 *>(&Vc::RandomState[10]) = (state1 * 0x5deece66dull + 11);
1379 return (Vector<double>(_mm_castsi128_pd(_mm_srli_epi64(state, 12))) | One()) - One();
1385 #include "undomacros.h"
1387 // vim: foldmethod=marker