1 /* This file is part of the Vc library.
3 Copyright (C) 2011-2012 Matthias Kretz <kretz@kde.org>
5 Vc is free software: you can redistribute it and/or modify
6 it under the terms of the GNU Lesser General Public License as
7 published by the Free Software Foundation, either version 3 of
8 the License, or (at your option) any later version.
10 Vc is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with Vc. If not, see <http://www.gnu.org/licenses/>.
// Declaration of an externally defined array of 16 unsigned ints, annotated with ALIGN(64).
// NOTE(review): presumably the shared state of the library's random number generator -- confirm at the definition.
27 ALIGN(64) extern unsigned int RandomState[16];
32 ///////////////////////////////////////////////////////////////////////////////////////////
// Tag constructors: the (unnamed) enum argument only selects which constant fills d.
// Zero tag: d is initialized from HT::zero().
34 template<typename T> Vc_ALWAYS_INLINE Vector<T>::Vector(VectorSpecialInitializerZero::ZEnum) : d(HT::zero()) {}
// One tag: d is initialized from HT::one().
35 template<typename T> Vc_ALWAYS_INLINE Vector<T>::Vector(VectorSpecialInitializerOne::OEnum) : d(HT::one()) {}
// IndexesFromZero tag: d is loaded (with the Aligned flag) from IndexesFromZeroData<T>::address().
36 template<typename T> Vc_ALWAYS_INLINE Vector<T>::Vector(VectorSpecialInitializerIndexesFromZero::IEnum)
37 : d(HV::load(IndexesFromZeroData<T>::address(), Aligned)) {}
// Static factories returning the same values the tag constructors produce:
// HT::zero(), HT::one(), and an aligned load of the IndexesFromZeroData<T> table.
39 template<typename T> Vc_INTRINSIC Vector<T> Vc_CONST Vector<T>::Zero() { return HT::zero(); }
40 template<typename T> Vc_INTRINSIC Vector<T> Vc_CONST Vector<T>::One() { return HT::one(); }
41 template<typename T> Vc_INTRINSIC Vector<T> Vc_CONST Vector<T>::IndexesFromZero() { return HV::load(IndexesFromZeroData<T>::address(), Aligned); }
// Converting constructor: builds a Vector<T> from a Vector<T2> by passing x.data()
// through StaticCastHelper<T2, T>::cast.
43 template<typename T> template<typename T2> Vc_ALWAYS_INLINE Vector<T>::Vector(VC_ALIGNED_PARAMETER(Vector<T2>) x)
44 : d(StaticCastHelper<T2, T>::cast(x.data())) {}
// Construction from a single scalar: d is initialized from HT::set(x).
46 template<typename T> Vc_ALWAYS_INLINE Vector<T>::Vector(EntryType x) : d(HT::set(x)) {}
// double specialization: uses _mm256_set1_pd directly, which copies x into all four lanes.
47 template<> Vc_ALWAYS_INLINE Vector<double>::Vector(EntryType x) : d(_mm256_set1_pd(x)) {}
50 ///////////////////////////////////////////////////////////////////////////////////////////
// Load constructors: each one simply forwards its arguments to the matching load() overload.
// x is a pointer to EntryType or to another element type (OtherT); a, when given, is
// passed through unchanged as the second load() argument.
52 template<typename T> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *x) { load(x); }
53 template<typename T> template<typename A> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *x, A a) { load(x, a); }
54 template<typename T> template<typename OtherT> Vc_ALWAYS_INLINE Vector<T>::Vector(const OtherT *x) { load(x); }
55 template<typename T> template<typename OtherT, typename A> Vc_ALWAYS_INLINE Vector<T>::Vector(const OtherT *x, A a) { load(x, a); }
57 ///////////////////////////////////////////////////////////////////////////////////////////
58 // load member functions {{{1
// load() from an EntryType pointer without an explicit alignment argument.
59 template<typename T> Vc_INTRINSIC void Vector<T>::load(const EntryType *mem)
// load() from an EntryType pointer with an alignment flag: the flag is handed to HV::load
// and the result is written to d.v().
64 template<typename T> template<typename A> Vc_INTRINSIC void Vector<T>::load(const EntryType *mem, A align)
66 d.v() = HV::load(mem, align);
// load() from a pointer to a different element type (OtherT), no explicit alignment argument.
69 template<typename T> template<typename OtherT> Vc_INTRINSIC void Vector<T>::load(const OtherT *mem)
// LoadHelper<DstT, SrcT, Flags>: converting loads. Only specializations are defined; each
// provides a static load(mem, f) that reads SrcT elements and returns them in the register
// type used for DstT.
75 template<typename DstT, typename SrcT, typename Flags> struct LoadHelper;
// float <- double: loads two m256d registers (mem[0..3] and mem[4..7]), converts each to
// four floats with _mm256_cvtpd_ps and concatenates the two halves.
78 template<typename Flags> struct LoadHelper<float, double, Flags> {
79 static m256 load(const double *mem, Flags f)
81 return concat(_mm256_cvtpd_ps(VectorHelper<m256d>::load(&mem[0], f)),
82 _mm256_cvtpd_ps(VectorHelper<m256d>::load(&mem[4], f)));
// float <- unsigned int: 256-bit integer load followed by StaticCastHelper<unsigned int, float>.
85 template<typename Flags> struct LoadHelper<float, unsigned int, Flags> {
86 static m256 load(const unsigned int *mem, Flags f)
88 return StaticCastHelper<unsigned int, float>::cast(VectorHelper<m256i>::load(mem, f));
// float <- int: 256-bit integer load followed by StaticCastHelper<int, float>.
91 template<typename Flags> struct LoadHelper<float, int, Flags> {
92 static m256 load(const int *mem, Flags f)
94 return StaticCastHelper<int, float>::cast(VectorHelper<m256i>::load(mem, f));
// float <- unsigned short: 128-bit integer load followed by StaticCastHelper<unsigned short, float>.
97 template<typename Flags> struct LoadHelper<float, unsigned short, Flags> {
98 static m256 load(const unsigned short *mem, Flags f)
100 return StaticCastHelper<unsigned short, float>::cast(VectorHelper<m128i>::load(mem, f));
// float <- short: 128-bit integer load followed by StaticCastHelper<short, float>.
103 template<typename Flags> struct LoadHelper<float, short, Flags> {
104 static m256 load(const short *mem, Flags f)
106 return StaticCastHelper<short, float>::cast(VectorHelper<m128i>::load(mem, f));
// float <- unsigned char: goes through the (unsigned int <- unsigned char) helper and then
// casts unsigned int to float.
109 template<typename Flags> struct LoadHelper<float, unsigned char, Flags> {
110 static m256 load(const unsigned char *mem, Flags f)
112 return StaticCastHelper<unsigned int, float>::cast(LoadHelper<unsigned int, unsigned char, Flags>::load(mem, f));
// float <- signed char: goes through the (int <- signed char) helper and then casts int to float.
115 template<typename Flags> struct LoadHelper<float, signed char, Flags> {
116 static m256 load(const signed char *mem, Flags f)
118 return StaticCastHelper<int, float>::cast(LoadHelper<int, signed char, Flags>::load(mem, f));
// sfloat reuses every float specialization by inheriting from it.
122 template<typename SrcT, typename Flags> struct LoadHelper<sfloat, SrcT, Flags> : public LoadHelper<float, SrcT, Flags> {};
// Converting loads with destination type int.
// int <- unsigned int: a plain 256-bit integer load; no conversion is applied.
125 template<typename Flags> struct LoadHelper<int, unsigned int, Flags> {
126 static m256i load(const unsigned int *mem, Flags f)
128 return VectorHelper<m256i>::load(mem, f);
// int <- unsigned short: 128-bit load, then widened with the (unsigned short -> unsigned int) cast.
131 template<typename Flags> struct LoadHelper<int, unsigned short, Flags> {
132 static m256i load(const unsigned short *mem, Flags f)
134 return StaticCastHelper<unsigned short, unsigned int>::cast(VectorHelper<m128i>::load(mem, f));
// int <- short: 128-bit load, then widened with the (short -> int) cast.
137 template<typename Flags> struct LoadHelper<int, short, Flags> {
138 static m256i load(const short *mem, Flags f)
140 return StaticCastHelper<short, int>::cast(VectorHelper<m128i>::load(mem, f));
// int <- unsigned char: the Flags argument is ignored. Eight bytes are read with
// _mm_loadl_epi64, zero-extended to 16 bit (_mm_cvtepu8_epi16) and then widened again.
143 template<typename Flags> struct LoadHelper<int, unsigned char, Flags> {
144 static m256i load(const unsigned char *mem, Flags)
146 // the only available streaming load loads 16 bytes - twice as much as we need => can't use
147 // it, or we risk an out-of-bounds read and an unaligned load exception
148 const m128i epu8 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem));
149 const m128i epu16 = _mm_cvtepu8_epi16(epu8);
150 return StaticCastHelper<unsigned short, unsigned int>::cast(epu16);
// int <- signed char: same scheme as above, but sign-extending (_mm_cvtepi8_epi16, then the
// short -> int cast).
153 template<typename Flags> struct LoadHelper<int, signed char, Flags> {
154 static m256i load(const signed char *mem, Flags)
156 // the only available streaming load loads 16 bytes - twice as much as we need => can't use
157 // it, or we risk an out-of-bounds read and an unaligned load exception
158 const m128i epi8 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem));
159 const m128i epi16 = _mm_cvtepi8_epi16(epi8);
160 return StaticCastHelper<short, int>::cast(epi16);
// Converting loads with destination type unsigned int.
// unsigned int <- unsigned short: 128-bit load, widened with the (unsigned short -> unsigned int) cast.
165 template<typename Flags> struct LoadHelper<unsigned int, unsigned short, Flags> {
166 static m256i load(const unsigned short *mem, Flags f)
168 return StaticCastHelper<unsigned short, unsigned int>::cast(VectorHelper<m128i>::load(mem, f));
// unsigned int <- unsigned char: the Flags argument is ignored. Eight bytes are read with
// _mm_loadl_epi64, zero-extended to 16 bit and then widened to 32 bit.
171 template<typename Flags> struct LoadHelper<unsigned int, unsigned char, Flags> {
172 static m256i load(const unsigned char *mem, Flags)
174 // the only available streaming load loads 16 bytes - twice as much as we need => can't use
175 // it, or we risk an out-of-bounds read and an unaligned load exception
176 const m128i epu8 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem));
177 const m128i epu16 = _mm_cvtepu8_epi16(epu8);
178 return StaticCastHelper<unsigned short, unsigned int>::cast(epu16);
// Converting loads with destination type short (results are 128-bit registers).
// short <- unsigned short: 128-bit load passed through the (unsigned short -> short) cast.
183 template<typename Flags> struct LoadHelper<short, unsigned short, Flags> {
184 static m128i load(const unsigned short *mem, Flags f)
186 return StaticCastHelper<unsigned short, short>::cast(VectorHelper<m128i>::load(mem, f));
// short <- unsigned char: the Flags argument is ignored; eight bytes are read and zero-extended
// to 16 bit.
189 template<typename Flags> struct LoadHelper<short, unsigned char, Flags> {
190 static m128i load(const unsigned char *mem, Flags)
192 // the only available streaming load loads 16 bytes - twice as much as we need => can't use
193 // it, or we risk an out-of-bounds read and an unaligned load exception
194 const m128i epu8 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem));
195 return _mm_cvtepu8_epi16(epu8);
// short <- signed char: the Flags argument is ignored; eight bytes are read and sign-extended
// to 16 bit.
198 template<typename Flags> struct LoadHelper<short, signed char, Flags> {
199 static m128i load(const signed char *mem, Flags)
201 // the only available streaming load loads 16 bytes - twice as much as we need => can't use
202 // it, or we risk an out-of-bounds read and an unaligned load exception
203 const m128i epi8 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem));
204 return _mm_cvtepi8_epi16(epi8);
208 // unsigned short {{{2
// unsigned short <- unsigned char: the Flags argument is ignored; eight bytes are read with
// _mm_loadl_epi64 and zero-extended to 16 bit with _mm_cvtepu8_epi16.
209 template<typename Flags> struct LoadHelper<unsigned short, unsigned char, Flags> {
210 static m128i load(const unsigned char *mem, Flags)
212 // the only available streaming load loads 16 bytes - twice as much as we need => can't use
213 // it, or we risk an out-of-bounds read and an unaligned load exception
214 const m128i epu8 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem));
215 return _mm_cvtepu8_epi16(epu8);
219 // general load, implemented via LoadHelper {{{2
// Converting load with flags: dispatches to LoadHelper<DstT, SrcT, Flags>::load and stores
// the returned register in d.v(). A (DstT, SrcT) pair compiles only if a matching LoadHelper
// specialization exists, because the primary template is declared but not defined.
220 template<typename DstT> template<typename SrcT, typename Flags> Vc_INTRINSIC void Vector<DstT>::load(const SrcT *x, Flags f)
222 d.v() = LoadHelper<DstT, SrcT, Flags>::load(x, f);
225 ///////////////////////////////////////////////////////////////////////////////////////////
// setZero(): unmasked variant.
227 template<typename T> Vc_INTRINSIC void Vector<T>::setZero()
// setZero(k): masked variant. The mask is reinterpreted as VectorType and combined with the
// current data through HV::andnot_.
// NOTE(review): presumably andnot_(a, b) computes (~a) & b, i.e. entries selected by k are
// cleared and the others are kept -- confirm against HV::andnot_.
231 template<typename T> Vc_INTRINSIC void Vector<T>::setZero(const Mask &k)
233 data() = HV::andnot_(avx_cast<VectorType>(k.data()), data());
// setQnan(): overwrites the whole vector with the result of _mm256_setallone_pd/_ps.
// setQnan(k): ORs the mask bits into the current data, so entries whose mask bits are all
// set end up with every bit set while the other entries keep their value.
// NOTE(review): the setallone helpers are assumed to return an all-bits-set register (an
// all-ones IEEE pattern is a quiet NaN) -- confirm at their definition.
236 template<> Vc_INTRINSIC void Vector<double>::setQnan()
238 data() = _mm256_setallone_pd();
// double uses the mask's dataD() accessor for the OR.
240 template<> Vc_INTRINSIC void Vector<double>::setQnan(MaskArg k)
242 data() = _mm256_or_pd(data(), k.dataD());
244 template<> Vc_INTRINSIC void Vector<float>::setQnan()
246 data() = _mm256_setallone_ps();
248 template<> Vc_INTRINSIC void Vector<float>::setQnan(MaskArg k)
250 data() = _mm256_or_ps(data(), k.data());
// sfloat: identical to the float implementation.
252 template<> Vc_INTRINSIC void Vector<sfloat>::setQnan()
254 data() = _mm256_setallone_ps();
256 template<> Vc_INTRINSIC void Vector<sfloat>::setQnan(MaskArg k)
258 data() = _mm256_or_ps(data(), k.data());
261 ///////////////////////////////////////////////////////////////////////////////////////////
// store() overloads: all of them forward to HV::store with the vector's data.
// Without an explicit flag the Aligned flag is passed; the masked variants additionally pass
// the mask reinterpreted as VectorType.
263 template<typename T> Vc_INTRINSIC void Vector<T>::store(EntryType *mem) const
265 HV::store(mem, data(), Aligned);
267 template<typename T> Vc_INTRINSIC void Vector<T>::store(EntryType *mem, const Mask &mask) const
269 HV::store(mem, data(), avx_cast<VectorType>(mask.data()), Aligned);
// Variants taking a caller-supplied alignment flag, which is handed to HV::store unchanged.
271 template<typename T> template<typename A> Vc_INTRINSIC void Vector<T>::store(EntryType *mem, A align) const
273 HV::store(mem, data(), align);
275 template<typename T> template<typename A> Vc_INTRINSIC void Vector<T>::store(EntryType *mem, const Mask &mask, A align) const
277 HV::store(mem, data(), avx_cast<VectorType>(mask.data()), align);
280 ///////////////////////////////////////////////////////////////////////////////////////////
281 // expand/merge 1 float_v <=> 2 double_v XXX rationale? remove it for release? XXX {{{1
// Constructors that build a vector from vector(s) of the wider HT::ConcatType.
282 template<typename T> Vc_ALWAYS_INLINE Vc_FLATTEN Vector<T>::Vector(const Vector<typename HT::ConcatType> *a)
// float: reads two source vectors (a[0], a[1]), converts each with _mm256_cvtpd_ps and
// concatenates the two results.
286 template<> Vc_ALWAYS_INLINE Vc_FLATTEN Vector<float>::Vector(const Vector<HT::ConcatType> *a)
287 : d(concat(_mm256_cvtpd_ps(a[0].data()), _mm256_cvtpd_ps(a[1].data())))
// short: reads ONE source vector and packs its low and high 128-bit halves to 16-bit
// entries with _mm_packs_epi32 (signed saturation).
290 template<> Vc_ALWAYS_INLINE Vc_FLATTEN Vector<short>::Vector(const Vector<HT::ConcatType> *a)
291 : d(_mm_packs_epi32(lo128(a->data()), hi128(a->data())))
// unsigned short: same as short but packs with _mm_packus_epi32 (unsigned saturation).
294 template<> Vc_ALWAYS_INLINE Vc_FLATTEN Vector<unsigned short>::Vector(const Vector<HT::ConcatType> *a)
295 : d(_mm_packus_epi32(lo128(a->data()), hi128(a->data())))
// expand(): the inverse of the ConcatType constructors -- writes this vector's entries,
// widened to HT::ConcatType, into the vector(s) pointed to by x.
298 template<typename T> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::expand(Vector<typename HT::ConcatType> *x) const
// float: writes two output vectors; x[0] gets the low 128-bit half converted with
// _mm256_cvtps_pd, x[1] the high half.
302 template<> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<float>::expand(Vector<HT::ConcatType> *x) const
304 x[0].data() = _mm256_cvtps_pd(lo128(d.v()));
305 x[1].data() = _mm256_cvtps_pd(hi128(d.v()));
// short: writes one output vector. The low four entries are sign-extended directly; the upper
// four are first moved to the low half with _mm_unpackhi_epi64(v, v) and then sign-extended.
307 template<> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<short>::expand(Vector<HT::ConcatType> *x) const
309 x[0].data() = concat(_mm_cvtepi16_epi32(d.v()),
310 _mm_cvtepi16_epi32(_mm_unpackhi_epi64(d.v(), d.v())));
// unsigned short: same layout as short, using zero extension (_mm_cvtepu16_epi32).
312 template<> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned short>::expand(Vector<HT::ConcatType> *x) const
314 x[0].data() = concat(_mm_cvtepu16_epi32(d.v()),
315 _mm_cvtepu16_epi32(_mm_unpackhi_epi64(d.v(), d.v())));
318 ///////////////////////////////////////////////////////////////////////////////////////////
// Generic swizzles. The function name spells the source positions (a = X0, b = X1, c = X2,
// d = X3) in destination order, e.g. cdab() uses permute<X2, X3, X0, X1>.
// abcd() is the identity and returns *this by reference; all others return a permuted copy
// produced by Mem::permute on data().
320 template<typename T> Vc_INTRINSIC const Vector<T> Vc_PURE &Vector<T>::abcd() const { return *this; }
321 template<typename T> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::cdab() const { return Mem::permute<X2, X3, X0, X1>(data()); }
322 template<typename T> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::badc() const { return Mem::permute<X1, X0, X3, X2>(data()); }
323 template<typename T> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::aaaa() const { return Mem::permute<X0, X0, X0, X0>(data()); }
324 template<typename T> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::bbbb() const { return Mem::permute<X1, X1, X1, X1>(data()); }
325 template<typename T> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::cccc() const { return Mem::permute<X2, X2, X2, X2>(data()); }
326 template<typename T> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::dddd() const { return Mem::permute<X3, X3, X3, X3>(data()); }
327 template<typename T> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::bcad() const { return Mem::permute<X1, X2, X0, X3>(data()); }
328 template<typename T> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::bcda() const { return Mem::permute<X1, X2, X3, X0>(data()); }
329 template<typename T> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::dabc() const { return Mem::permute<X3, X0, X1, X2>(data()); }
330 template<typename T> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::acbd() const { return Mem::permute<X0, X2, X1, X3>(data()); }
331 template<typename T> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::dbca() const { return Mem::permute<X3, X1, X2, X0>(data()); }
332 template<typename T> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::dcba() const { return Mem::permute<X3, X2, X1, X0>(data()); }
// double_v specializations of the swizzles. These are built from Mem::shuffle128 (selects
// 128-bit halves), Mem::shuffle / Mem::permute, and _mm256_broadcast_sd instead of the
// single generic permute.
// cdab: swaps the two 128-bit halves.
334 template<> Vc_INTRINSIC const double_v Vc_PURE Vector<double>::cdab() const { return Mem::shuffle128<X1, X0>(data(), data()); }
335 template<> Vc_INTRINSIC const double_v Vc_PURE Vector<double>::badc() const { return Mem::permute<X1, X0, X3, X2>(data()); }
// aaaa..dddd: broadcast a single entry, read through d.m(i), to all four lanes.
336 template<> Vc_INTRINSIC const double_v Vc_PURE Vector<double>::aaaa() const { const double &tmp = d.m(0); return _mm256_broadcast_sd(&tmp); }
337 template<> Vc_INTRINSIC const double_v Vc_PURE Vector<double>::bbbb() const { const double &tmp = d.m(1); return _mm256_broadcast_sd(&tmp); }
338 template<> Vc_INTRINSIC const double_v Vc_PURE Vector<double>::cccc() const { const double &tmp = d.m(2); return _mm256_broadcast_sd(&tmp); }
339 template<> Vc_INTRINSIC const double_v Vc_PURE Vector<double>::dddd() const { const double &tmp = d.m(3); return _mm256_broadcast_sd(&tmp); }
// The remaining swizzles first duplicate or swap 128-bit halves with shuffle128 and then pick
// one entry per destination slot with shuffle (X* selects from the first operand, Y* from the second).
340 template<> Vc_INTRINSIC const double_v Vc_PURE Vector<double>::bcad() const { return Mem::shuffle<X1, Y0, X2, Y3>(Mem::shuffle128<X0, X0>(data(), data()), Mem::shuffle128<X1, X1>(data(), data())); }
341 template<> Vc_INTRINSIC const double_v Vc_PURE Vector<double>::bcda() const { return Mem::shuffle<X1, Y0, X3, Y2>(data(), Mem::shuffle128<X1, X0>(data(), data())); }
342 template<> Vc_INTRINSIC const double_v Vc_PURE Vector<double>::dabc() const { return Mem::shuffle<X1, Y0, X3, Y2>(Mem::shuffle128<X1, X0>(data(), data()), data()); }
343 template<> Vc_INTRINSIC const double_v Vc_PURE Vector<double>::acbd() const { return Mem::shuffle<X0, Y0, X3, Y3>(Mem::shuffle128<X0, X0>(data(), data()), Mem::shuffle128<X1, X1>(data(), data())); }
344 template<> Vc_INTRINSIC const double_v Vc_PURE Vector<double>::dbca() const { return Mem::shuffle<X1, Y1, X2, Y2>(Mem::shuffle128<X1, X1>(data(), data()), Mem::shuffle128<X0, X0>(data(), data())); }
// dcba: full reversal, composed from the half swap (cdab) and the pairwise swap (badc).
345 template<> Vc_INTRINSIC const double_v Vc_PURE Vector<double>::dcba() const { return cdab().badc(); }
// VC_SWIZZLES_16BIT_IMPL(T): defines the swizzle specializations for a vector type T with
// eight entries. Each swizzle uses an 8-index Mem::permute that applies the same 4-entry
// pattern to entries 0-3 and, offset by four, to entries 4-7 (e.g. cdab is
// <X2, X3, X0, X1, X6, X7, X4, X5>). The macro is instantiated for short and unsigned short
// and then undefined.
347 #define VC_SWIZZLES_16BIT_IMPL(T) \
348 template<> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::cdab() const { return Mem::permute<X2, X3, X0, X1, X6, X7, X4, X5>(data()); } \
349 template<> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::badc() const { return Mem::permute<X1, X0, X3, X2, X5, X4, X7, X6>(data()); } \
350 template<> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::aaaa() const { return Mem::permute<X0, X0, X0, X0, X4, X4, X4, X4>(data()); } \
351 template<> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::bbbb() const { return Mem::permute<X1, X1, X1, X1, X5, X5, X5, X5>(data()); } \
352 template<> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::cccc() const { return Mem::permute<X2, X2, X2, X2, X6, X6, X6, X6>(data()); } \
353 template<> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::dddd() const { return Mem::permute<X3, X3, X3, X3, X7, X7, X7, X7>(data()); } \
354 template<> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::bcad() const { return Mem::permute<X1, X2, X0, X3, X5, X6, X4, X7>(data()); } \
355 template<> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::bcda() const { return Mem::permute<X1, X2, X3, X0, X5, X6, X7, X4>(data()); } \
356 template<> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::dabc() const { return Mem::permute<X3, X0, X1, X2, X7, X4, X5, X6>(data()); } \
357 template<> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::acbd() const { return Mem::permute<X0, X2, X1, X3, X4, X6, X5, X7>(data()); } \
358 template<> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::dbca() const { return Mem::permute<X3, X1, X2, X0, X7, X5, X6, X4>(data()); } \
359 template<> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::dcba() const { return Mem::permute<X3, X2, X1, X0, X7, X6, X5, X4>(data()); }
360 VC_SWIZZLES_16BIT_IMPL(short)
361 VC_SWIZZLES_16BIT_IMPL(unsigned short)
362 #undef VC_SWIZZLES_16BIT_IMPL
364 ///////////////////////////////////////////////////////////////////////////////////////////
// Division by a single scalar x. When HasVectorDivision is set, x is turned into a
// Vector<T> and the vector/vector overload is used; otherwise the entries are visited one by
// one with for_all_vector_entries.
366 template<typename T> inline Vector<T> &Vector<T>::operator/=(EntryType x)
368 if (HasVectorDivision) {
369 return operator/=(Vector<T>(x));
371 for_all_vector_entries(i,
// Non-assigning form. VC_EXACT_TYPE restricts the overload to arguments TT matching
// DetermineEntryType<T>::Type; the fallback loop divides each entry d.m(i) by x into r.
376 template<typename T> template<typename TT> inline Vc_PURE VC_EXACT_TYPE(TT, typename DetermineEntryType<T>::Type, Vector<T>) Vector<T>::operator/(TT x) const
378 if (HasVectorDivision) {
379 return operator/(Vector<T>(x));
382 for_all_vector_entries(i,
383 r.d.m(i) = d.m(i) / x;
387 // by default, fall back to scalar (per-entry) division
// Generic vector/vector division: iterates over all entries with for_all_vector_entries.
// Types with a dedicated implementation specialize these two functions.
388 template<typename T> inline Vector<T> &Vector<T>::operator/=(const Vector<T> &x)
390 for_all_vector_entries(i,
// Non-assigning form: each result entry is d.m(i) / x.d.m(i).
396 template<typename T> inline Vector<T> Vc_PURE Vector<T>::operator/(const Vector<T> &x) const
399 for_all_vector_entries(i,
400 r.d.m(i) = d.m(i) / x.d.m(i);
404 // specialize division on type
// divInt: signed 32-bit integer division carried out in double precision. Both operands are
// split into 128-bit halves and converted to double (_mm256_cvtepi32_pd); the quotients are
// converted back to int32 with truncation (_mm256_cvttpd_epi32).
405 static Vc_INTRINSIC m256i Vc_CONST divInt(param256i a, param256i b) {
406 const m256d lo1 = _mm256_cvtepi32_pd(lo128(a));
407 const m256d lo2 = _mm256_cvtepi32_pd(lo128(b));
408 const m256d hi1 = _mm256_cvtepi32_pd(hi128(a));
409 const m256d hi2 = _mm256_cvtepi32_pd(hi128(b));
411 _mm256_cvttpd_epi32(_mm256_div_pd(lo1, lo2)),
412 _mm256_cvttpd_epi32(_mm256_div_pd(hi1, hi2))
// Vector<int> division operators: both forward to divInt.
415 template<> inline Vector<int> &Vector<int>::operator/=(const Vector<int> &x)
417 d.v() = divInt(d.v(), x.d.v());
420 template<> inline Vector<int> Vc_PURE Vector<int>::operator/(const Vector<int> &x) const
422 return divInt(d.v(), x.d.v());
// divUInt: unsigned 32-bit integer division carried out in double precision. The operands
// are converted with the SIGNED conversion _mm256_cvtepi32_pd, so values >= 2^31 need the
// corrections described below.
424 static inline m256i Vc_CONST divUInt(param256i a, param256i b) {
425 m256d loa = _mm256_cvtepi32_pd(lo128(a));
426 m256d hia = _mm256_cvtepi32_pd(hi128(a));
427 m256d lob = _mm256_cvtepi32_pd(lo128(b));
428 m256d hib = _mm256_cvtepi32_pd(hi128(b));
429 // if a >= 2^31, the signed conversion to double yields a negative number (i.e. a - 2^32);
430 // to recover the intended value, add 2^32 wherever the converted value is negative
431 loa = _mm256_add_pd(loa, _mm256_and_pd(_mm256_cmp_pd(loa, _mm256_setzero_pd(), _CMP_LT_OS), _mm256_set1_pd(4294967296.)));
432 hia = _mm256_add_pd(hia, _mm256_and_pd(_mm256_cmp_pd(hia, _mm256_setzero_pd(), _CMP_LT_OS), _mm256_set1_pd(4294967296.)));
433 // the same correction is not applied to b: division by b >= 2^31 should be a rare corner
434 // case, and the common case is kept fast instead
436 // one problem remains: a >= 2^31 and b == 1
437 // in that case the converted quotient would be 2^31 instead of a, so the result is
// replaced by a itself in every entry where b compares equal to _mm_setone_epi32()
438 return avx_cast<m256i>(_mm256_blendv_ps(avx_cast<m256>(concat(
439 _mm256_cvttpd_epi32(_mm256_div_pd(loa, lob)),
440 _mm256_cvttpd_epi32(_mm256_div_pd(hia, hib))
441 )), avx_cast<m256>(a), avx_cast<m256>(concat(
442 _mm_cmpeq_epi32(lo128(b), _mm_setone_epi32()),
443 _mm_cmpeq_epi32(hi128(b), _mm_setone_epi32())))));
// Vector<unsigned int> division operators: both forward to divUInt.
445 template<> Vc_ALWAYS_INLINE Vector<unsigned int> &Vector<unsigned int>::operator/=(const Vector<unsigned int> &x)
447 d.v() = divUInt(d.v(), x.d.v());
450 template<> Vc_ALWAYS_INLINE Vector<unsigned int> Vc_PURE Vector<unsigned int>::operator/(const Vector<unsigned int> &x) const
452 return divUInt(d.v(), x.d.v());
// divShort<T>: 16-bit integer division carried out in single precision. Both operands are
// cast to float with StaticCastHelper<T, float>, divided with _mm256_div_ps, and the
// quotient is cast back with StaticCastHelper<float, T>.
454 template<typename T> static inline m128i Vc_CONST divShort(param128i a, param128i b)
456 const m256 r = _mm256_div_ps(StaticCastHelper<T, float>::cast(a),
457 StaticCastHelper<T, float>::cast(b));
458 return StaticCastHelper<float, T>::cast(r);
// Vector<short> division operators: forward to divShort<short>.
460 template<> Vc_ALWAYS_INLINE Vector<short> &Vector<short>::operator/=(const Vector<short> &x)
462 d.v() = divShort<short>(d.v(), x.d.v());
465 template<> Vc_ALWAYS_INLINE Vector<short> Vc_PURE Vector<short>::operator/(const Vector<short> &x) const
467 return divShort<short>(d.v(), x.d.v());
// Vector<unsigned short> division operators: forward to divShort<unsigned short>.
469 template<> Vc_ALWAYS_INLINE Vector<unsigned short> &Vector<unsigned short>::operator/=(const Vector<unsigned short> &x)
471 d.v() = divShort<unsigned short>(d.v(), x.d.v());
474 template<> Vc_ALWAYS_INLINE Vector<unsigned short> Vc_PURE Vector<unsigned short>::operator/(const Vector<unsigned short> &x) const
476 return divShort<unsigned short>(d.v(), x.d.v());
// Floating-point division maps directly onto the hardware divide:
// _mm256_div_ps for float_v and sfloat_v, _mm256_div_pd for double_v.
478 template<> Vc_INTRINSIC float_v &float_v::operator/=(const float_v &x)
480 d.v() = _mm256_div_ps(d.v(), x.d.v());
483 template<> Vc_INTRINSIC float_v Vc_PURE float_v::operator/(const float_v &x) const
485 return _mm256_div_ps(d.v(), x.d.v());
487 template<> Vc_INTRINSIC sfloat_v &sfloat_v::operator/=(const sfloat_v &x)
489 d.v() = _mm256_div_ps(d.v(), x.d.v());
492 template<> Vc_INTRINSIC sfloat_v Vc_PURE sfloat_v::operator/(const sfloat_v &x) const
494 return _mm256_div_ps(d.v(), x.d.v());
496 template<> Vc_INTRINSIC double_v &double_v::operator/=(const double_v &x)
498 d.v() = _mm256_div_pd(d.v(), x.d.v());
501 template<> Vc_INTRINSIC double_v Vc_PURE double_v::operator/(const double_v &x) const
503 return _mm256_div_pd(d.v(), x.d.v());
506 ///////////////////////////////////////////////////////////////////////////////////////////
// OP_IMPL(T, symbol): generates `operator symbol=` and `operator symbol` for Vector<T>
// taking a vector argument (AsArg). Both are implemented entry by entry with
// for_all_vector_entries, applying `symbol` to d.m(i) and x.d.m(i). The macro is used
// for the shift operators << and >> with a per-entry shift count.
508 #define OP_IMPL(T, symbol) \
509 template<> Vc_ALWAYS_INLINE Vector<T> &Vector<T>::operator symbol##=(AsArg x) \
511 for_all_vector_entries(i, d.m(i) symbol##= x.d.m(i); ); \
514 template<> Vc_ALWAYS_INLINE Vc_PURE Vector<T> Vector<T>::operator symbol(AsArg x) const \
517 for_all_vector_entries(i, r.d.m(i) = d.m(i) symbol x.d.m(i); ); \
522 OP_IMPL(unsigned int, <<)
523 OP_IMPL(unsigned int, >>)
526 OP_IMPL(unsigned short, <<)
527 OP_IMPL(unsigned short, >>)
// Shifts by a single int count applied to the whole vector; the work is delegated to
// VectorHelper<T>::shiftRight / shiftLeft. The compound forms store the result in d.v() and
// return a reference to this object (the static_cast to Vector<T>* is applied to `this`).
530 template<typename T> Vc_ALWAYS_INLINE Vector<T> &Vector<T>::operator>>=(int shift) {
531 d.v() = VectorHelper<T>::shiftRight(d.v(), shift);
532 return *static_cast<Vector<T> *>(this);
534 template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T> Vector<T>::operator>>(int shift) const {
535 return VectorHelper<T>::shiftRight(d.v(), shift);
537 template<typename T> Vc_ALWAYS_INLINE Vector<T> &Vector<T>::operator<<=(int shift) {
538 d.v() = VectorHelper<T>::shiftLeft(d.v(), shift);
539 return *static_cast<Vector<T> *>(this);
541 template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T> Vector<T>::operator<<(int shift) const {
542 return VectorHelper<T>::shiftLeft(d.v(), shift);
// OP_IMPL(T, symbol, fun): generates `operator symbol=` and `operator symbol` for Vector<T>
// as a single call to HV::fun on the two registers. It is used below to map &, | and ^ onto
// HV::and_, HV::or_ and HV::xor_ for the integer and the floating-point vector types.
545 #define OP_IMPL(T, symbol, fun) \
546 template<> Vc_ALWAYS_INLINE Vector<T> &Vector<T>::operator symbol##=(AsArg x) { d.v() = HV::fun(d.v(), x.d.v()); return *this; } \
547 template<> Vc_ALWAYS_INLINE Vc_PURE Vector<T> Vector<T>::operator symbol(AsArg x) const { return Vector<T>(HV::fun(d.v(), x.d.v())); }
548 OP_IMPL(int, &, and_)
550 OP_IMPL(int, ^, xor_)
551 OP_IMPL(unsigned int, &, and_)
552 OP_IMPL(unsigned int, |, or_)
553 OP_IMPL(unsigned int, ^, xor_)
554 OP_IMPL(short, &, and_)
555 OP_IMPL(short, |, or_)
556 OP_IMPL(short, ^, xor_)
557 OP_IMPL(unsigned short, &, and_)
558 OP_IMPL(unsigned short, |, or_)
559 OP_IMPL(unsigned short, ^, xor_)
560 OP_IMPL(float, &, and_)
561 OP_IMPL(float, |, or_)
562 OP_IMPL(float, ^, xor_)
563 OP_IMPL(sfloat, &, and_)
564 OP_IMPL(sfloat, |, or_)
565 OP_IMPL(sfloat, ^, xor_)
566 OP_IMPL(double, &, and_)
567 OP_IMPL(double, |, or_)
568 OP_IMPL(double, ^, xor_)
572 #include "../common/operators.h"
// isNegative(): builds a mask from the sign bits. The data is ANDed with the sign-bit mask
// (_mm256_setsignmask_ps/_pd), then every 32-bit lane is shifted right arithmetically by 31
// so that a lane whose top bit is set becomes all ones and any other lane becomes zero.
574 template<> Vc_INTRINSIC Vc_PURE float_m float_v::isNegative() const
576 return avx_cast<m256>(_mm256_srai_epi32(avx_cast<m256i>(_mm256_and_ps(_mm256_setsignmask_ps(), d.v())), 31));
578 template<> Vc_INTRINSIC Vc_PURE sfloat_m sfloat_v::isNegative() const
580 return avx_cast<m256>(_mm256_srai_epi32(avx_cast<m256i>(_mm256_and_ps(_mm256_setsignmask_ps(), d.v())), 31));
// double: the shift still works on 32-bit lanes, so only the upper 32-bit word of each double
// carries the sign result. permute<X1, X1, X3, X3> then copies the odd 32-bit words over the
// even ones.
// NOTE(review): this assumes Mem::permute on m256 acts within each 128-bit half, so that both
// words of every 64-bit entry end up equal -- confirm against Mem::permute.
582 template<> Vc_INTRINSIC Vc_PURE double_m double_v::isNegative() const
584 return Mem::permute<X1, X1, X3, X3>(avx_cast<m256>(
585 _mm256_srai_epi32(avx_cast<m256i>(_mm256_and_pd(_mm256_setsignmask_pd(), d.v())), 31)
589 // Better implementation (hopefully) with _mm256_set_
590 //X template<typename T> template<typename Index> Vector<T>::Vector(const EntryType *mem, const Index *indexes)
592 //X for_all_vector_entries(int i,
593 //X d.m(i) = mem[indexes[i]];
// Gather constructors: every overload forwards its arguments unchanged to the gather()
// member with the same parameter list.
// Plain array gathers: mem[indexes[i]], with indexes given as a raw pointer or as a Vector<IndexT>.
596 template<typename T> template<typename IndexT> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *mem, const IndexT *indexes)
598 gather(mem, indexes);
600 template<typename T> template<typename IndexT> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector<IndexT>) indexes)
602 gather(mem, indexes);
// Masked variants of the two constructors above.
605 template<typename T> template<typename IndexT> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *mem, const IndexT *indexes, MaskArg mask)
608 gather(mem, indexes, mask);
611 template<typename T> template<typename IndexT> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector<IndexT>) indexes, MaskArg mask)
614 gather(mem, indexes, mask);
// Struct-member gathers: array of S1, selecting the EntryType member `member1` of each
// indexed element (unmasked and masked).
617 template<typename T> template<typename S1, typename IT> Vc_ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
619 gather(array, member1, indexes);
621 template<typename T> template<typename S1, typename IT> Vc_ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask)
624 gather(array, member1, indexes, mask);
// Nested struct-member gathers: member1 selects an S2 inside S1, member2 an EntryType inside S2.
626 template<typename T> template<typename S1, typename S2, typename IT> Vc_ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
628 gather(array, member1, member2, indexes);
630 template<typename T> template<typename S1, typename S2, typename IT> Vc_ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask)
633 gather(array, member1, member2, indexes, mask);
// Pointer-member gathers: ptrMember1 is a pointer-to-EntryType member of S1; two index sets
// (outerIndexes, innerIndexes) are passed along.
635 template<typename T> template<typename S1, typename IT1, typename IT2> Vc_ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
637 gather(array, ptrMember1, outerIndexes, innerIndexes);
639 template<typename T> template<typename S1, typename IT1, typename IT2> Vc_ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask)
642 gather(array, ptrMember1, outerIndexes, innerIndexes, mask);
// IndexSizeChecker<T, Size>: compile-time check on the index argument of a gather.
// The primary template accepts any index type and check() does nothing.
645 template<typename T, size_t Size> struct IndexSizeChecker { static void check() {} };
// Specialization for index vectors: check() statically asserts that the index vector has at
// least Size entries.
646 template<typename T, size_t Size> struct IndexSizeChecker<Vector<T>, Size>
648 static void check() {
649 VC_STATIC_ASSERT(Vector<T>::Size >= Size, IndexVector_must_have_greater_or_equal_number_of_entries);
// Unmasked array gathers, one specialization per entry type. Each runs the IndexSizeChecker
// and then builds the register with the matching _mm256_setr_* / _mm_setr_* intrinsic from
// mem[indexes[i]] for i = 0 .. Size-1 (entry i of the result comes from indexes[i]).
// double: four entries.
652 template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<double>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
654 IndexSizeChecker<Index, Size>::check();
655 d.v() = _mm256_setr_pd(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]);
// float: eight entries.
657 template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<float>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
659 IndexSizeChecker<Index, Size>::check();
660 d.v() = _mm256_setr_ps(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]],
661 mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
// sfloat: identical to float.
663 template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<sfloat>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
665 IndexSizeChecker<Index, Size>::check();
666 d.v() = _mm256_setr_ps(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]],
667 mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
// int: eight 32-bit entries.
669 template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<int>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
671 IndexSizeChecker<Index, Size>::check();
672 d.v() = _mm256_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]],
673 mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
// unsigned int: same intrinsic as int.
675 template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned int>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
677 IndexSizeChecker<Index, Size>::check();
678 d.v() = _mm256_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]],
679 mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
// short: eight 16-bit entries in a 128-bit register.
681 template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<short>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
683 IndexSizeChecker<Index, Size>::check();
684 d.v() = _mm_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]],
685 mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
// unsigned short: same intrinsic as short.
687 template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned short>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
689 IndexSizeChecker<Index, Size>::check();
690 d.v() = _mm_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]],
691 mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
// Masked gather used when VC_USE_SET_GATHERS is defined, for index vectors only.
// It copies the indexes, calls setZero(!mask) on the copy, performs an unmasked gather through
// the (mem, indexes) constructor, and writes the result through the masked assignment
// (*this)(mask).
// NOTE(review): presumably setZero(!mask) forces the indexes of inactive entries to 0 so the
// unmasked gather only touches mem[0] for them -- this requires mem[0] to be readable; confirm.
694 #ifdef VC_USE_SET_GATHERS
695 template<typename T> template<typename IT> Vc_ALWAYS_INLINE void Vector<T>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector<IT>) indexes, MaskArg mask)
697 IndexSizeChecker<Vector<IT>, Size>::check();
698 Vector<IT> indexesTmp = indexes;
699 indexesTmp.setZero(!mask);
700 (*this)(mask) = Vector<T>(mem, indexesTmp);
// VC_MASKED_GATHER: body shared by the masked gather functions. It expects `mask`, `d` and
// an `ith_value(i)` macro to be available at the point of expansion and assigns
// d.m(i) = ith_value(i) for entries selected by the mask. Three variants are selectable:
//  - VC_USE_BSF_GATHERS: turns the mask into an int, then repeatedly takes the lowest set
//    bit with _bit_scan_forward, clears it, and fills that entry.
//  - VC_USE_POPCNT_BSF_GATHERS: switches on _mm_popcnt_u32(bits) and fills entries from both
//    ends of the bit mask, using _bit_scan_reverse for the highest and _bit_scan_forward for
//    the lowest set bit and clearing both with an XOR.
//  - otherwise: tests mask.isEmpty() first, then visits every entry and fills those for
//    which mask[i] is set.
704 #ifdef VC_USE_BSF_GATHERS
705 #define VC_MASKED_GATHER \
706 int bits = mask.toInt(); \
708 const int i = _bit_scan_forward(bits); \
709 bits &= ~(1 << i); /* btr? */ \
710 d.m(i) = ith_value(i); \
712 #elif defined(VC_USE_POPCNT_BSF_GATHERS)
713 #define VC_MASKED_GATHER \
714 unsigned int bits = mask.toInt(); \
715 unsigned int low, high = 0; \
716 switch (_mm_popcnt_u32(bits)) { \
718 high = _bit_scan_reverse(bits); \
719 d.m(high) = ith_value(high); \
720 high = (1 << high); \
722 low = _bit_scan_forward(bits); \
723 bits ^= high | (1 << low); \
724 d.m(low) = ith_value(low); \
726 high = _bit_scan_reverse(bits); \
727 d.m(high) = ith_value(high); \
728 high = (1 << high); \
730 low = _bit_scan_forward(bits); \
731 bits ^= high | (1 << low); \
732 d.m(low) = ith_value(low); \
734 high = _bit_scan_reverse(bits); \
735 d.m(high) = ith_value(high); \
736 high = (1 << high); \
738 low = _bit_scan_forward(bits); \
739 bits ^= high | (1 << low); \
740 d.m(low) = ith_value(low); \
742 high = _bit_scan_reverse(bits); \
743 d.m(high) = ith_value(high); \
745 low = _bit_scan_forward(bits); \
746 d.m(low) = ith_value(low); \
751 #define VC_MASKED_GATHER \
752 if (mask.isEmpty()) { \
755 for_all_vector_entries(i, \
756 if (mask[i]) d.m(i) = ith_value(i); \
// Masked gather from a flat array: ith_value(i) is defined as mem[indexes[i]].
// NOTE(review): the statement that consumes ith_value (presumably an expansion of VC_MASKED_GATHER,
// which reads `mask`) and the matching #undef are not visible in this excerpt — confirm.
760 template<typename T> template<typename Index>
761 Vc_INTRINSIC void Vector<T>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes, MaskArg mask)
763 IndexSizeChecker<Index, Size>::check();
764 #define ith_value(_i_) (mem[indexes[_i_]])
769 template<> template<typename S1, typename IT>
770 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<double>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
772 IndexSizeChecker<IT, Size>::check();
773 d.v() = _mm256_setr_pd(array[indexes[0]].*(member1), array[indexes[1]].*(member1),
774 array[indexes[2]].*(member1), array[indexes[3]].*(member1));
776 template<> template<typename S1, typename IT>
777 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<float>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
779 IndexSizeChecker<IT, Size>::check();
780 d.v() = _mm256_setr_ps(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
781 array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1),
782 array[indexes[6]].*(member1), array[indexes[7]].*(member1));
784 template<> template<typename S1, typename IT>
785 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<sfloat>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
787 IndexSizeChecker<IT, Size>::check();
788 d.v() = _mm256_setr_ps(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
789 array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1),
790 array[indexes[6]].*(member1), array[indexes[7]].*(member1));
792 template<> template<typename S1, typename IT>
793 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<int>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
795 IndexSizeChecker<IT, Size>::check();
796 d.v() = _mm256_setr_epi32(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
797 array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1),
798 array[indexes[6]].*(member1), array[indexes[7]].*(member1));
800 template<> template<typename S1, typename IT>
801 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned int>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
803 IndexSizeChecker<IT, Size>::check();
804 d.v() = _mm256_setr_epi32(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
805 array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1),
806 array[indexes[6]].*(member1), array[indexes[7]].*(member1));
808 template<> template<typename S1, typename IT>
809 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<short>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
811 IndexSizeChecker<IT, Size>::check();
812 d.v() = _mm_setr_epi16(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
813 array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1),
814 array[indexes[6]].*(member1), array[indexes[7]].*(member1));
816 template<> template<typename S1, typename IT>
817 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned short>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
819 IndexSizeChecker<IT, Size>::check();
820 d.v() = _mm_setr_epi16(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
821 array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1),
822 array[indexes[6]].*(member1), array[indexes[7]].*(member1));
// Masked struct-member gather: ith_value(i) is defined as array[indexes[i]].*member1.
// NOTE(review): the statement that consumes ith_value (presumably an expansion of VC_MASKED_GATHER,
// which reads `mask`) and the matching #undef are not visible in this excerpt — confirm.
824 template<typename T> template<typename S1, typename IT>
825 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask)
827 IndexSizeChecker<IT, Size>::check();
828 #define ith_value(_i_) (array[indexes[_i_]].*(member1))
832 template<> template<typename S1, typename S2, typename IT>
833 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<double>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
835 IndexSizeChecker<IT, Size>::check();
836 d.v() = _mm256_setr_pd(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2),
837 array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2));
839 template<> template<typename S1, typename S2, typename IT>
840 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<float>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
842 IndexSizeChecker<IT, Size>::check();
843 d.v() = _mm256_setr_ps(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
844 array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
845 array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
847 template<> template<typename S1, typename S2, typename IT>
848 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<sfloat>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
850 IndexSizeChecker<IT, Size>::check();
851 d.v() = _mm256_setr_ps(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
852 array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
853 array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
855 template<> template<typename S1, typename S2, typename IT>
856 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<int>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
858 IndexSizeChecker<IT, Size>::check();
859 d.v() = _mm256_setr_epi32(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
860 array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
861 array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
863 template<> template<typename S1, typename S2, typename IT>
864 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned int>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
866 IndexSizeChecker<IT, Size>::check();
867 d.v() = _mm256_setr_epi32(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
868 array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
869 array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
871 template<> template<typename S1, typename S2, typename IT>
872 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<short>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
874 IndexSizeChecker<IT, Size>::check();
875 d.v() = _mm_setr_epi16(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
876 array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
877 array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
879 template<> template<typename S1, typename S2, typename IT>
880 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned short>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
882 IndexSizeChecker<IT, Size>::check();
883 d.v() = _mm_setr_epi16(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
884 array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
885 array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
// Masked nested-member gather: ith_value(i) is defined as array[indexes[i]].*member1.*member2.
// NOTE(review): the statement that consumes ith_value (presumably an expansion of VC_MASKED_GATHER,
// which reads `mask`) and the matching #undef are not visible in this excerpt — confirm.
887 template<typename T> template<typename S1, typename S2, typename IT>
888 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask)
890 IndexSizeChecker<IT, Size>::check();
891 #define ith_value(_i_) (array[indexes[_i_]].*(member1).*(member2))
895 template<> template<typename S1, typename IT1, typename IT2>
896 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<double>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
898 IndexSizeChecker<IT1, Size>::check();
899 IndexSizeChecker<IT2, Size>::check();
900 d.v() = _mm256_setr_pd((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
901 (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]);
903 template<> template<typename S1, typename IT1, typename IT2>
904 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<float>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
906 IndexSizeChecker<IT1, Size>::check();
907 IndexSizeChecker<IT2, Size>::check();
908 d.v() = _mm256_setr_ps((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
909 (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]],
910 (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
911 (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
913 template<> template<typename S1, typename IT1, typename IT2>
914 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<sfloat>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
916 IndexSizeChecker<IT1, Size>::check();
917 IndexSizeChecker<IT2, Size>::check();
918 d.v() = _mm256_setr_ps((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
919 (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]],
920 (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
921 (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
923 template<> template<typename S1, typename IT1, typename IT2>
924 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<int>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
926 IndexSizeChecker<IT1, Size>::check();
927 IndexSizeChecker<IT2, Size>::check();
928 d.v() = _mm256_setr_epi32((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
929 (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]],
930 (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
931 (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
933 template<> template<typename S1, typename IT1, typename IT2>
934 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned int>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
936 IndexSizeChecker<IT1, Size>::check();
937 IndexSizeChecker<IT2, Size>::check();
938 d.v() = _mm256_setr_epi32((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
939 (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]],
940 (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
941 (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
943 template<> template<typename S1, typename IT1, typename IT2>
944 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<short>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
946 IndexSizeChecker<IT1, Size>::check();
947 IndexSizeChecker<IT2, Size>::check();
948 d.v() = _mm_setr_epi16((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
949 (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]],
950 (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
951 (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
953 template<> template<typename S1, typename IT1, typename IT2>
954 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned short>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
956 IndexSizeChecker<IT1, Size>::check();
957 IndexSizeChecker<IT2, Size>::check();
958 d.v() = _mm_setr_epi16((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
959 (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]],
960 (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
961 (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
// Masked two-level gather: ith_value(i) is defined as
// (array[outerIndexes[i]].*ptrMember1)[innerIndexes[i]].
// NOTE(review): the statement that consumes ith_value (presumably an expansion of VC_MASKED_GATHER,
// which reads `mask`) and the matching #undef are not visible in this excerpt — confirm.
963 template<typename T> template<typename S1, typename IT1, typename IT2>
964 Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask)
966 IndexSizeChecker<IT1, Size>::check();
967 IndexSizeChecker<IT2, Size>::check();
968 #define ith_value(_i_) (array[outerIndexes[_i_]].*(ptrMember1))[innerIndexes[_i_]]
973 #undef VC_MASKED_GATHER
// VC_MASKED_SCATTER: statement sequence used by the masked scatter() overloads. The expansion site
// must provide `mask`, `d` and a function-like macro ith_value(i) that yields an assignable
// destination; the macro assigns ith_value(i) = d.m(i) for entries selected by the mask.
// Three alternative definitions exist:
//  - VC_USE_BSF_SCATTERS: finds the lowest set bit of mask.toInt() with _bit_scan_forward, removes
//    it with XOR and stores that entry.
//  - VC_USE_POPCNT_BSF_SCATTERS: switches on _mm_popcnt_u32 of the mask bits and alternately handles
//    the highest set bit (_bit_scan_reverse) and the lowest set bit (_bit_scan_forward), removing
//    both from `bits` with XOR.
//  - third definition (presumably the #else branch): checks mask.isEmpty() and then tests mask[i]
//    per entry inside for_all_vector_entries.
// NOTE(review): the loop header, the case labels and the #else/#endif lines are not visible in this
// excerpt — confirm against the complete file before editing these macros.
974 #ifdef VC_USE_BSF_SCATTERS
975 #define VC_MASKED_SCATTER \
976 int bits = mask.toInt(); \
978 const int i = _bit_scan_forward(bits); \
979 bits ^= (1 << i); /* btr? */ \
980 ith_value(i) = d.m(i); \
982 #elif defined(VC_USE_POPCNT_BSF_SCATTERS)
983 #define VC_MASKED_SCATTER \
984 unsigned int bits = mask.toInt(); \
985 unsigned int low, high = 0; \
986 switch (_mm_popcnt_u32(bits)) { \
988 high = _bit_scan_reverse(bits); \
989 ith_value(high) = d.m(high); \
990 high = (1 << high); \
992 low = _bit_scan_forward(bits); \
993 bits ^= high | (1 << low); \
994 ith_value(low) = d.m(low); \
996 high = _bit_scan_reverse(bits); \
997 ith_value(high) = d.m(high); \
998 high = (1 << high); \
1000 low = _bit_scan_forward(bits); \
1001 bits ^= high | (1 << low); \
1002 ith_value(low) = d.m(low); \
1004 high = _bit_scan_reverse(bits); \
1005 ith_value(high) = d.m(high); \
1006 high = (1 << high); \
1008 low = _bit_scan_forward(bits); \
1009 bits ^= high | (1 << low); \
1010 ith_value(low) = d.m(low); \
1012 high = _bit_scan_reverse(bits); \
1013 ith_value(high) = d.m(high); \
1015 low = _bit_scan_forward(bits); \
1016 ith_value(low) = d.m(low); \
1021 #define VC_MASKED_SCATTER \
1022 if (mask.isEmpty()) { \
1025 for_all_vector_entries(i, \
1026 if (mask[i]) ith_value(i) = d.m(i); \
1030 template<typename T> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) const
1032 for_all_vector_entries(i,
1033 mem[indexes[i]] = d.m(i);
1036 #if defined(VC_MSVC) && VC_MSVC >= 170000000
1037 // MSVC miscompiles the store mem[indexes[1]] = d.m(1) for T = (u)short
1038 template<> template<typename Index> Vc_ALWAYS_INLINE void short_v::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) const
1040 const unsigned int tmp = d.v()._d.m128i_u32[0];
1041 mem[indexes[0]] = tmp & 0xffff;
1042 mem[indexes[1]] = tmp >> 16;
1043 mem[indexes[2]] = _mm_extract_epi16(d.v(), 2);
1044 mem[indexes[3]] = _mm_extract_epi16(d.v(), 3);
1045 mem[indexes[4]] = _mm_extract_epi16(d.v(), 4);
1046 mem[indexes[5]] = _mm_extract_epi16(d.v(), 5);
1047 mem[indexes[6]] = _mm_extract_epi16(d.v(), 6);
1048 mem[indexes[7]] = _mm_extract_epi16(d.v(), 7);
1050 template<> template<typename Index> Vc_ALWAYS_INLINE void ushort_v::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) const
1052 const unsigned int tmp = d.v()._d.m128i_u32[0];
1053 mem[indexes[0]] = tmp & 0xffff;
1054 mem[indexes[1]] = tmp >> 16;
1055 mem[indexes[2]] = _mm_extract_epi16(d.v(), 2);
1056 mem[indexes[3]] = _mm_extract_epi16(d.v(), 3);
1057 mem[indexes[4]] = _mm_extract_epi16(d.v(), 4);
1058 mem[indexes[5]] = _mm_extract_epi16(d.v(), 5);
1059 mem[indexes[6]] = _mm_extract_epi16(d.v(), 6);
1060 mem[indexes[7]] = _mm_extract_epi16(d.v(), 7);
// Masked scatter into a flat array: ith_value(i) is defined as the destination mem[indexes[i]].
// NOTE(review): the statement that consumes ith_value (presumably an expansion of VC_MASKED_SCATTER,
// which reads `mask`) and the matching #undef are not visible in this excerpt — confirm.
1063 template<typename T> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes, MaskArg mask) const
1065 #define ith_value(_i_) mem[indexes[_i_]]
1069 template<typename T> template<typename S1, typename IT> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(S1 *array, EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) const
1071 for_all_vector_entries(i,
1072 array[indexes[i]].*(member1) = d.m(i);
// Masked struct-member scatter: ith_value(i) is defined as the destination
// array[indexes[i]].*member1.
// NOTE(review): the statement that consumes ith_value (presumably an expansion of VC_MASKED_SCATTER,
// which reads `mask`) and the matching #undef are not visible in this excerpt — confirm.
1075 template<typename T> template<typename S1, typename IT> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(S1 *array, EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) const
1077 #define ith_value(_i_) array[indexes[_i_]].*(member1)
1081 template<typename T> template<typename S1, typename S2, typename IT> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) const
1083 for_all_vector_entries(i,
1084 array[indexes[i]].*(member1).*(member2) = d.m(i);
// Masked nested-member scatter: ith_value(i) is defined as the destination
// array[indexes[i]].*member1.*member2.
// NOTE(review): the statement that consumes ith_value (presumably an expansion of VC_MASKED_SCATTER,
// which reads `mask`) and the matching #undef are not visible in this excerpt — confirm.
1087 template<typename T> template<typename S1, typename S2, typename IT> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) const
1089 #define ith_value(_i_) array[indexes[_i_]].*(member1).*(member2)
1093 template<typename T> template<typename S1, typename IT1, typename IT2> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(S1 *array, EntryType *S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) const
1095 for_all_vector_entries(i,
1096 (array[innerIndexes[i]].*(ptrMember1))[outerIndexes[i]] = d.m(i);
// Masked two-level scatter: ith_value(i) is defined as the destination
// (array[outerIndexes[i]].*ptrMember1)[innerIndexes[i]].
// NOTE(review): the statement that consumes ith_value (presumably an expansion of VC_MASKED_SCATTER,
// which reads `mask`) and the matching #undef are not visible in this excerpt — confirm.
1099 template<typename T> template<typename S1, typename IT1, typename IT2> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(S1 *array, EntryType *S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask) const
1101 #define ith_value(_i_) (array[outerIndexes[_i_]].*(ptrMember1))[innerIndexes[_i_]]
1106 ///////////////////////////////////////////////////////////////////////////////////////////
1108 template<> Vc_ALWAYS_INLINE Vector<double> Vc_PURE Vc_FLATTEN Vector<double>::operator-() const
1110 return _mm256_xor_pd(d.v(), _mm256_setsignmask_pd());
1112 template<> Vc_ALWAYS_INLINE Vector<float> Vc_PURE Vc_FLATTEN Vector<float>::operator-() const
1114 return _mm256_xor_ps(d.v(), _mm256_setsignmask_ps());
1116 template<> Vc_ALWAYS_INLINE Vector<sfloat> Vc_PURE Vc_FLATTEN Vector<sfloat>::operator-() const
1118 return _mm256_xor_ps(d.v(), _mm256_setsignmask_ps());
1120 template<> Vc_ALWAYS_INLINE Vector<int> Vc_PURE Vc_FLATTEN Vector<int>::operator-() const
1122 return _mm256_sign_epi32(d.v(), _mm256_setallone_si256());
1124 template<> Vc_ALWAYS_INLINE Vector<int> Vc_PURE Vc_FLATTEN Vector<unsigned int>::operator-() const
1126 return _mm256_sign_epi32(d.v(), _mm256_setallone_si256());
1128 template<> Vc_ALWAYS_INLINE Vector<short> Vc_PURE Vc_FLATTEN Vector<short>::operator-() const
1130 return _mm_sign_epi16(d.v(), _mm_setallone_si128());
1132 template<> Vc_ALWAYS_INLINE Vector<short> Vc_PURE Vc_FLATTEN Vector<unsigned short>::operator-() const
1134 return _mm_sign_epi16(d.v(), _mm_setallone_si128());
1137 ///////////////////////////////////////////////////////////////////////////////////////////
1138 // horizontal ops {{{1
// Masked horizontal minimum. The temporary starts out as std::numeric_limits<Vector<T> >::max().
// NOTE(review): the statements that use the mask `m` and produce the return value are not visible
// in this excerpt — presumably the selected entries of *this are copied into tmp before it is
// reduced; confirm.
1139 template<typename T> Vc_ALWAYS_INLINE typename Vector<T>::EntryType Vector<T>::min(MaskArg m) const
1141 Vector<T> tmp = std::numeric_limits<Vector<T> >::max();
// Masked horizontal maximum. The temporary starts out as std::numeric_limits<Vector<T> >::min().
// NOTE(review): the statements that use the mask `m` and produce the return value are not visible
// in this excerpt — presumably the selected entries of *this are copied into tmp before it is
// reduced; confirm.
1145 template<typename T> Vc_ALWAYS_INLINE typename Vector<T>::EntryType Vector<T>::max(MaskArg m) const
1147 Vector<T> tmp = std::numeric_limits<Vector<T> >::min();
// Masked horizontal product. The temporary is constructed with VectorSpecialInitializerOne::One
// and the function returns tmp.product().
// NOTE(review): the statement that uses the mask `m` is not visible in this excerpt — presumably
// the selected entries of *this are copied into tmp before the reduction; confirm.
1151 template<typename T> Vc_ALWAYS_INLINE typename Vector<T>::EntryType Vector<T>::product(MaskArg m) const
1153 Vector<T> tmp(VectorSpecialInitializerOne::One);
1155 return tmp.product();
// Masked horizontal sum. The temporary is constructed with VectorSpecialInitializerZero::Zero.
// NOTE(review): the statements that use the mask `m` and produce the return value are not visible
// in this excerpt — presumably the selected entries of *this are copied into tmp before it is
// reduced; confirm.
1157 template<typename T> Vc_ALWAYS_INLINE typename Vector<T>::EntryType Vector<T>::sum(MaskArg m) const
1159 Vector<T> tmp(VectorSpecialInitializerZero::Zero);
1164 template<> Vc_INTRINSIC Vector<float> Vector<float>::copySign(Vector<float>::AsArg reference) const
1166 return _mm256_or_ps(
1167 _mm256_and_ps(reference.d.v(), _mm256_setsignmask_ps()),
1168 _mm256_and_ps(d.v(), _mm256_setabsmask_ps())
1171 template<> Vc_INTRINSIC Vector<sfloat> Vector<sfloat>::copySign(Vector<sfloat>::AsArg reference) const
1173 return _mm256_or_ps(
1174 _mm256_and_ps(reference.d.v(), _mm256_setsignmask_ps()),
1175 _mm256_and_ps(d.v(), _mm256_setabsmask_ps())
1178 template<> Vc_INTRINSIC Vector<double> Vector<double>::copySign(Vector<double>::AsArg reference) const
1180 return _mm256_or_pd(
1181 _mm256_and_pd(reference.d.v(), _mm256_setsignmask_pd()),
1182 _mm256_and_pd(d.v(), _mm256_setabsmask_pd())
1186 template<> Vc_INTRINSIC Vector<float> Vector<float>::exponent() const
1188 VC_ASSERT((*this >= 0.f).isFull());
1189 return Internal::exponent(d.v());
1191 template<> Vc_INTRINSIC Vector<sfloat> Vector<sfloat>::exponent() const
1193 VC_ASSERT((*this >= 0.f).isFull());
1194 return Internal::exponent(d.v());
1196 template<> Vc_INTRINSIC Vector<double> Vector<double>::exponent() const
1198 VC_ASSERT((*this >= 0.).isFull());
1199 return Internal::exponent(d.v());
1203 static Vc_ALWAYS_INLINE void _doRandomStep(Vector<unsigned int> &state0,
1204 Vector<unsigned int> &state1)
1206 state0.load(&Vc::RandomState[0]);
1207 state1.load(&Vc::RandomState[uint_v::Size]);
1208 (state1 * 0xdeece66du + 11).store(&Vc::RandomState[uint_v::Size]);
1209 uint_v(_mm256_xor_si256((state0 * 0xdeece66du + 11).data(), _mm256_srli_epi32(state1.data(), 16))).store(&Vc::RandomState[0]);
1212 template<typename T> Vc_ALWAYS_INLINE Vector<T> Vector<T>::Random()
1214 Vector<unsigned int> state0, state1;
1215 _doRandomStep(state0, state1);
1216 return state0.reinterpretCast<Vector<T> >();
1219 template<> Vc_ALWAYS_INLINE Vector<float> Vector<float>::Random()
1221 Vector<unsigned int> state0, state1;
1222 _doRandomStep(state0, state1);
1223 return HT::sub(HV::or_(_cast(_mm256_srli_epi32(state0.data(), 2)), HT::one()), HT::one());
1226 template<> Vc_ALWAYS_INLINE Vector<sfloat> Vector<sfloat>::Random()
1228 Vector<unsigned int> state0, state1;
1229 _doRandomStep(state0, state1);
1230 return HT::sub(HV::or_(_cast(_mm256_srli_epi32(state0.data(), 2)), HT::one()), HT::one());
1233 template<> Vc_ALWAYS_INLINE Vector<double> Vector<double>::Random()
1235 const m256i state = VectorHelper<m256i>::load(&Vc::RandomState[0], Vc::Aligned);
1236 for (size_t k = 0; k < 8; k += 2) {
1237 typedef unsigned long long uint64 Vc_MAY_ALIAS;
1238 const uint64 stateX = *reinterpret_cast<const uint64 *>(&Vc::RandomState[k]);
1239 *reinterpret_cast<uint64 *>(&Vc::RandomState[k]) = (stateX * 0x5deece66dull + 11);
1241 return (Vector<double>(_cast(_mm256_srli_epi64(state, 12))) | One()) - One();
1244 // shifted / rotated {{{1
// VectorShift: helper selected by (sizeof(VectorType), Size, VectorType, EntryType); see the
// call in Vector<T>::shifted(). This specialization handles 4 doubles in a 32-byte register.
// Positive amounts 1..3 map to _mm256_srli_si256 and negative amounts -1..-3 to _mm256_slli_si256,
// each by amount * sizeof(double) bytes; the final return yields an all-zero vector.
// NOTE(review): the switch header and any `case 0` line are not visible in this excerpt — confirm.
1245 template<size_t SIMDWidth, size_t Size, typename VectorType, typename EntryType> struct VectorShift;
1246 template<> struct VectorShift<32, 4, m256d, double>
1248 static Vc_INTRINSIC m256d shifted(param256d v, int amount)
1252 case 1: return avx_cast<m256d>(_mm256_srli_si256(avx_cast<m256i>(v), 1 * sizeof(double)));
1253 case 2: return avx_cast<m256d>(_mm256_srli_si256(avx_cast<m256i>(v), 2 * sizeof(double)));
1254 case 3: return avx_cast<m256d>(_mm256_srli_si256(avx_cast<m256i>(v), 3 * sizeof(double)));
1255 case -1: return avx_cast<m256d>(_mm256_slli_si256(avx_cast<m256i>(v), 1 * sizeof(double)));
1256 case -2: return avx_cast<m256d>(_mm256_slli_si256(avx_cast<m256i>(v), 2 * sizeof(double)));
1257 case -3: return avx_cast<m256d>(_mm256_slli_si256(avx_cast<m256i>(v), 3 * sizeof(double)));
1259 return _mm256_setzero_pd();
// VectorShift specialization for 8 entries in a 32-byte register.
// Positive amounts 1..7 map to _mm256_srli_si256 and negative amounts -1..-7 to _mm256_slli_si256,
// each by amount * sizeof(EntryType) bytes; the final return yields an all-zero vector.
// NOTE(review): the switch header and any `case 0` line are not visible in this excerpt — confirm.
1262 template<typename VectorType, typename EntryType> struct VectorShift<32, 8, VectorType, EntryType>
1264 typedef typename SseVectorType<VectorType>::Type SmallV;
1265 static Vc_INTRINSIC VectorType shifted(VC_ALIGNED_PARAMETER(VectorType) v, int amount)
1269 case 1: return avx_cast<VectorType>(_mm256_srli_si256(avx_cast<m256i>(v), 1 * sizeof(EntryType)));
1270 case 2: return avx_cast<VectorType>(_mm256_srli_si256(avx_cast<m256i>(v), 2 * sizeof(EntryType)));
1271 case 3: return avx_cast<VectorType>(_mm256_srli_si256(avx_cast<m256i>(v), 3 * sizeof(EntryType)));
1272 case 4: return avx_cast<VectorType>(_mm256_srli_si256(avx_cast<m256i>(v), 4 * sizeof(EntryType)));
1273 case 5: return avx_cast<VectorType>(_mm256_srli_si256(avx_cast<m256i>(v), 5 * sizeof(EntryType)));
1274 case 6: return avx_cast<VectorType>(_mm256_srli_si256(avx_cast<m256i>(v), 6 * sizeof(EntryType)));
1275 case 7: return avx_cast<VectorType>(_mm256_srli_si256(avx_cast<m256i>(v), 7 * sizeof(EntryType)));
1276 case -1: return avx_cast<VectorType>(_mm256_slli_si256(avx_cast<m256i>(v), 1 * sizeof(EntryType)));
1277 case -2: return avx_cast<VectorType>(_mm256_slli_si256(avx_cast<m256i>(v), 2 * sizeof(EntryType)));
1278 case -3: return avx_cast<VectorType>(_mm256_slli_si256(avx_cast<m256i>(v), 3 * sizeof(EntryType)));
1279 case -4: return avx_cast<VectorType>(_mm256_slli_si256(avx_cast<m256i>(v), 4 * sizeof(EntryType)));
1280 case -5: return avx_cast<VectorType>(_mm256_slli_si256(avx_cast<m256i>(v), 5 * sizeof(EntryType)));
1281 case -6: return avx_cast<VectorType>(_mm256_slli_si256(avx_cast<m256i>(v), 6 * sizeof(EntryType)));
1282 case -7: return avx_cast<VectorType>(_mm256_slli_si256(avx_cast<m256i>(v), 7 * sizeof(EntryType)));
1284 return avx_cast<VectorType>(_mm256_setzero_ps());
// VectorShift specialization for 8 entries in a 16-byte register.
// Positive amounts 1..7 map to _mm_srli_si128 and negative amounts -1..-7 to _mm_slli_si128,
// each by amount * sizeof(EntryType) bytes; the final return yields an all-zero vector.
// NOTE(review): the switch header and any `case 0` line are not visible in this excerpt — confirm.
1287 template<typename VectorType, typename EntryType> struct VectorShift<16, 8, VectorType, EntryType>
1289 static Vc_INTRINSIC VectorType shifted(VC_ALIGNED_PARAMETER(VectorType) v, int amount)
1293 case 1: return avx_cast<VectorType>(_mm_srli_si128(avx_cast<m128i>(v), 1 * sizeof(EntryType)));
1294 case 2: return avx_cast<VectorType>(_mm_srli_si128(avx_cast<m128i>(v), 2 * sizeof(EntryType)));
1295 case 3: return avx_cast<VectorType>(_mm_srli_si128(avx_cast<m128i>(v), 3 * sizeof(EntryType)));
1296 case 4: return avx_cast<VectorType>(_mm_srli_si128(avx_cast<m128i>(v), 4 * sizeof(EntryType)));
1297 case 5: return avx_cast<VectorType>(_mm_srli_si128(avx_cast<m128i>(v), 5 * sizeof(EntryType)));
1298 case 6: return avx_cast<VectorType>(_mm_srli_si128(avx_cast<m128i>(v), 6 * sizeof(EntryType)));
1299 case 7: return avx_cast<VectorType>(_mm_srli_si128(avx_cast<m128i>(v), 7 * sizeof(EntryType)));
1300 case -1: return avx_cast<VectorType>(_mm_slli_si128(avx_cast<m128i>(v), 1 * sizeof(EntryType)));
1301 case -2: return avx_cast<VectorType>(_mm_slli_si128(avx_cast<m128i>(v), 2 * sizeof(EntryType)));
1302 case -3: return avx_cast<VectorType>(_mm_slli_si128(avx_cast<m128i>(v), 3 * sizeof(EntryType)));
1303 case -4: return avx_cast<VectorType>(_mm_slli_si128(avx_cast<m128i>(v), 4 * sizeof(EntryType)));
1304 case -5: return avx_cast<VectorType>(_mm_slli_si128(avx_cast<m128i>(v), 5 * sizeof(EntryType)));
1305 case -6: return avx_cast<VectorType>(_mm_slli_si128(avx_cast<m128i>(v), 6 * sizeof(EntryType)));
1306 case -7: return avx_cast<VectorType>(_mm_slli_si128(avx_cast<m128i>(v), 7 * sizeof(EntryType)));
1308 return _mm_setzero_si128();
1311 template<typename T> Vc_INTRINSIC Vector<T> Vector<T>::shifted(int amount) const
1313 return VectorShift<sizeof(VectorType), Size, VectorType, EntryType>::shifted(d.v(), amount);
// VectorRotate: helper selected by (sizeof(VectorType), Size, VectorType, EntryType); see the
// call in Vector<T>::rotated(). This specialization handles 4 entries in a 32-byte register.
// The amount is taken modulo 4 after conversion to unsigned. Rotations by 1 and 3 combine the
// two 128-bit halves with _mm_alignr_epi8 (the halves swap roles for 3); a rotation by 2 swaps
// the halves with Mem::permute128<X1, X0>.
// NOTE(review): the `case 0` line is not visible in this excerpt — confirm.
1315 template<size_t SIMDWidth, size_t Size, typename VectorType, typename EntryType> struct VectorRotate;
1316 template<typename VectorType, typename EntryType> struct VectorRotate<32, 4, VectorType, EntryType>
1318 typedef typename SseVectorType<VectorType>::Type SmallV;
1319 static Vc_INTRINSIC VectorType rotated(VC_ALIGNED_PARAMETER(VectorType) v, int amount)
1321 const m128i vLo = avx_cast<m128i>(lo128(v));
1322 const m128i vHi = avx_cast<m128i>(hi128(v));
1323 switch (static_cast<unsigned int>(amount) % 4) {
1325 case 1: return concat(avx_cast<SmallV>(_mm_alignr_epi8(vHi, vLo, 1 * sizeof(EntryType))), avx_cast<SmallV>(_mm_alignr_epi8(vLo, vHi, 1 * sizeof(EntryType))));
1326 case 2: return Mem::permute128<X1, X0>(v);
1327 case 3: return concat(avx_cast<SmallV>(_mm_alignr_epi8(vLo, vHi, 1 * sizeof(EntryType))), avx_cast<SmallV>(_mm_alignr_epi8(vHi, vLo, 1 * sizeof(EntryType))));
1329 return _mm256_setzero_pd();
// VectorRotate specialization for 8 entries in a 32-byte register.
// The amount is taken modulo 8 after conversion to unsigned. Rotations by 1..3 and 5..7 combine
// the two 128-bit halves with _mm_alignr_epi8 (the halves swap roles for 5..7); a rotation by 4
// swaps the halves with Mem::permute128<X1, X0>.
// NOTE(review): the `case 0` line is not visible in this excerpt — confirm.
1332 template<typename VectorType, typename EntryType> struct VectorRotate<32, 8, VectorType, EntryType>
1334 typedef typename SseVectorType<VectorType>::Type SmallV;
1335 static Vc_INTRINSIC VectorType rotated(VC_ALIGNED_PARAMETER(VectorType) v, int amount)
1337 const m128i vLo = avx_cast<m128i>(lo128(v));
1338 const m128i vHi = avx_cast<m128i>(hi128(v));
1339 switch (static_cast<unsigned int>(amount) % 8) {
1341 case 1: return concat(avx_cast<SmallV>(_mm_alignr_epi8(vHi, vLo, 1 * sizeof(EntryType))), avx_cast<SmallV>(_mm_alignr_epi8(vLo, vHi, 1 * sizeof(EntryType))));
1342 case 2: return concat(avx_cast<SmallV>(_mm_alignr_epi8(vHi, vLo, 2 * sizeof(EntryType))), avx_cast<SmallV>(_mm_alignr_epi8(vLo, vHi, 2 * sizeof(EntryType))));
1343 case 3: return concat(avx_cast<SmallV>(_mm_alignr_epi8(vHi, vLo, 3 * sizeof(EntryType))), avx_cast<SmallV>(_mm_alignr_epi8(vLo, vHi, 3 * sizeof(EntryType))));
1344 case 4: return Mem::permute128<X1, X0>(v);
1345 case 5: return concat(avx_cast<SmallV>(_mm_alignr_epi8(vLo, vHi, 1 * sizeof(EntryType))), avx_cast<SmallV>(_mm_alignr_epi8(vHi, vLo, 1 * sizeof(EntryType))));
1346 case 6: return concat(avx_cast<SmallV>(_mm_alignr_epi8(vLo, vHi, 2 * sizeof(EntryType))), avx_cast<SmallV>(_mm_alignr_epi8(vHi, vLo, 2 * sizeof(EntryType))));
1347 case 7: return concat(avx_cast<SmallV>(_mm_alignr_epi8(vLo, vHi, 3 * sizeof(EntryType))), avx_cast<SmallV>(_mm_alignr_epi8(vHi, vLo, 3 * sizeof(EntryType))));
1349 return avx_cast<VectorType>(_mm256_setzero_ps());
// VectorRotate specialization for 8 entries in a 16-byte register.
// The amount is taken modulo 8 after conversion to unsigned; rotations by 1..7 use
// _mm_alignr_epi8(v, v, amount * sizeof(EntryType)), i.e. the register aligned with itself.
// NOTE(review): the `case 0` line is not visible in this excerpt — confirm.
1352 template<typename VectorType, typename EntryType> struct VectorRotate<16, 8, VectorType, EntryType>
1354 static Vc_INTRINSIC VectorType rotated(VC_ALIGNED_PARAMETER(VectorType) v, int amount)
1356 switch (static_cast<unsigned int>(amount) % 8) {
1358 case 1: return avx_cast<VectorType>(_mm_alignr_epi8(v, v, 1 * sizeof(EntryType)));
1359 case 2: return avx_cast<VectorType>(_mm_alignr_epi8(v, v, 2 * sizeof(EntryType)));
1360 case 3: return avx_cast<VectorType>(_mm_alignr_epi8(v, v, 3 * sizeof(EntryType)));
1361 case 4: return avx_cast<VectorType>(_mm_alignr_epi8(v, v, 4 * sizeof(EntryType)));
1362 case 5: return avx_cast<VectorType>(_mm_alignr_epi8(v, v, 5 * sizeof(EntryType)));
1363 case 6: return avx_cast<VectorType>(_mm_alignr_epi8(v, v, 6 * sizeof(EntryType)));
1364 case 7: return avx_cast<VectorType>(_mm_alignr_epi8(v, v, 7 * sizeof(EntryType)));
1366 return _mm_setzero_si128();
// Returns a copy of this vector with its entries rotated by `amount` positions.
// The work is delegated to the VectorRotate specialization selected by the byte size of
// VectorType and the number of entries (Size); the underlying register d.v() is passed through.
1369 template<typename T> Vc_INTRINSIC Vector<T> Vector<T>::rotated(int amount) const
1371 return VectorRotate<sizeof(VectorType), Size, VectorType, EntryType>::rotated(d.v(), amount);
// NOTE(review): as shown in this listing, everything below follows an unconditional return and
// is therefore unreachable. It reads like an earlier inline implementation of the same alignr
// scheme (two 128-bit pieces d.v()[0] / d.v()[1], selector amount % Size) that was kept for
// reference -- possibly commented out or preprocessor-disabled in the complete file.
// Confirm before removing.
1373 const m128i v0 = avx_cast<m128i>(d.v()[0]);
1374 const m128i v1 = avx_cast<m128i>(d.v()[1]);
1375 switch (static_cast<unsigned int>(amount) % Size) {
1376 case 0: return *this;
1377 case 1: return concat(avx_cast<m128>(_mm_alignr_epi8(v1, v0, 1 * sizeof(EntryType))), avx_cast<m128>(_mm_alignr_epi8(v0, v1, 1 * sizeof(EntryType))));
1378 case 2: return concat(avx_cast<m128>(_mm_alignr_epi8(v1, v0, 2 * sizeof(EntryType))), avx_cast<m128>(_mm_alignr_epi8(v0, v1, 2 * sizeof(EntryType))));
1379 case 3: return concat(avx_cast<m128>(_mm_alignr_epi8(v1, v0, 3 * sizeof(EntryType))), avx_cast<m128>(_mm_alignr_epi8(v0, v1, 3 * sizeof(EntryType))));
1380 case 4: return concat(d.v()[1], d.v()[0]);
1381 case 5: return concat(avx_cast<m128>(_mm_alignr_epi8(v0, v1, 1 * sizeof(EntryType))), avx_cast<m128>(_mm_alignr_epi8(v1, v0, 1 * sizeof(EntryType))));
1382 case 6: return concat(avx_cast<m128>(_mm_alignr_epi8(v0, v1, 2 * sizeof(EntryType))), avx_cast<m128>(_mm_alignr_epi8(v1, v0, 2 * sizeof(EntryType))));
1383 case 7: return concat(avx_cast<m128>(_mm_alignr_epi8(v0, v1, 3 * sizeof(EntryType))), avx_cast<m128>(_mm_alignr_epi8(v1, v0, 3 * sizeof(EntryType))));
1390 } // namespace AliRoot
1392 #include "undomacros.h"
1394 // vim: foldmethod=marker