]> git.uio.no Git - u/mrichter/AliRoot.git/blame - Vc/include/Vc/avx/vector.tcc
Vc: update to 0.7.4 release
[u/mrichter/AliRoot.git] / Vc / include / Vc / avx / vector.tcc
CommitLineData
f22341db 1/* This file is part of the Vc library.
2
3 Copyright (C) 2011-2012 Matthias Kretz <kretz@kde.org>
4
5 Vc is free software: you can redistribute it and/or modify
6 it under the terms of the GNU Lesser General Public License as
7 published by the Free Software Foundation, either version 3 of
8 the License, or (at your option) any later version.
9
10 Vc is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with Vc. If not, see <http://www.gnu.org/licenses/>.
17
18*/
19
20#include "limits.h"
21#include "const.h"
22#include "macros.h"
23
c017a39f 24namespace AliRoot {
f22341db 25namespace Vc
26{
27ALIGN(64) extern unsigned int RandomState[16];
28
29namespace AVX
30{
31
32///////////////////////////////////////////////////////////////////////////////////////////
33// constants {{{1
c017a39f 34template<typename T> Vc_ALWAYS_INLINE Vector<T>::Vector(VectorSpecialInitializerZero::ZEnum) : d(HT::zero()) {}
35template<typename T> Vc_ALWAYS_INLINE Vector<T>::Vector(VectorSpecialInitializerOne::OEnum) : d(HT::one()) {}
36template<typename T> Vc_ALWAYS_INLINE Vector<T>::Vector(VectorSpecialInitializerIndexesFromZero::IEnum)
f22341db 37 : d(HV::load(IndexesFromZeroData<T>::address(), Aligned)) {}
38
c017a39f 39template<typename T> Vc_INTRINSIC Vector<T> Vc_CONST Vector<T>::Zero() { return HT::zero(); }
40template<typename T> Vc_INTRINSIC Vector<T> Vc_CONST Vector<T>::One() { return HT::one(); }
41template<typename T> Vc_INTRINSIC Vector<T> Vc_CONST Vector<T>::IndexesFromZero() { return HV::load(IndexesFromZeroData<T>::address(), Aligned); }
f22341db 42
c017a39f 43template<typename T> template<typename T2> Vc_ALWAYS_INLINE Vector<T>::Vector(VC_ALIGNED_PARAMETER(Vector<T2>) x)
f22341db 44 : d(StaticCastHelper<T2, T>::cast(x.data())) {}
45
c017a39f 46template<typename T> Vc_ALWAYS_INLINE Vector<T>::Vector(EntryType x) : d(HT::set(x)) {}
47template<> Vc_ALWAYS_INLINE Vector<double>::Vector(EntryType x) : d(_mm256_set1_pd(x)) {}
f22341db 48
49
50///////////////////////////////////////////////////////////////////////////////////////////
51// load ctors {{{1
c017a39f 52template<typename T> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *x) { load(x); }
53template<typename T> template<typename A> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *x, A a) { load(x, a); }
54template<typename T> template<typename OtherT> Vc_ALWAYS_INLINE Vector<T>::Vector(const OtherT *x) { load(x); }
55template<typename T> template<typename OtherT, typename A> Vc_ALWAYS_INLINE Vector<T>::Vector(const OtherT *x, A a) { load(x, a); }
f22341db 56
57///////////////////////////////////////////////////////////////////////////////////////////
58// load member functions {{{1
c017a39f 59template<typename T> Vc_INTRINSIC void Vector<T>::load(const EntryType *mem)
f22341db 60{
61 load(mem, Aligned);
62}
63
c017a39f 64template<typename T> template<typename A> Vc_INTRINSIC void Vector<T>::load(const EntryType *mem, A align)
f22341db 65{
66 d.v() = HV::load(mem, align);
67}
68
c017a39f 69template<typename T> template<typename OtherT> Vc_INTRINSIC void Vector<T>::load(const OtherT *mem)
f22341db 70{
71 load(mem, Aligned);
72}
73
74// LoadHelper {{{2
75template<typename DstT, typename SrcT, typename Flags> struct LoadHelper;
76
77// float {{{2
78template<typename Flags> struct LoadHelper<float, double, Flags> {
c017a39f 79 static m256 load(const double *mem, Flags f)
f22341db 80 {
c017a39f 81 return concat(_mm256_cvtpd_ps(VectorHelper<m256d>::load(&mem[0], f)),
82 _mm256_cvtpd_ps(VectorHelper<m256d>::load(&mem[4], f)));
f22341db 83 }
84};
85template<typename Flags> struct LoadHelper<float, unsigned int, Flags> {
c017a39f 86 static m256 load(const unsigned int *mem, Flags f)
f22341db 87 {
c017a39f 88 return StaticCastHelper<unsigned int, float>::cast(VectorHelper<m256i>::load(mem, f));
f22341db 89 }
90};
91template<typename Flags> struct LoadHelper<float, int, Flags> {
c017a39f 92 static m256 load(const int *mem, Flags f)
f22341db 93 {
c017a39f 94 return StaticCastHelper<int, float>::cast(VectorHelper<m256i>::load(mem, f));
f22341db 95 }
96};
97template<typename Flags> struct LoadHelper<float, unsigned short, Flags> {
c017a39f 98 static m256 load(const unsigned short *mem, Flags f)
f22341db 99 {
c017a39f 100 return StaticCastHelper<unsigned short, float>::cast(VectorHelper<m128i>::load(mem, f));
f22341db 101 }
102};
103template<typename Flags> struct LoadHelper<float, short, Flags> {
c017a39f 104 static m256 load(const short *mem, Flags f)
f22341db 105 {
c017a39f 106 return StaticCastHelper<short, float>::cast(VectorHelper<m128i>::load(mem, f));
f22341db 107 }
108};
109template<typename Flags> struct LoadHelper<float, unsigned char, Flags> {
c017a39f 110 static m256 load(const unsigned char *mem, Flags f)
f22341db 111 {
112 return StaticCastHelper<unsigned int, float>::cast(LoadHelper<unsigned int, unsigned char, Flags>::load(mem, f));
113 }
114};
115template<typename Flags> struct LoadHelper<float, signed char, Flags> {
c017a39f 116 static m256 load(const signed char *mem, Flags f)
f22341db 117 {
118 return StaticCastHelper<int, float>::cast(LoadHelper<int, signed char, Flags>::load(mem, f));
119 }
120};
121
122template<typename SrcT, typename Flags> struct LoadHelper<sfloat, SrcT, Flags> : public LoadHelper<float, SrcT, Flags> {};
123
124// int {{{2
125template<typename Flags> struct LoadHelper<int, unsigned int, Flags> {
c017a39f 126 static m256i load(const unsigned int *mem, Flags f)
f22341db 127 {
c017a39f 128 return VectorHelper<m256i>::load(mem, f);
f22341db 129 }
130};
131template<typename Flags> struct LoadHelper<int, unsigned short, Flags> {
c017a39f 132 static m256i load(const unsigned short *mem, Flags f)
f22341db 133 {
c017a39f 134 return StaticCastHelper<unsigned short, unsigned int>::cast(VectorHelper<m128i>::load(mem, f));
f22341db 135 }
136};
137template<typename Flags> struct LoadHelper<int, short, Flags> {
c017a39f 138 static m256i load(const short *mem, Flags f)
f22341db 139 {
c017a39f 140 return StaticCastHelper<short, int>::cast(VectorHelper<m128i>::load(mem, f));
f22341db 141 }
142};
143template<typename Flags> struct LoadHelper<int, unsigned char, Flags> {
c017a39f 144 static m256i load(const unsigned char *mem, Flags)
f22341db 145 {
146 // the only available streaming load loads 16 bytes - twice as much as we need => can't use
147 // it, or we risk an out-of-bounds read and an unaligned load exception
c017a39f 148 const m128i epu8 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem));
149 const m128i epu16 = _mm_cvtepu8_epi16(epu8);
f22341db 150 return StaticCastHelper<unsigned short, unsigned int>::cast(epu16);
151 }
152};
153template<typename Flags> struct LoadHelper<int, signed char, Flags> {
c017a39f 154 static m256i load(const signed char *mem, Flags)
f22341db 155 {
156 // the only available streaming load loads 16 bytes - twice as much as we need => can't use
157 // it, or we risk an out-of-bounds read and an unaligned load exception
c017a39f 158 const m128i epi8 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem));
159 const m128i epi16 = _mm_cvtepi8_epi16(epi8);
f22341db 160 return StaticCastHelper<short, int>::cast(epi16);
161 }
162};
163
164// unsigned int {{{2
165template<typename Flags> struct LoadHelper<unsigned int, unsigned short, Flags> {
c017a39f 166 static m256i load(const unsigned short *mem, Flags f)
f22341db 167 {
c017a39f 168 return StaticCastHelper<unsigned short, unsigned int>::cast(VectorHelper<m128i>::load(mem, f));
f22341db 169 }
170};
171template<typename Flags> struct LoadHelper<unsigned int, unsigned char, Flags> {
c017a39f 172 static m256i load(const unsigned char *mem, Flags)
f22341db 173 {
174 // the only available streaming load loads 16 bytes - twice as much as we need => can't use
175 // it, or we risk an out-of-bounds read and an unaligned load exception
c017a39f 176 const m128i epu8 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem));
177 const m128i epu16 = _mm_cvtepu8_epi16(epu8);
f22341db 178 return StaticCastHelper<unsigned short, unsigned int>::cast(epu16);
179 }
180};
181
182// short {{{2
183template<typename Flags> struct LoadHelper<short, unsigned short, Flags> {
c017a39f 184 static m128i load(const unsigned short *mem, Flags f)
f22341db 185 {
c017a39f 186 return StaticCastHelper<unsigned short, short>::cast(VectorHelper<m128i>::load(mem, f));
f22341db 187 }
188};
189template<typename Flags> struct LoadHelper<short, unsigned char, Flags> {
c017a39f 190 static m128i load(const unsigned char *mem, Flags)
f22341db 191 {
192 // the only available streaming load loads 16 bytes - twice as much as we need => can't use
193 // it, or we risk an out-of-bounds read and an unaligned load exception
c017a39f 194 const m128i epu8 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem));
f22341db 195 return _mm_cvtepu8_epi16(epu8);
196 }
197};
198template<typename Flags> struct LoadHelper<short, signed char, Flags> {
c017a39f 199 static m128i load(const signed char *mem, Flags)
f22341db 200 {
201 // the only available streaming load loads 16 bytes - twice as much as we need => can't use
202 // it, or we risk an out-of-bounds read and an unaligned load exception
c017a39f 203 const m128i epi8 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem));
f22341db 204 return _mm_cvtepi8_epi16(epi8);
205 }
206};
207
208// unsigned short {{{2
209template<typename Flags> struct LoadHelper<unsigned short, unsigned char, Flags> {
c017a39f 210 static m128i load(const unsigned char *mem, Flags)
f22341db 211 {
212 // the only available streaming load loads 16 bytes - twice as much as we need => can't use
213 // it, or we risk an out-of-bounds read and an unaligned load exception
c017a39f 214 const m128i epu8 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem));
f22341db 215 return _mm_cvtepu8_epi16(epu8);
216 }
217};
218
219// general load, implemented via LoadHelper {{{2
c017a39f 220template<typename DstT> template<typename SrcT, typename Flags> Vc_INTRINSIC void Vector<DstT>::load(const SrcT *x, Flags f)
f22341db 221{
222 d.v() = LoadHelper<DstT, SrcT, Flags>::load(x, f);
223}
224
225///////////////////////////////////////////////////////////////////////////////////////////
226// zeroing {{{1
c017a39f 227template<typename T> Vc_INTRINSIC void Vector<T>::setZero()
f22341db 228{
229 data() = HV::zero();
230}
c017a39f 231template<typename T> Vc_INTRINSIC void Vector<T>::setZero(const Mask &k)
f22341db 232{
233 data() = HV::andnot_(avx_cast<VectorType>(k.data()), data());
234}
235
c017a39f 236template<> Vc_INTRINSIC void Vector<double>::setQnan()
f22341db 237{
238 data() = _mm256_setallone_pd();
239}
c017a39f 240template<> Vc_INTRINSIC void Vector<double>::setQnan(MaskArg k)
f22341db 241{
242 data() = _mm256_or_pd(data(), k.dataD());
243}
c017a39f 244template<> Vc_INTRINSIC void Vector<float>::setQnan()
f22341db 245{
246 data() = _mm256_setallone_ps();
247}
c017a39f 248template<> Vc_INTRINSIC void Vector<float>::setQnan(MaskArg k)
f22341db 249{
250 data() = _mm256_or_ps(data(), k.data());
251}
c017a39f 252template<> Vc_INTRINSIC void Vector<sfloat>::setQnan()
f22341db 253{
254 data() = _mm256_setallone_ps();
255}
c017a39f 256template<> Vc_INTRINSIC void Vector<sfloat>::setQnan(MaskArg k)
f22341db 257{
258 data() = _mm256_or_ps(data(), k.data());
259}
260
261///////////////////////////////////////////////////////////////////////////////////////////
262// stores {{{1
c017a39f 263template<typename T> Vc_INTRINSIC void Vector<T>::store(EntryType *mem) const
f22341db 264{
265 HV::store(mem, data(), Aligned);
266}
c017a39f 267template<typename T> Vc_INTRINSIC void Vector<T>::store(EntryType *mem, const Mask &mask) const
f22341db 268{
269 HV::store(mem, data(), avx_cast<VectorType>(mask.data()), Aligned);
270}
c017a39f 271template<typename T> template<typename A> Vc_INTRINSIC void Vector<T>::store(EntryType *mem, A align) const
f22341db 272{
273 HV::store(mem, data(), align);
274}
c017a39f 275template<typename T> template<typename A> Vc_INTRINSIC void Vector<T>::store(EntryType *mem, const Mask &mask, A align) const
f22341db 276{
277 HV::store(mem, data(), avx_cast<VectorType>(mask.data()), align);
278}
279
280///////////////////////////////////////////////////////////////////////////////////////////
281// expand/merge 1 float_v <=> 2 double_v XXX rationale? remove it for release? XXX {{{1
c017a39f 282template<typename T> Vc_ALWAYS_INLINE Vc_FLATTEN Vector<T>::Vector(const Vector<typename HT::ConcatType> *a)
f22341db 283 : d(a[0])
284{
285}
c017a39f 286template<> Vc_ALWAYS_INLINE Vc_FLATTEN Vector<float>::Vector(const Vector<HT::ConcatType> *a)
f22341db 287 : d(concat(_mm256_cvtpd_ps(a[0].data()), _mm256_cvtpd_ps(a[1].data())))
288{
289}
c017a39f 290template<> Vc_ALWAYS_INLINE Vc_FLATTEN Vector<short>::Vector(const Vector<HT::ConcatType> *a)
f22341db 291 : d(_mm_packs_epi32(lo128(a->data()), hi128(a->data())))
292{
293}
c017a39f 294template<> Vc_ALWAYS_INLINE Vc_FLATTEN Vector<unsigned short>::Vector(const Vector<HT::ConcatType> *a)
f22341db 295 : d(_mm_packus_epi32(lo128(a->data()), hi128(a->data())))
296{
297}
c017a39f 298template<typename T> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::expand(Vector<typename HT::ConcatType> *x) const
f22341db 299{
300 x[0] = *this;
301}
c017a39f 302template<> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<float>::expand(Vector<HT::ConcatType> *x) const
f22341db 303{
304 x[0].data() = _mm256_cvtps_pd(lo128(d.v()));
305 x[1].data() = _mm256_cvtps_pd(hi128(d.v()));
306}
c017a39f 307template<> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<short>::expand(Vector<HT::ConcatType> *x) const
f22341db 308{
309 x[0].data() = concat(_mm_cvtepi16_epi32(d.v()),
310 _mm_cvtepi16_epi32(_mm_unpackhi_epi64(d.v(), d.v())));
311}
c017a39f 312template<> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned short>::expand(Vector<HT::ConcatType> *x) const
f22341db 313{
314 x[0].data() = concat(_mm_cvtepu16_epi32(d.v()),
315 _mm_cvtepu16_epi32(_mm_unpackhi_epi64(d.v(), d.v())));
316}
317
318///////////////////////////////////////////////////////////////////////////////////////////
319// swizzles {{{1
c017a39f 320template<typename T> Vc_INTRINSIC const Vector<T> Vc_PURE &Vector<T>::abcd() const { return *this; }
321template<typename T> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::cdab() const { return Mem::permute<X2, X3, X0, X1>(data()); }
322template<typename T> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::badc() const { return Mem::permute<X1, X0, X3, X2>(data()); }
323template<typename T> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::aaaa() const { return Mem::permute<X0, X0, X0, X0>(data()); }
324template<typename T> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::bbbb() const { return Mem::permute<X1, X1, X1, X1>(data()); }
325template<typename T> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::cccc() const { return Mem::permute<X2, X2, X2, X2>(data()); }
326template<typename T> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::dddd() const { return Mem::permute<X3, X3, X3, X3>(data()); }
327template<typename T> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::bcad() const { return Mem::permute<X1, X2, X0, X3>(data()); }
328template<typename T> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::bcda() const { return Mem::permute<X1, X2, X3, X0>(data()); }
329template<typename T> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::dabc() const { return Mem::permute<X3, X0, X1, X2>(data()); }
330template<typename T> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::acbd() const { return Mem::permute<X0, X2, X1, X3>(data()); }
331template<typename T> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::dbca() const { return Mem::permute<X3, X1, X2, X0>(data()); }
332template<typename T> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::dcba() const { return Mem::permute<X3, X2, X1, X0>(data()); }
f22341db 333
c017a39f 334template<> Vc_INTRINSIC const double_v Vc_PURE Vector<double>::cdab() const { return Mem::shuffle128<X1, X0>(data(), data()); }
335template<> Vc_INTRINSIC const double_v Vc_PURE Vector<double>::badc() const { return Mem::permute<X1, X0, X3, X2>(data()); }
336template<> Vc_INTRINSIC const double_v Vc_PURE Vector<double>::aaaa() const { const double &tmp = d.m(0); return _mm256_broadcast_sd(&tmp); }
337template<> Vc_INTRINSIC const double_v Vc_PURE Vector<double>::bbbb() const { const double &tmp = d.m(1); return _mm256_broadcast_sd(&tmp); }
338template<> Vc_INTRINSIC const double_v Vc_PURE Vector<double>::cccc() const { const double &tmp = d.m(2); return _mm256_broadcast_sd(&tmp); }
339template<> Vc_INTRINSIC const double_v Vc_PURE Vector<double>::dddd() const { const double &tmp = d.m(3); return _mm256_broadcast_sd(&tmp); }
340template<> Vc_INTRINSIC const double_v Vc_PURE Vector<double>::bcad() const { return Mem::shuffle<X1, Y0, X2, Y3>(Mem::shuffle128<X0, X0>(data(), data()), Mem::shuffle128<X1, X1>(data(), data())); }
341template<> Vc_INTRINSIC const double_v Vc_PURE Vector<double>::bcda() const { return Mem::shuffle<X1, Y0, X3, Y2>(data(), Mem::shuffle128<X1, X0>(data(), data())); }
342template<> Vc_INTRINSIC const double_v Vc_PURE Vector<double>::dabc() const { return Mem::shuffle<X1, Y0, X3, Y2>(Mem::shuffle128<X1, X0>(data(), data()), data()); }
343template<> Vc_INTRINSIC const double_v Vc_PURE Vector<double>::acbd() const { return Mem::shuffle<X0, Y0, X3, Y3>(Mem::shuffle128<X0, X0>(data(), data()), Mem::shuffle128<X1, X1>(data(), data())); }
344template<> Vc_INTRINSIC const double_v Vc_PURE Vector<double>::dbca() const { return Mem::shuffle<X1, Y1, X2, Y2>(Mem::shuffle128<X1, X1>(data(), data()), Mem::shuffle128<X0, X0>(data(), data())); }
345template<> Vc_INTRINSIC const double_v Vc_PURE Vector<double>::dcba() const { return cdab().badc(); }
f22341db 346
347#define VC_SWIZZLES_16BIT_IMPL(T) \
c017a39f 348template<> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::cdab() const { return Mem::permute<X2, X3, X0, X1, X6, X7, X4, X5>(data()); } \
349template<> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::badc() const { return Mem::permute<X1, X0, X3, X2, X5, X4, X7, X6>(data()); } \
350template<> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::aaaa() const { return Mem::permute<X0, X0, X0, X0, X4, X4, X4, X4>(data()); } \
351template<> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::bbbb() const { return Mem::permute<X1, X1, X1, X1, X5, X5, X5, X5>(data()); } \
352template<> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::cccc() const { return Mem::permute<X2, X2, X2, X2, X6, X6, X6, X6>(data()); } \
353template<> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::dddd() const { return Mem::permute<X3, X3, X3, X3, X7, X7, X7, X7>(data()); } \
354template<> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::bcad() const { return Mem::permute<X1, X2, X0, X3, X5, X6, X4, X7>(data()); } \
355template<> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::bcda() const { return Mem::permute<X1, X2, X3, X0, X5, X6, X7, X4>(data()); } \
356template<> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::dabc() const { return Mem::permute<X3, X0, X1, X2, X7, X4, X5, X6>(data()); } \
357template<> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::acbd() const { return Mem::permute<X0, X2, X1, X3, X4, X6, X5, X7>(data()); } \
358template<> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::dbca() const { return Mem::permute<X3, X1, X2, X0, X7, X5, X6, X4>(data()); } \
359template<> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::dcba() const { return Mem::permute<X3, X2, X1, X0, X7, X6, X5, X4>(data()); }
f22341db 360VC_SWIZZLES_16BIT_IMPL(short)
361VC_SWIZZLES_16BIT_IMPL(unsigned short)
362#undef VC_SWIZZLES_16BIT_IMPL
363
364///////////////////////////////////////////////////////////////////////////////////////////
365// division {{{1
366template<typename T> inline Vector<T> &Vector<T>::operator/=(EntryType x)
367{
368 if (HasVectorDivision) {
369 return operator/=(Vector<T>(x));
370 }
371 for_all_vector_entries(i,
372 d.m(i) /= x;
373 );
374 return *this;
375}
c017a39f 376template<typename T> template<typename TT> inline Vc_PURE VC_EXACT_TYPE(TT, typename DetermineEntryType<T>::Type, Vector<T>) Vector<T>::operator/(TT x) const
f22341db 377{
378 if (HasVectorDivision) {
379 return operator/(Vector<T>(x));
380 }
381 Vector<T> r;
382 for_all_vector_entries(i,
383 r.d.m(i) = d.m(i) / x;
384 );
385 return r;
386}
387// per default fall back to scalar division
388template<typename T> inline Vector<T> &Vector<T>::operator/=(const Vector<T> &x)
389{
390 for_all_vector_entries(i,
391 d.m(i) /= x.d.m(i);
392 );
393 return *this;
394}
395
c017a39f 396template<typename T> inline Vector<T> Vc_PURE Vector<T>::operator/(const Vector<T> &x) const
f22341db 397{
398 Vector<T> r;
399 for_all_vector_entries(i,
400 r.d.m(i) = d.m(i) / x.d.m(i);
401 );
402 return r;
403}
404// specialize division on type
c017a39f 405static Vc_INTRINSIC m256i Vc_CONST divInt(param256i a, param256i b) {
406 const m256d lo1 = _mm256_cvtepi32_pd(lo128(a));
407 const m256d lo2 = _mm256_cvtepi32_pd(lo128(b));
408 const m256d hi1 = _mm256_cvtepi32_pd(hi128(a));
409 const m256d hi2 = _mm256_cvtepi32_pd(hi128(b));
f22341db 410 return concat(
411 _mm256_cvttpd_epi32(_mm256_div_pd(lo1, lo2)),
412 _mm256_cvttpd_epi32(_mm256_div_pd(hi1, hi2))
413 );
414}
415template<> inline Vector<int> &Vector<int>::operator/=(const Vector<int> &x)
416{
417 d.v() = divInt(d.v(), x.d.v());
418 return *this;
419}
c017a39f 420template<> inline Vector<int> Vc_PURE Vector<int>::operator/(const Vector<int> &x) const
f22341db 421{
422 return divInt(d.v(), x.d.v());
423}
c017a39f 424static inline m256i Vc_CONST divUInt(param256i a, param256i b) {
425 m256d loa = _mm256_cvtepi32_pd(lo128(a));
426 m256d hia = _mm256_cvtepi32_pd(hi128(a));
427 m256d lob = _mm256_cvtepi32_pd(lo128(b));
428 m256d hib = _mm256_cvtepi32_pd(hi128(b));
f22341db 429 // if a >= 2^31 then after conversion to double it will contain a negative number (i.e. a-2^32)
430 // to get the right number back we have to add 2^32 where a >= 2^31
431 loa = _mm256_add_pd(loa, _mm256_and_pd(_mm256_cmp_pd(loa, _mm256_setzero_pd(), _CMP_LT_OS), _mm256_set1_pd(4294967296.)));
432 hia = _mm256_add_pd(hia, _mm256_and_pd(_mm256_cmp_pd(hia, _mm256_setzero_pd(), _CMP_LT_OS), _mm256_set1_pd(4294967296.)));
433 // we don't do the same for b because division by b >= 2^31 should be a seldom corner case and
434 // we rather want the standard stuff fast
435 //
436 // there is one remaining problem: a >= 2^31 and b == 1
437 // in that case the return value would be 2^31
c017a39f 438 return avx_cast<m256i>(_mm256_blendv_ps(avx_cast<m256>(concat(
f22341db 439 _mm256_cvttpd_epi32(_mm256_div_pd(loa, lob)),
440 _mm256_cvttpd_epi32(_mm256_div_pd(hia, hib))
c017a39f 441 )), avx_cast<m256>(a), avx_cast<m256>(concat(
f22341db 442 _mm_cmpeq_epi32(lo128(b), _mm_setone_epi32()),
443 _mm_cmpeq_epi32(hi128(b), _mm_setone_epi32())))));
444}
c017a39f 445template<> Vc_ALWAYS_INLINE Vector<unsigned int> &Vector<unsigned int>::operator/=(const Vector<unsigned int> &x)
f22341db 446{
447 d.v() = divUInt(d.v(), x.d.v());
448 return *this;
449}
c017a39f 450template<> Vc_ALWAYS_INLINE Vector<unsigned int> Vc_PURE Vector<unsigned int>::operator/(const Vector<unsigned int> &x) const
f22341db 451{
452 return divUInt(d.v(), x.d.v());
453}
c017a39f 454template<typename T> static inline m128i Vc_CONST divShort(param128i a, param128i b)
f22341db 455{
c017a39f 456 const m256 r = _mm256_div_ps(StaticCastHelper<T, float>::cast(a),
f22341db 457 StaticCastHelper<T, float>::cast(b));
458 return StaticCastHelper<float, T>::cast(r);
459}
c017a39f 460template<> Vc_ALWAYS_INLINE Vector<short> &Vector<short>::operator/=(const Vector<short> &x)
f22341db 461{
462 d.v() = divShort<short>(d.v(), x.d.v());
463 return *this;
464}
c017a39f 465template<> Vc_ALWAYS_INLINE Vector<short> Vc_PURE Vector<short>::operator/(const Vector<short> &x) const
f22341db 466{
467 return divShort<short>(d.v(), x.d.v());
468}
c017a39f 469template<> Vc_ALWAYS_INLINE Vector<unsigned short> &Vector<unsigned short>::operator/=(const Vector<unsigned short> &x)
f22341db 470{
471 d.v() = divShort<unsigned short>(d.v(), x.d.v());
472 return *this;
473}
c017a39f 474template<> Vc_ALWAYS_INLINE Vector<unsigned short> Vc_PURE Vector<unsigned short>::operator/(const Vector<unsigned short> &x) const
f22341db 475{
476 return divShort<unsigned short>(d.v(), x.d.v());
477}
c017a39f 478template<> Vc_INTRINSIC float_v &float_v::operator/=(const float_v &x)
f22341db 479{
480 d.v() = _mm256_div_ps(d.v(), x.d.v());
481 return *this;
482}
c017a39f 483template<> Vc_INTRINSIC float_v Vc_PURE float_v::operator/(const float_v &x) const
f22341db 484{
485 return _mm256_div_ps(d.v(), x.d.v());
486}
c017a39f 487template<> Vc_INTRINSIC sfloat_v &sfloat_v::operator/=(const sfloat_v &x)
488{
489 d.v() = _mm256_div_ps(d.v(), x.d.v());
490 return *this;
491}
492template<> Vc_INTRINSIC sfloat_v Vc_PURE sfloat_v::operator/(const sfloat_v &x) const
493{
494 return _mm256_div_ps(d.v(), x.d.v());
495}
496template<> Vc_INTRINSIC double_v &double_v::operator/=(const double_v &x)
f22341db 497{
498 d.v() = _mm256_div_pd(d.v(), x.d.v());
499 return *this;
500}
c017a39f 501template<> Vc_INTRINSIC double_v Vc_PURE double_v::operator/(const double_v &x) const
f22341db 502{
503 return _mm256_div_pd(d.v(), x.d.v());
504}
505
506///////////////////////////////////////////////////////////////////////////////////////////
507// integer ops {{{1
508#define OP_IMPL(T, symbol) \
c017a39f 509template<> Vc_ALWAYS_INLINE Vector<T> &Vector<T>::operator symbol##=(AsArg x) \
f22341db 510{ \
511 for_all_vector_entries(i, d.m(i) symbol##= x.d.m(i); ); \
512 return *this; \
513} \
c017a39f 514template<> Vc_ALWAYS_INLINE Vc_PURE Vector<T> Vector<T>::operator symbol(AsArg x) const \
f22341db 515{ \
516 Vector<T> r; \
517 for_all_vector_entries(i, r.d.m(i) = d.m(i) symbol x.d.m(i); ); \
518 return r; \
519}
520OP_IMPL(int, <<)
521OP_IMPL(int, >>)
522OP_IMPL(unsigned int, <<)
523OP_IMPL(unsigned int, >>)
524OP_IMPL(short, <<)
525OP_IMPL(short, >>)
526OP_IMPL(unsigned short, <<)
527OP_IMPL(unsigned short, >>)
528#undef OP_IMPL
529
c017a39f 530template<typename T> Vc_ALWAYS_INLINE Vector<T> &Vector<T>::operator>>=(int shift) {
f22341db 531 d.v() = VectorHelper<T>::shiftRight(d.v(), shift);
532 return *static_cast<Vector<T> *>(this);
533}
c017a39f 534template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T> Vector<T>::operator>>(int shift) const {
f22341db 535 return VectorHelper<T>::shiftRight(d.v(), shift);
536}
c017a39f 537template<typename T> Vc_ALWAYS_INLINE Vector<T> &Vector<T>::operator<<=(int shift) {
f22341db 538 d.v() = VectorHelper<T>::shiftLeft(d.v(), shift);
539 return *static_cast<Vector<T> *>(this);
540}
c017a39f 541template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T> Vector<T>::operator<<(int shift) const {
f22341db 542 return VectorHelper<T>::shiftLeft(d.v(), shift);
543}
544
545#define OP_IMPL(T, symbol, fun) \
c017a39f 546 template<> Vc_ALWAYS_INLINE Vector<T> &Vector<T>::operator symbol##=(AsArg x) { d.v() = HV::fun(d.v(), x.d.v()); return *this; } \
547 template<> Vc_ALWAYS_INLINE Vc_PURE Vector<T> Vector<T>::operator symbol(AsArg x) const { return Vector<T>(HV::fun(d.v(), x.d.v())); }
f22341db 548 OP_IMPL(int, &, and_)
549 OP_IMPL(int, |, or_)
550 OP_IMPL(int, ^, xor_)
551 OP_IMPL(unsigned int, &, and_)
552 OP_IMPL(unsigned int, |, or_)
553 OP_IMPL(unsigned int, ^, xor_)
554 OP_IMPL(short, &, and_)
555 OP_IMPL(short, |, or_)
556 OP_IMPL(short, ^, xor_)
557 OP_IMPL(unsigned short, &, and_)
558 OP_IMPL(unsigned short, |, or_)
559 OP_IMPL(unsigned short, ^, xor_)
560 OP_IMPL(float, &, and_)
561 OP_IMPL(float, |, or_)
562 OP_IMPL(float, ^, xor_)
563 OP_IMPL(sfloat, &, and_)
564 OP_IMPL(sfloat, |, or_)
565 OP_IMPL(sfloat, ^, xor_)
566 OP_IMPL(double, &, and_)
567 OP_IMPL(double, |, or_)
568 OP_IMPL(double, ^, xor_)
569#undef OP_IMPL
570
571// operators {{{1
572#include "../common/operators.h"
c017a39f 573// isNegative {{{1
574template<> Vc_INTRINSIC Vc_PURE float_m float_v::isNegative() const
575{
576 return avx_cast<m256>(_mm256_srai_epi32(avx_cast<m256i>(_mm256_and_ps(_mm256_setsignmask_ps(), d.v())), 31));
577}
578template<> Vc_INTRINSIC Vc_PURE sfloat_m sfloat_v::isNegative() const
579{
580 return avx_cast<m256>(_mm256_srai_epi32(avx_cast<m256i>(_mm256_and_ps(_mm256_setsignmask_ps(), d.v())), 31));
581}
582template<> Vc_INTRINSIC Vc_PURE double_m double_v::isNegative() const
583{
584 return Mem::permute<X1, X1, X3, X3>(avx_cast<m256>(
585 _mm256_srai_epi32(avx_cast<m256i>(_mm256_and_pd(_mm256_setsignmask_pd(), d.v())), 31)
586 ));
587}
f22341db 588// gathers {{{1
589// Better implementation (hopefully) with _mm256_set_
590//X template<typename T> template<typename Index> Vector<T>::Vector(const EntryType *mem, const Index *indexes)
591//X {
592//X for_all_vector_entries(int i,
593//X d.m(i) = mem[indexes[i]];
594//X );
595//X }
c017a39f 596template<typename T> template<typename IndexT> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *mem, const IndexT *indexes)
f22341db 597{
598 gather(mem, indexes);
599}
c017a39f 600template<typename T> template<typename IndexT> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector<IndexT>) indexes)
f22341db 601{
602 gather(mem, indexes);
603}
604
c017a39f 605template<typename T> template<typename IndexT> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *mem, const IndexT *indexes, MaskArg mask)
f22341db 606 : d(HT::zero())
607{
608 gather(mem, indexes, mask);
609}
610
c017a39f 611template<typename T> template<typename IndexT> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector<IndexT>) indexes, MaskArg mask)
f22341db 612 : d(HT::zero())
613{
614 gather(mem, indexes, mask);
615}
616
c017a39f 617template<typename T> template<typename S1, typename IT> Vc_ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
f22341db 618{
619 gather(array, member1, indexes);
620}
c017a39f 621template<typename T> template<typename S1, typename IT> Vc_ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask)
f22341db 622 : d(HT::zero())
623{
624 gather(array, member1, indexes, mask);
625}
c017a39f 626template<typename T> template<typename S1, typename S2, typename IT> Vc_ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
f22341db 627{
628 gather(array, member1, member2, indexes);
629}
c017a39f 630template<typename T> template<typename S1, typename S2, typename IT> Vc_ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask)
f22341db 631 : d(HT::zero())
632{
633 gather(array, member1, member2, indexes, mask);
634}
c017a39f 635template<typename T> template<typename S1, typename IT1, typename IT2> Vc_ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
f22341db 636{
637 gather(array, ptrMember1, outerIndexes, innerIndexes);
638}
c017a39f 639template<typename T> template<typename S1, typename IT1, typename IT2> Vc_ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask)
f22341db 640 : d(HT::zero())
641{
642 gather(array, ptrMember1, outerIndexes, innerIndexes, mask);
643}
644
// Compile-time guard for gather/scatter index arguments. The generic case
// accepts anything (plain arrays/pointers cannot be size-checked); the
// specialization for Vc index vectors statically asserts that the index
// vector has at least as many entries as the vector being gathered into.
template<typename T, size_t Size> struct IndexSizeChecker { static void check() {} };
template<typename T, size_t Size> struct IndexSizeChecker<Vector<T>, Size>
{
    static void check() {
        VC_STATIC_ASSERT(Vector<T>::Size >= Size, IndexVector_must_have_greater_or_equal_number_of_entries);
    }
};
// Unmasked gathers from flat memory: each specialization builds the vector
// with one setr intrinsic from mem[indexes[0..Size-1]]. double/float/sfloat
// and (unsigned) int use 256-bit registers; (unsigned) short uses a 128-bit
// register (AVX's short_v is 8 x 16-bit in an m128i).
template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<double>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
{
    IndexSizeChecker<Index, Size>::check();
    d.v() = _mm256_setr_pd(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]);
}
template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<float>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
{
    IndexSizeChecker<Index, Size>::check();
    d.v() = _mm256_setr_ps(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]],
            mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
}
template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<sfloat>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
{
    IndexSizeChecker<Index, Size>::check();
    d.v() = _mm256_setr_ps(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]],
            mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
}
template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<int>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
{
    IndexSizeChecker<Index, Size>::check();
    d.v() = _mm256_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]],
            mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
}
template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned int>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
{
    IndexSizeChecker<Index, Size>::check();
    d.v() = _mm256_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]],
            mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
}
template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<short>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
{
    IndexSizeChecker<Index, Size>::check();
    d.v() = _mm_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]],
            mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
}
template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned short>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
{
    IndexSizeChecker<Index, Size>::check();
    d.v() = _mm_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]],
            mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
}
693
#ifdef VC_USE_SET_GATHERS
// Masked gather for vector indexes: zero the indexes of inactive lanes so
// the unmasked gathering constructor reads mem[0] for them (safe as long as
// mem[0] is readable), then write-mask the result into *this.
template<typename T> template<typename IT> Vc_ALWAYS_INLINE void Vector<T>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector<IT>) indexes, MaskArg mask)
{
    IndexSizeChecker<Vector<IT>, Size>::check();
    Vector<IT> indexesTmp = indexes;
    indexesTmp.setZero(!mask);
    (*this)(mask) = Vector<T>(mem, indexesTmp);
}
#endif
703
// Masked-gather kernel, selected at configure time. Every variant expects a
// surrounding function to #define ith_value(_i_) as the source expression for
// lane _i_ and to have `mask` in scope; it assigns ith_value(i) to d.m(i) for
// each active lane.
#ifdef VC_USE_BSF_GATHERS
// Bit-scan loop: peel off one set mask bit per iteration.
#define VC_MASKED_GATHER \
    int bits = mask.toInt(); \
    while (bits) { \
        const int i = _bit_scan_forward(bits); \
        bits &= ~(1 << i); /* btr? */ \
        d.m(i) = ith_value(i); \
    } 
#elif defined(VC_USE_POPCNT_BSF_GATHERS)
// Unrolled variant: popcount selects the switch entry point; the cases then
// fall through deliberately, alternating bit-scan-reverse / bit-scan-forward
// so each pair of cases consumes the highest and lowest remaining bit.
#define VC_MASKED_GATHER \
    unsigned int bits = mask.toInt(); \
    unsigned int low, high = 0; \
    switch (_mm_popcnt_u32(bits)) { \
    case 8: \
        high = _bit_scan_reverse(bits); \
        d.m(high) = ith_value(high); \
        high = (1 << high); \
    case 7: \
        low = _bit_scan_forward(bits); \
        bits ^= high | (1 << low); \
        d.m(low) = ith_value(low); \
    case 6: \
        high = _bit_scan_reverse(bits); \
        d.m(high) = ith_value(high); \
        high = (1 << high); \
    case 5: \
        low = _bit_scan_forward(bits); \
        bits ^= high | (1 << low); \
        d.m(low) = ith_value(low); \
    case 4: \
        high = _bit_scan_reverse(bits); \
        d.m(high) = ith_value(high); \
        high = (1 << high); \
    case 3: \
        low = _bit_scan_forward(bits); \
        bits ^= high | (1 << low); \
        d.m(low) = ith_value(low); \
    case 2: \
        high = _bit_scan_reverse(bits); \
        d.m(high) = ith_value(high); \
    case 1: \
        low = _bit_scan_forward(bits); \
        d.m(low) = ith_value(low); \
    case 0: \
        break; \
    }
#else
// Portable fallback: skip fully-empty masks, otherwise test every lane.
#define VC_MASKED_GATHER \
    if (mask.isEmpty()) { \
        return; \
    } \
    for_all_vector_entries(i, \
        if (mask[i]) d.m(i) = ith_value(i); \
        );
#endif
759
// Masked gather from flat memory: delegates lane selection to the
// VC_MASKED_GATHER kernel with ith_value(i) = mem[indexes[i]].
template<typename T> template<typename Index>
Vc_INTRINSIC void Vector<T>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes, MaskArg mask)
{
    IndexSizeChecker<Index, Size>::check();
#define ith_value(_i_) (mem[indexes[_i_]])
    VC_MASKED_GATHER
#undef ith_value
}
768
// Unmasked struct-member gathers: lane i = array[indexes[i]].*member1,
// assembled with one setr intrinsic per specialization.
template<> template<typename S1, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<double>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm256_setr_pd(array[indexes[0]].*(member1), array[indexes[1]].*(member1),
            array[indexes[2]].*(member1), array[indexes[3]].*(member1));
}
template<> template<typename S1, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<float>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm256_setr_ps(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
            array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1),
            array[indexes[6]].*(member1), array[indexes[7]].*(member1));
}
template<> template<typename S1, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<sfloat>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm256_setr_ps(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
            array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1),
            array[indexes[6]].*(member1), array[indexes[7]].*(member1));
}
template<> template<typename S1, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<int>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm256_setr_epi32(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
            array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1),
            array[indexes[6]].*(member1), array[indexes[7]].*(member1));
}
template<> template<typename S1, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned int>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm256_setr_epi32(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
            array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1),
            array[indexes[6]].*(member1), array[indexes[7]].*(member1));
}
template<> template<typename S1, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<short>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm_setr_epi16(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
            array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1),
            array[indexes[6]].*(member1), array[indexes[7]].*(member1));
}
template<> template<typename S1, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned short>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm_setr_epi16(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
            array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1),
            array[indexes[6]].*(member1), array[indexes[7]].*(member1));
}
// Masked struct-member gather via the VC_MASKED_GATHER kernel.
template<typename T> template<typename S1, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask)
{
    IndexSizeChecker<IT, Size>::check();
#define ith_value(_i_) (array[indexes[_i_]].*(member1))
    VC_MASKED_GATHER
#undef ith_value
}
// Unmasked nested-member gathers: lane i = array[indexes[i]].*member1.*member2
// (one level of struct-in-struct indirection).
template<> template<typename S1, typename S2, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<double>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm256_setr_pd(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2),
            array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2));
}
template<> template<typename S1, typename S2, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<float>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm256_setr_ps(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
            array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
            array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
}
template<> template<typename S1, typename S2, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<sfloat>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm256_setr_ps(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
            array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
            array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
}
template<> template<typename S1, typename S2, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<int>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm256_setr_epi32(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
            array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
            array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
}
template<> template<typename S1, typename S2, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned int>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm256_setr_epi32(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
            array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
            array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
}
template<> template<typename S1, typename S2, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<short>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm_setr_epi16(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
            array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
            array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
}
template<> template<typename S1, typename S2, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned short>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm_setr_epi16(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
            array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
            array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
}
// Masked nested-member gather via the VC_MASKED_GATHER kernel.
template<typename T> template<typename S1, typename S2, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask)
{
    IndexSizeChecker<IT, Size>::check();
#define ith_value(_i_) (array[indexes[_i_]].*(member1).*(member2))
    VC_MASKED_GATHER
#undef ith_value
}
// Unmasked pointer-member gathers:
// lane i = (array[outerIndexes[i]].*ptrMember1)[innerIndexes[i]], i.e.
// outerIndexes selects the struct, innerIndexes selects the element in the
// array that struct points to.
template<> template<typename S1, typename IT1, typename IT2>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<double>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
{
    IndexSizeChecker<IT1, Size>::check();
    IndexSizeChecker<IT2, Size>::check();
    d.v() = _mm256_setr_pd((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
            (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]);
}
template<> template<typename S1, typename IT1, typename IT2>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<float>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
{
    IndexSizeChecker<IT1, Size>::check();
    IndexSizeChecker<IT2, Size>::check();
    d.v() = _mm256_setr_ps((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
            (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]],
            (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
            (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
}
template<> template<typename S1, typename IT1, typename IT2>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<sfloat>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
{
    IndexSizeChecker<IT1, Size>::check();
    IndexSizeChecker<IT2, Size>::check();
    d.v() = _mm256_setr_ps((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
            (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]],
            (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
            (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
}
template<> template<typename S1, typename IT1, typename IT2>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<int>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
{
    IndexSizeChecker<IT1, Size>::check();
    IndexSizeChecker<IT2, Size>::check();
    d.v() = _mm256_setr_epi32((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
            (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]],
            (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
            (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
}
template<> template<typename S1, typename IT1, typename IT2>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned int>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
{
    IndexSizeChecker<IT1, Size>::check();
    IndexSizeChecker<IT2, Size>::check();
    d.v() = _mm256_setr_epi32((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
            (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]],
            (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
            (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
}
template<> template<typename S1, typename IT1, typename IT2>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<short>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
{
    IndexSizeChecker<IT1, Size>::check();
    IndexSizeChecker<IT2, Size>::check();
    d.v() = _mm_setr_epi16((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
            (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]],
            (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
            (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
}
template<> template<typename S1, typename IT1, typename IT2>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned short>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
{
    IndexSizeChecker<IT1, Size>::check();
    IndexSizeChecker<IT2, Size>::check();
    d.v() = _mm_setr_epi16((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
            (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]],
            (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
            (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
}
// Masked pointer-member gather via the VC_MASKED_GATHER kernel;
// outerIndexes selects the struct, innerIndexes the pointed-to element.
template<typename T> template<typename S1, typename IT1, typename IT2>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask)
{
    IndexSizeChecker<IT1, Size>::check();
    IndexSizeChecker<IT2, Size>::check();
#define ith_value(_i_) (array[outerIndexes[_i_]].*(ptrMember1))[innerIndexes[_i_]]
    VC_MASKED_GATHER
#undef ith_value
}
972
#undef VC_MASKED_GATHER
// Masked-scatter kernel, mirror image of VC_MASKED_GATHER: expects a local
// ith_value(_i_) macro naming the destination lvalue for lane _i_ and a
// `mask` in scope; stores d.m(i) to ith_value(i) for every active lane.
#ifdef VC_USE_BSF_SCATTERS
// Bit-scan loop: peel off one set mask bit per iteration.
#define VC_MASKED_SCATTER \
    int bits = mask.toInt(); \
    while (bits) { \
        const int i = _bit_scan_forward(bits); \
        bits ^= (1 << i); /* btr? */ \
        ith_value(i) = d.m(i); \
    } 
#elif defined(VC_USE_POPCNT_BSF_SCATTERS)
// Unrolled variant: popcount selects the switch entry point; cases fall
// through intentionally, alternating bsr/bsf to consume two bits per pair.
#define VC_MASKED_SCATTER \
    unsigned int bits = mask.toInt(); \
    unsigned int low, high = 0; \
    switch (_mm_popcnt_u32(bits)) { \
    case 8: \
        high = _bit_scan_reverse(bits); \
        ith_value(high) = d.m(high); \
        high = (1 << high); \
    case 7: \
        low = _bit_scan_forward(bits); \
        bits ^= high | (1 << low); \
        ith_value(low) = d.m(low); \
    case 6: \
        high = _bit_scan_reverse(bits); \
        ith_value(high) = d.m(high); \
        high = (1 << high); \
    case 5: \
        low = _bit_scan_forward(bits); \
        bits ^= high | (1 << low); \
        ith_value(low) = d.m(low); \
    case 4: \
        high = _bit_scan_reverse(bits); \
        ith_value(high) = d.m(high); \
        high = (1 << high); \
    case 3: \
        low = _bit_scan_forward(bits); \
        bits ^= high | (1 << low); \
        ith_value(low) = d.m(low); \
    case 2: \
        high = _bit_scan_reverse(bits); \
        ith_value(high) = d.m(high); \
    case 1: \
        low = _bit_scan_forward(bits); \
        ith_value(low) = d.m(low); \
    case 0: \
        break; \
    }
#else
// Portable fallback: skip fully-empty masks, otherwise test every lane.
#define VC_MASKED_SCATTER \
    if (mask.isEmpty()) { \
        return; \
    } \
    for_all_vector_entries(i, \
        if (mask[i]) ith_value(i) = d.m(i); \
        );
#endif
1029
// Unmasked scatter to flat memory: mem[indexes[i]] = lane i, for all lanes.
template<typename T> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) const
{
    for_all_vector_entries(i,
            mem[indexes[i]] = d.m(i);
            );
}
#if defined(VC_MSVC) && VC_MSVC >= 170000000
// MSVC miscompiles the store mem[indexes[1]] = d.m(1) for T = (u)short
// Workaround: read lanes 0 and 1 through the raw 32-bit register view and
// split them manually; the remaining lanes extract normally.
template<> template<typename Index> Vc_ALWAYS_INLINE void short_v::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) const
{
    const unsigned int tmp = d.v()._d.m128i_u32[0];
    mem[indexes[0]] = tmp & 0xffff;
    mem[indexes[1]] = tmp >> 16;
    mem[indexes[2]] = _mm_extract_epi16(d.v(), 2);
    mem[indexes[3]] = _mm_extract_epi16(d.v(), 3);
    mem[indexes[4]] = _mm_extract_epi16(d.v(), 4);
    mem[indexes[5]] = _mm_extract_epi16(d.v(), 5);
    mem[indexes[6]] = _mm_extract_epi16(d.v(), 6);
    mem[indexes[7]] = _mm_extract_epi16(d.v(), 7);
}
// Same workaround for the unsigned variant.
template<> template<typename Index> Vc_ALWAYS_INLINE void ushort_v::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) const
{
    const unsigned int tmp = d.v()._d.m128i_u32[0];
    mem[indexes[0]] = tmp & 0xffff;
    mem[indexes[1]] = tmp >> 16;
    mem[indexes[2]] = _mm_extract_epi16(d.v(), 2);
    mem[indexes[3]] = _mm_extract_epi16(d.v(), 3);
    mem[indexes[4]] = _mm_extract_epi16(d.v(), 4);
    mem[indexes[5]] = _mm_extract_epi16(d.v(), 5);
    mem[indexes[6]] = _mm_extract_epi16(d.v(), 6);
    mem[indexes[7]] = _mm_extract_epi16(d.v(), 7);
}
#endif
// Masked scatter to flat memory via the VC_MASKED_SCATTER kernel.
template<typename T> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes, MaskArg mask) const
{
#define ith_value(_i_) mem[indexes[_i_]]
    VC_MASKED_SCATTER
#undef ith_value
}
// Unmasked struct-member scatter: array[indexes[i]].*member1 = lane i.
template<typename T> template<typename S1, typename IT> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(S1 *array, EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) const
{
    for_all_vector_entries(i,
            array[indexes[i]].*(member1) = d.m(i);
            );
}
// Masked struct-member scatter via the VC_MASKED_SCATTER kernel.
template<typename T> template<typename S1, typename IT> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(S1 *array, EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) const
{
#define ith_value(_i_) array[indexes[_i_]].*(member1)
    VC_MASKED_SCATTER
#undef ith_value
}
// Unmasked nested-member scatter: array[indexes[i]].*member1.*member2 = lane i.
template<typename T> template<typename S1, typename S2, typename IT> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) const
{
    for_all_vector_entries(i,
            array[indexes[i]].*(member1).*(member2) = d.m(i);
            );
}
// Masked nested-member scatter via the VC_MASKED_SCATTER kernel.
template<typename T> template<typename S1, typename S2, typename IT> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) const
{
#define ith_value(_i_) array[indexes[_i_]].*(member1).*(member2)
    VC_MASKED_SCATTER
#undef ith_value
}
c017a39f 1093template<typename T> template<typename S1, typename IT1, typename IT2> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(S1 *array, EntryType *S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) const
f22341db 1094{
1095 for_all_vector_entries(i,
1096 (array[innerIndexes[i]].*(ptrMember1))[outerIndexes[i]] = d.m(i);
1097 );
1098}
c017a39f 1099template<typename T> template<typename S1, typename IT1, typename IT2> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(S1 *array, EntryType *S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask) const
f22341db 1100{
1101#define ith_value(_i_) (array[outerIndexes[_i_]].*(ptrMember1))[innerIndexes[_i_]]
1102 VC_MASKED_SCATTER
1103#undef ith_value
1104}
1105
1106///////////////////////////////////////////////////////////////////////////////////////////
1107// operator- {{{1
// Unary negation. Floating-point variants flip the sign bit with an XOR
// against a sign-bit mask; integer variants use a sign/negate intrinsic with
// an all-ones second operand (two's-complement negation). Note that, as the
// signatures show, negating an unsigned vector yields the corresponding
// signed vector type.
template<> Vc_ALWAYS_INLINE Vector<double> Vc_PURE Vc_FLATTEN Vector<double>::operator-() const
{
    return _mm256_xor_pd(d.v(), _mm256_setsignmask_pd());
}
template<> Vc_ALWAYS_INLINE Vector<float> Vc_PURE Vc_FLATTEN Vector<float>::operator-() const
{
    return _mm256_xor_ps(d.v(), _mm256_setsignmask_ps());
}
template<> Vc_ALWAYS_INLINE Vector<sfloat> Vc_PURE Vc_FLATTEN Vector<sfloat>::operator-() const
{
    return _mm256_xor_ps(d.v(), _mm256_setsignmask_ps());
}
template<> Vc_ALWAYS_INLINE Vector<int> Vc_PURE Vc_FLATTEN Vector<int>::operator-() const
{
    return _mm256_sign_epi32(d.v(), _mm256_setallone_si256());
}
template<> Vc_ALWAYS_INLINE Vector<int> Vc_PURE Vc_FLATTEN Vector<unsigned int>::operator-() const
{
    return _mm256_sign_epi32(d.v(), _mm256_setallone_si256());
}
template<> Vc_ALWAYS_INLINE Vector<short> Vc_PURE Vc_FLATTEN Vector<short>::operator-() const
{
    return _mm_sign_epi16(d.v(), _mm_setallone_si128());
}
template<> Vc_ALWAYS_INLINE Vector<short> Vc_PURE Vc_FLATTEN Vector<unsigned short>::operator-() const
{
    return _mm_sign_epi16(d.v(), _mm_setallone_si128());
}
1136
1137///////////////////////////////////////////////////////////////////////////////////////////
1138// horizontal ops {{{1
// Masked horizontal reductions: fill a temporary with the operation's
// identity element (max() for min, min() for max, 1 for product, 0 for sum),
// overwrite the active lanes with *this, then run the unmasked reduction so
// inactive lanes cannot affect the result.
template<typename T> Vc_ALWAYS_INLINE typename Vector<T>::EntryType Vector<T>::min(MaskArg m) const
{
    Vector<T> tmp = std::numeric_limits<Vector<T> >::max();
    tmp(m) = *this;
    return tmp.min();
}
template<typename T> Vc_ALWAYS_INLINE typename Vector<T>::EntryType Vector<T>::max(MaskArg m) const
{
    Vector<T> tmp = std::numeric_limits<Vector<T> >::min();
    tmp(m) = *this;
    return tmp.max();
}
template<typename T> Vc_ALWAYS_INLINE typename Vector<T>::EntryType Vector<T>::product(MaskArg m) const
{
    Vector<T> tmp(VectorSpecialInitializerOne::One);
    tmp(m) = *this;
    return tmp.product();
}
template<typename T> Vc_ALWAYS_INLINE typename Vector<T>::EntryType Vector<T>::sum(MaskArg m) const
{
    Vector<T> tmp(VectorSpecialInitializerZero::Zero);
    tmp(m) = *this;
    return tmp.sum();
}//}}}
1163// copySign {{{1
// copySign: combine the magnitude bits of *this (AND with the abs mask) with
// the sign bits of `reference` (AND with the sign mask) via OR — the SIMD
// equivalent of std::copysign per lane.
template<> Vc_INTRINSIC Vector<float> Vector<float>::copySign(Vector<float>::AsArg reference) const
{
    return _mm256_or_ps(
            _mm256_and_ps(reference.d.v(), _mm256_setsignmask_ps()),
            _mm256_and_ps(d.v(), _mm256_setabsmask_ps())
            );
}
template<> Vc_INTRINSIC Vector<sfloat> Vector<sfloat>::copySign(Vector<sfloat>::AsArg reference) const
{
    return _mm256_or_ps(
            _mm256_and_ps(reference.d.v(), _mm256_setsignmask_ps()),
            _mm256_and_ps(d.v(), _mm256_setabsmask_ps())
            );
}
template<> Vc_INTRINSIC Vector<double> Vector<double>::copySign(Vector<double>::AsArg reference) const
{
    return _mm256_or_pd(
            _mm256_and_pd(reference.d.v(), _mm256_setsignmask_pd()),
            _mm256_and_pd(d.v(), _mm256_setabsmask_pd())
            );
}//}}}1
1185// exponent {{{1
// exponent: extract the floating-point exponent of each lane via the internal
// helper. The debug assertion documents the precondition that all lanes must
// be non-negative (behavior for negative/NaN input is delegated to
// Internal::exponent and not specified here).
template<> Vc_INTRINSIC Vector<float> Vector<float>::exponent() const
{
    VC_ASSERT((*this >= 0.f).isFull());
    return Internal::exponent(d.v());
}
template<> Vc_INTRINSIC Vector<sfloat> Vector<sfloat>::exponent() const
{
    VC_ASSERT((*this >= 0.f).isFull());
    return Internal::exponent(d.v());
}
template<> Vc_INTRINSIC Vector<double> Vector<double>::exponent() const
{
    VC_ASSERT((*this >= 0.).isFull());
    return Internal::exponent(d.v());
}
1201// }}}1
1202// Random {{{1
// Advance the shared RNG state: both halves of Vc::RandomState are stepped
// with a linear-congruential update (x * 0xdeece66d + 11, the multiplier
// truncated to 32 bits); the first half is additionally XORed with the upper
// 16 bits of the second half's previous state to decorrelate the streams.
// Note: reads and writes global Vc::RandomState — not thread-safe.
static Vc_ALWAYS_INLINE void _doRandomStep(Vector<unsigned int> &state0,
        Vector<unsigned int> &state1)
{
    state0.load(&Vc::RandomState[0]);
    state1.load(&Vc::RandomState[uint_v::Size]);
    (state1 * 0xdeece66du + 11).store(&Vc::RandomState[uint_v::Size]);
    uint_v(_mm256_xor_si256((state0 * 0xdeece66du + 11).data(), _mm256_srli_epi32(state1.data(), 16))).store(&Vc::RandomState[0]);
}
1211
// Generic Random(): step the global RNG state and reinterpret the raw 32-bit
// state bits as the requested vector type (used for the integer vectors; the
// floating-point types have dedicated specializations below).
template<typename T> Vc_ALWAYS_INLINE Vector<T> Vector<T>::Random()
{
    Vector<unsigned int> state0, state1;
    _doRandomStep(state0, state1);
    return state0.reinterpretCast<Vector<T> >();
}
1218
// float/sfloat Random(): take the top random bits (srli by 2 keeps the
// mantissa-width worth of entropy), OR in the exponent pattern of 1.0f to get
// a value in [1, 2), then subtract 1 to map onto [0, 1).
template<> Vc_ALWAYS_INLINE Vector<float> Vector<float>::Random()
{
    Vector<unsigned int> state0, state1;
    _doRandomStep(state0, state1);
    return HT::sub(HV::or_(_cast(_mm256_srli_epi32(state0.data(), 2)), HT::one()), HT::one());
}

template<> Vc_ALWAYS_INLINE Vector<sfloat> Vector<sfloat>::Random()
{
    Vector<unsigned int> state0, state1;
    _doRandomStep(state0, state1);
    return HT::sub(HV::or_(_cast(_mm256_srli_epi32(state0.data(), 2)), HT::one()), HT::one());
}
1232
// double Random(): steps the global state as four 64-bit LCGs (full 48-bit
// multiplier 0x5deece66d here, unlike the 32-bit-truncated one in
// _doRandomStep), then maps the previous state's top bits into [0, 1) with
// the same "[1,2) minus 1" trick as the float variant (srli by 12 leaves the
// 52 mantissa bits, OR with 1.0's exponent, subtract 1).
template<> Vc_ALWAYS_INLINE Vector<double> Vector<double>::Random()
{
    const m256i state = VectorHelper<m256i>::load(&Vc::RandomState[0], Vc::Aligned);
    for (size_t k = 0; k < 8; k += 2) {
        // Reinterpret two adjacent 32-bit state words as one 64-bit LCG state;
        // Vc_MAY_ALIAS suppresses strict-aliasing issues on this access.
        typedef unsigned long long uint64 Vc_MAY_ALIAS;
        const uint64 stateX = *reinterpret_cast<const uint64 *>(&Vc::RandomState[k]);
        *reinterpret_cast<uint64 *>(&Vc::RandomState[k]) = (stateX * 0x5deece66dull + 11);
    }
    return (Vector<double>(_cast(_mm256_srli_epi64(state, 12))) | One()) - One();
}
1243// }}}1
c017a39f 1244// shifted / rotated {{{1
// shifted(): move entries |amount| positions towards index 0 (positive
// amount) or towards the top (negative amount), zero-filling vacated slots.
// Dispatch key: (register width in bytes, entry count, register type, entry type).
template<size_t SIMDWidth, size_t Size, typename VectorType, typename EntryType> struct VectorShift;
// 256-bit register with 4 entries (double_v).
template<> struct VectorShift<32, 4, m256d, double>
{
    static Vc_INTRINSIC m256d shifted(param256d v, int amount)
    {
        // The byte-shift intrinsics need an immediate count, hence the switch.
        // NOTE(review): this assumes _mm256_s{r,l}li_si256 shift across the
        // full 256 bits (Vc's AVX helpers); the same-named AVX2 instructions
        // shift each 128-bit lane separately -- confirm against avx/intrinsics.h.
        switch (amount) {
        case 0: return v;
        case 1: return avx_cast<m256d>(_mm256_srli_si256(avx_cast<m256i>(v), 1 * sizeof(double)));
        case 2: return avx_cast<m256d>(_mm256_srli_si256(avx_cast<m256i>(v), 2 * sizeof(double)));
        case 3: return avx_cast<m256d>(_mm256_srli_si256(avx_cast<m256i>(v), 3 * sizeof(double)));
        case -1: return avx_cast<m256d>(_mm256_slli_si256(avx_cast<m256i>(v), 1 * sizeof(double)));
        case -2: return avx_cast<m256d>(_mm256_slli_si256(avx_cast<m256i>(v), 2 * sizeof(double)));
        case -3: return avx_cast<m256d>(_mm256_slli_si256(avx_cast<m256i>(v), 3 * sizeof(double)));
        }
        return _mm256_setzero_pd(); // |amount| >= Size: everything shifted out
    }
};
// 256-bit register with 8 entries (32-bit entry types).
template<typename VectorType, typename EntryType> struct VectorShift<32, 8, VectorType, EntryType>
{
    typedef typename SseVectorType<VectorType>::Type SmallV;
    // Zero-filling entry shift; positive amount moves entries towards index 0.
    // Immediate-only shift counts force the switch over all 15 valid amounts.
    static Vc_INTRINSIC VectorType shifted(VC_ALIGNED_PARAMETER(VectorType) v, int amount)
    {
        switch (amount) {
        case 0: return v;
        case 1: return avx_cast<VectorType>(_mm256_srli_si256(avx_cast<m256i>(v), 1 * sizeof(EntryType)));
        case 2: return avx_cast<VectorType>(_mm256_srli_si256(avx_cast<m256i>(v), 2 * sizeof(EntryType)));
        case 3: return avx_cast<VectorType>(_mm256_srli_si256(avx_cast<m256i>(v), 3 * sizeof(EntryType)));
        case 4: return avx_cast<VectorType>(_mm256_srli_si256(avx_cast<m256i>(v), 4 * sizeof(EntryType)));
        case 5: return avx_cast<VectorType>(_mm256_srli_si256(avx_cast<m256i>(v), 5 * sizeof(EntryType)));
        case 6: return avx_cast<VectorType>(_mm256_srli_si256(avx_cast<m256i>(v), 6 * sizeof(EntryType)));
        case 7: return avx_cast<VectorType>(_mm256_srli_si256(avx_cast<m256i>(v), 7 * sizeof(EntryType)));
        case -1: return avx_cast<VectorType>(_mm256_slli_si256(avx_cast<m256i>(v), 1 * sizeof(EntryType)));
        case -2: return avx_cast<VectorType>(_mm256_slli_si256(avx_cast<m256i>(v), 2 * sizeof(EntryType)));
        case -3: return avx_cast<VectorType>(_mm256_slli_si256(avx_cast<m256i>(v), 3 * sizeof(EntryType)));
        case -4: return avx_cast<VectorType>(_mm256_slli_si256(avx_cast<m256i>(v), 4 * sizeof(EntryType)));
        case -5: return avx_cast<VectorType>(_mm256_slli_si256(avx_cast<m256i>(v), 5 * sizeof(EntryType)));
        case -6: return avx_cast<VectorType>(_mm256_slli_si256(avx_cast<m256i>(v), 6 * sizeof(EntryType)));
        case -7: return avx_cast<VectorType>(_mm256_slli_si256(avx_cast<m256i>(v), 7 * sizeof(EntryType)));
        }
        return avx_cast<VectorType>(_mm256_setzero_ps()); // |amount| >= Size
    }
};
// 128-bit register with 8 entries (16-bit entry types, e.g. short_v/ushort_v).
template<typename VectorType, typename EntryType> struct VectorShift<16, 8, VectorType, EntryType>
{
    // named constant because the intrinsics require an immediate byte count
    enum {
        EntryTypeSizeof = sizeof(EntryType)
    };
    // Zero-filling entry shift; positive amount moves entries towards index 0.
    static Vc_INTRINSIC VectorType shifted(VC_ALIGNED_PARAMETER(VectorType) v, int amount)
    {
        switch (amount) {
        case 0: return v;
        case 1: return avx_cast<VectorType>(_mm_srli_si128(avx_cast<m128i>(v), 1 * EntryTypeSizeof));
        case 2: return avx_cast<VectorType>(_mm_srli_si128(avx_cast<m128i>(v), 2 * EntryTypeSizeof));
        case 3: return avx_cast<VectorType>(_mm_srli_si128(avx_cast<m128i>(v), 3 * EntryTypeSizeof));
        case 4: return avx_cast<VectorType>(_mm_srli_si128(avx_cast<m128i>(v), 4 * EntryTypeSizeof));
        case 5: return avx_cast<VectorType>(_mm_srli_si128(avx_cast<m128i>(v), 5 * EntryTypeSizeof));
        case 6: return avx_cast<VectorType>(_mm_srli_si128(avx_cast<m128i>(v), 6 * EntryTypeSizeof));
        case 7: return avx_cast<VectorType>(_mm_srli_si128(avx_cast<m128i>(v), 7 * EntryTypeSizeof));
        case -1: return avx_cast<VectorType>(_mm_slli_si128(avx_cast<m128i>(v), 1 * EntryTypeSizeof));
        case -2: return avx_cast<VectorType>(_mm_slli_si128(avx_cast<m128i>(v), 2 * EntryTypeSizeof));
        case -3: return avx_cast<VectorType>(_mm_slli_si128(avx_cast<m128i>(v), 3 * EntryTypeSizeof));
        case -4: return avx_cast<VectorType>(_mm_slli_si128(avx_cast<m128i>(v), 4 * EntryTypeSizeof));
        case -5: return avx_cast<VectorType>(_mm_slli_si128(avx_cast<m128i>(v), 5 * EntryTypeSizeof));
        case -6: return avx_cast<VectorType>(_mm_slli_si128(avx_cast<m128i>(v), 6 * EntryTypeSizeof));
        case -7: return avx_cast<VectorType>(_mm_slli_si128(avx_cast<m128i>(v), 7 * EntryTypeSizeof));
        }
        return _mm_setzero_si128(); // |amount| >= Size: everything shifted out
    }
};
1314template<typename T> Vc_INTRINSIC Vector<T> Vector<T>::shifted(int amount) const
1315{
1316 return VectorShift<sizeof(VectorType), Size, VectorType, EntryType>::shifted(d.v(), amount);
1317}
// rotated(): cyclic entry rotation -- result[i] == v[(i + amount) % Size].
// Dispatch key mirrors VectorShift: (register bytes, entry count, types).
template<size_t SIMDWidth, size_t Size, typename VectorType, typename EntryType> struct VectorRotate;
// 256-bit register with 4 entries (64-bit entry type).
template<typename VectorType, typename EntryType> struct VectorRotate<32, 4, VectorType, EntryType>
{
    typedef typename SseVectorType<VectorType>::Type SmallV;
    // named constant because _mm_alignr_epi8 requires an immediate byte count
    enum {
        EntryTypeSizeof = sizeof(EntryType)
    };
    static Vc_INTRINSIC VectorType rotated(VC_ALIGNED_PARAMETER(VectorType) v, int amount)
    {
        // Rotate by stitching the two 128-bit halves together with alignr;
        // a rotation by Size/2 is just a swap of the halves.
        const m128i vLo = avx_cast<m128i>(lo128(v));
        const m128i vHi = avx_cast<m128i>(hi128(v));
        switch (static_cast<unsigned int>(amount) % 4) {
        case 0: return v;
        case 1: return concat(avx_cast<SmallV>(_mm_alignr_epi8(vHi, vLo, 1 * EntryTypeSizeof)), avx_cast<SmallV>(_mm_alignr_epi8(vLo, vHi, 1 * EntryTypeSizeof)));
        case 2: return Mem::permute128<X1, X0>(v);
        case 3: return concat(avx_cast<SmallV>(_mm_alignr_epi8(vLo, vHi, 1 * EntryTypeSizeof)), avx_cast<SmallV>(_mm_alignr_epi8(vHi, vLo, 1 * EntryTypeSizeof)));
        }
        return _mm256_setzero_pd(); // unreachable: amount % 4 is always 0..3
    }
};
// 256-bit register with 8 entries (32-bit entry types).
// result[i] == v[(i + amount) % 8].
template<typename VectorType, typename EntryType> struct VectorRotate<32, 8, VectorType, EntryType>
{
    typedef typename SseVectorType<VectorType>::Type SmallV;
    // named constant because _mm_alignr_epi8 requires an immediate byte count
    enum {
        EntryTypeSizeof = sizeof(EntryType)
    };
    static Vc_INTRINSIC VectorType rotated(VC_ALIGNED_PARAMETER(VectorType) v, int amount)
    {
        // Rotations of 1..3 stitch the halves with alignr, 4 swaps the
        // halves, and 5..7 are the mirrored cases with the halves exchanged.
        const m128i vLo = avx_cast<m128i>(lo128(v));
        const m128i vHi = avx_cast<m128i>(hi128(v));
        switch (static_cast<unsigned int>(amount) % 8) {
        case 0: return v;
        case 1: return concat(avx_cast<SmallV>(_mm_alignr_epi8(vHi, vLo, 1 * EntryTypeSizeof)), avx_cast<SmallV>(_mm_alignr_epi8(vLo, vHi, 1 * EntryTypeSizeof)));
        case 2: return concat(avx_cast<SmallV>(_mm_alignr_epi8(vHi, vLo, 2 * EntryTypeSizeof)), avx_cast<SmallV>(_mm_alignr_epi8(vLo, vHi, 2 * EntryTypeSizeof)));
        case 3: return concat(avx_cast<SmallV>(_mm_alignr_epi8(vHi, vLo, 3 * EntryTypeSizeof)), avx_cast<SmallV>(_mm_alignr_epi8(vLo, vHi, 3 * EntryTypeSizeof)));
        case 4: return Mem::permute128<X1, X0>(v);
        case 5: return concat(avx_cast<SmallV>(_mm_alignr_epi8(vLo, vHi, 1 * EntryTypeSizeof)), avx_cast<SmallV>(_mm_alignr_epi8(vHi, vLo, 1 * EntryTypeSizeof)));
        case 6: return concat(avx_cast<SmallV>(_mm_alignr_epi8(vLo, vHi, 2 * EntryTypeSizeof)), avx_cast<SmallV>(_mm_alignr_epi8(vHi, vLo, 2 * EntryTypeSizeof)));
        case 7: return concat(avx_cast<SmallV>(_mm_alignr_epi8(vLo, vHi, 3 * EntryTypeSizeof)), avx_cast<SmallV>(_mm_alignr_epi8(vHi, vLo, 3 * EntryTypeSizeof)));
        }
        return avx_cast<VectorType>(_mm256_setzero_ps()); // unreachable
    }
};
// 128-bit register with 8 entries (16-bit entry types).
// A single-register rotation: alignr on (v, v) rotates the bytes directly.
template<typename VectorType, typename EntryType> struct VectorRotate<16, 8, VectorType, EntryType>
{
    // named constant because _mm_alignr_epi8 requires an immediate byte count
    enum {
        EntryTypeSizeof = sizeof(EntryType)
    };
    static Vc_INTRINSIC VectorType rotated(VC_ALIGNED_PARAMETER(VectorType) v, int amount)
    {
        switch (static_cast<unsigned int>(amount) % 8) {
        case 0: return v;
        case 1: return avx_cast<VectorType>(_mm_alignr_epi8(v, v, 1 * EntryTypeSizeof));
        case 2: return avx_cast<VectorType>(_mm_alignr_epi8(v, v, 2 * EntryTypeSizeof));
        case 3: return avx_cast<VectorType>(_mm_alignr_epi8(v, v, 3 * EntryTypeSizeof));
        case 4: return avx_cast<VectorType>(_mm_alignr_epi8(v, v, 4 * EntryTypeSizeof));
        case 5: return avx_cast<VectorType>(_mm_alignr_epi8(v, v, 5 * EntryTypeSizeof));
        case 6: return avx_cast<VectorType>(_mm_alignr_epi8(v, v, 6 * EntryTypeSizeof));
        case 7: return avx_cast<VectorType>(_mm_alignr_epi8(v, v, 7 * EntryTypeSizeof));
        }
        return _mm_setzero_si128(); // unreachable: amount % 8 is always 0..7
    }
};
1381template<typename T> Vc_INTRINSIC Vector<T> Vector<T>::rotated(int amount) const
1382{
1383 return VectorRotate<sizeof(VectorType), Size, VectorType, EntryType>::rotated(d.v(), amount);
1384 /*
1385 const m128i v0 = avx_cast<m128i>(d.v()[0]);
1386 const m128i v1 = avx_cast<m128i>(d.v()[1]);
1387 switch (static_cast<unsigned int>(amount) % Size) {
1388 case 0: return *this;
1389 case 1: return concat(avx_cast<m128>(_mm_alignr_epi8(v1, v0, 1 * sizeof(EntryType))), avx_cast<m128>(_mm_alignr_epi8(v0, v1, 1 * sizeof(EntryType))));
1390 case 2: return concat(avx_cast<m128>(_mm_alignr_epi8(v1, v0, 2 * sizeof(EntryType))), avx_cast<m128>(_mm_alignr_epi8(v0, v1, 2 * sizeof(EntryType))));
1391 case 3: return concat(avx_cast<m128>(_mm_alignr_epi8(v1, v0, 3 * sizeof(EntryType))), avx_cast<m128>(_mm_alignr_epi8(v0, v1, 3 * sizeof(EntryType))));
1392 case 4: return concat(d.v()[1], d.v()[0]);
1393 case 5: return concat(avx_cast<m128>(_mm_alignr_epi8(v0, v1, 1 * sizeof(EntryType))), avx_cast<m128>(_mm_alignr_epi8(v1, v0, 1 * sizeof(EntryType))));
1394 case 6: return concat(avx_cast<m128>(_mm_alignr_epi8(v0, v1, 2 * sizeof(EntryType))), avx_cast<m128>(_mm_alignr_epi8(v1, v0, 2 * sizeof(EntryType))));
1395 case 7: return concat(avx_cast<m128>(_mm_alignr_epi8(v0, v1, 3 * sizeof(EntryType))), avx_cast<m128>(_mm_alignr_epi8(v1, v0, 3 * sizeof(EntryType))));
1396 }
1397 */
1398}
1399// }}}1
f22341db 1400} // namespace AVX
1401} // namespace Vc
c017a39f 1402} // namespace AliRoot
f22341db 1403
1404#include "undomacros.h"
1405
1406// vim: foldmethod=marker