]>
Commit | Line | Data |
---|---|---|
f22341db | 1 | /* This file is part of the Vc library. |
2 | ||
3 | Copyright (C) 2011-2012 Matthias Kretz <kretz@kde.org> | |
4 | ||
5 | Vc is free software: you can redistribute it and/or modify | |
6 | it under the terms of the GNU Lesser General Public License as | |
7 | published by the Free Software Foundation, either version 3 of | |
8 | the License, or (at your option) any later version. | |
9 | ||
10 | Vc is distributed in the hope that it will be useful, but | |
11 | WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 | GNU Lesser General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU Lesser General Public | |
16 | License along with Vc. If not, see <http://www.gnu.org/licenses/>. | |
17 | ||
18 | */ | |
19 | ||
20 | #include "limits.h" | |
21 | #include "const.h" | |
22 | #include "macros.h" | |
23 | ||
c017a39f | 24 | namespace AliRoot { |
f22341db | 25 | namespace Vc |
26 | { | |
27 | ALIGN(64) extern unsigned int RandomState[16]; | |
28 | ||
29 | namespace AVX | |
30 | { | |
31 | ||
32 | /////////////////////////////////////////////////////////////////////////////////////////// | |
33 | // constants {{{1 | |
c017a39f | 34 | template<typename T> Vc_ALWAYS_INLINE Vector<T>::Vector(VectorSpecialInitializerZero::ZEnum) : d(HT::zero()) {} |
35 | template<typename T> Vc_ALWAYS_INLINE Vector<T>::Vector(VectorSpecialInitializerOne::OEnum) : d(HT::one()) {} | |
36 | template<typename T> Vc_ALWAYS_INLINE Vector<T>::Vector(VectorSpecialInitializerIndexesFromZero::IEnum) | |
f22341db | 37 | : d(HV::load(IndexesFromZeroData<T>::address(), Aligned)) {} |
38 | ||
c017a39f | 39 | template<typename T> Vc_INTRINSIC Vector<T> Vc_CONST Vector<T>::Zero() { return HT::zero(); } |
40 | template<typename T> Vc_INTRINSIC Vector<T> Vc_CONST Vector<T>::One() { return HT::one(); } | |
41 | template<typename T> Vc_INTRINSIC Vector<T> Vc_CONST Vector<T>::IndexesFromZero() { return HV::load(IndexesFromZeroData<T>::address(), Aligned); } | |
f22341db | 42 | |
c017a39f | 43 | template<typename T> template<typename T2> Vc_ALWAYS_INLINE Vector<T>::Vector(VC_ALIGNED_PARAMETER(Vector<T2>) x) |
f22341db | 44 | : d(StaticCastHelper<T2, T>::cast(x.data())) {} |
45 | ||
c017a39f | 46 | template<typename T> Vc_ALWAYS_INLINE Vector<T>::Vector(EntryType x) : d(HT::set(x)) {} |
47 | template<> Vc_ALWAYS_INLINE Vector<double>::Vector(EntryType x) : d(_mm256_set1_pd(x)) {} | |
f22341db | 48 | |
49 | ||
50 | /////////////////////////////////////////////////////////////////////////////////////////// | |
51 | // load ctors {{{1 | |
c017a39f | 52 | template<typename T> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *x) { load(x); } |
53 | template<typename T> template<typename A> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *x, A a) { load(x, a); } | |
54 | template<typename T> template<typename OtherT> Vc_ALWAYS_INLINE Vector<T>::Vector(const OtherT *x) { load(x); } | |
55 | template<typename T> template<typename OtherT, typename A> Vc_ALWAYS_INLINE Vector<T>::Vector(const OtherT *x, A a) { load(x, a); } | |
f22341db | 56 | |
57 | /////////////////////////////////////////////////////////////////////////////////////////// | |
58 | // load member functions {{{1 | |
c017a39f | 59 | template<typename T> Vc_INTRINSIC void Vector<T>::load(const EntryType *mem) |
f22341db | 60 | { |
61 | load(mem, Aligned); | |
62 | } | |
63 | ||
c017a39f | 64 | template<typename T> template<typename A> Vc_INTRINSIC void Vector<T>::load(const EntryType *mem, A align) |
f22341db | 65 | { |
66 | d.v() = HV::load(mem, align); | |
67 | } | |
68 | ||
c017a39f | 69 | template<typename T> template<typename OtherT> Vc_INTRINSIC void Vector<T>::load(const OtherT *mem) |
f22341db | 70 | { |
71 | load(mem, Aligned); | |
72 | } | |
73 | ||
74 | // LoadHelper {{{2 | |
75 | template<typename DstT, typename SrcT, typename Flags> struct LoadHelper; | |
76 | ||
77 | // float {{{2 | |
78 | template<typename Flags> struct LoadHelper<float, double, Flags> { | |
c017a39f | 79 | static m256 load(const double *mem, Flags f) |
f22341db | 80 | { |
c017a39f | 81 | return concat(_mm256_cvtpd_ps(VectorHelper<m256d>::load(&mem[0], f)), |
82 | _mm256_cvtpd_ps(VectorHelper<m256d>::load(&mem[4], f))); | |
f22341db | 83 | } |
84 | }; | |
85 | template<typename Flags> struct LoadHelper<float, unsigned int, Flags> { | |
c017a39f | 86 | static m256 load(const unsigned int *mem, Flags f) |
f22341db | 87 | { |
c017a39f | 88 | return StaticCastHelper<unsigned int, float>::cast(VectorHelper<m256i>::load(mem, f)); |
f22341db | 89 | } |
90 | }; | |
91 | template<typename Flags> struct LoadHelper<float, int, Flags> { | |
c017a39f | 92 | static m256 load(const int *mem, Flags f) |
f22341db | 93 | { |
c017a39f | 94 | return StaticCastHelper<int, float>::cast(VectorHelper<m256i>::load(mem, f)); |
f22341db | 95 | } |
96 | }; | |
97 | template<typename Flags> struct LoadHelper<float, unsigned short, Flags> { | |
c017a39f | 98 | static m256 load(const unsigned short *mem, Flags f) |
f22341db | 99 | { |
c017a39f | 100 | return StaticCastHelper<unsigned short, float>::cast(VectorHelper<m128i>::load(mem, f)); |
f22341db | 101 | } |
102 | }; | |
103 | template<typename Flags> struct LoadHelper<float, short, Flags> { | |
c017a39f | 104 | static m256 load(const short *mem, Flags f) |
f22341db | 105 | { |
c017a39f | 106 | return StaticCastHelper<short, float>::cast(VectorHelper<m128i>::load(mem, f)); |
f22341db | 107 | } |
108 | }; | |
109 | template<typename Flags> struct LoadHelper<float, unsigned char, Flags> { | |
c017a39f | 110 | static m256 load(const unsigned char *mem, Flags f) |
f22341db | 111 | { |
112 | return StaticCastHelper<unsigned int, float>::cast(LoadHelper<unsigned int, unsigned char, Flags>::load(mem, f)); | |
113 | } | |
114 | }; | |
115 | template<typename Flags> struct LoadHelper<float, signed char, Flags> { | |
c017a39f | 116 | static m256 load(const signed char *mem, Flags f) |
f22341db | 117 | { |
118 | return StaticCastHelper<int, float>::cast(LoadHelper<int, signed char, Flags>::load(mem, f)); | |
119 | } | |
120 | }; | |
121 | ||
122 | template<typename SrcT, typename Flags> struct LoadHelper<sfloat, SrcT, Flags> : public LoadHelper<float, SrcT, Flags> {}; | |
123 | ||
124 | // int {{{2 | |
125 | template<typename Flags> struct LoadHelper<int, unsigned int, Flags> { | |
c017a39f | 126 | static m256i load(const unsigned int *mem, Flags f) |
f22341db | 127 | { |
c017a39f | 128 | return VectorHelper<m256i>::load(mem, f); |
f22341db | 129 | } |
130 | }; | |
131 | template<typename Flags> struct LoadHelper<int, unsigned short, Flags> { | |
c017a39f | 132 | static m256i load(const unsigned short *mem, Flags f) |
f22341db | 133 | { |
c017a39f | 134 | return StaticCastHelper<unsigned short, unsigned int>::cast(VectorHelper<m128i>::load(mem, f)); |
f22341db | 135 | } |
136 | }; | |
137 | template<typename Flags> struct LoadHelper<int, short, Flags> { | |
c017a39f | 138 | static m256i load(const short *mem, Flags f) |
f22341db | 139 | { |
c017a39f | 140 | return StaticCastHelper<short, int>::cast(VectorHelper<m128i>::load(mem, f)); |
f22341db | 141 | } |
142 | }; | |
143 | template<typename Flags> struct LoadHelper<int, unsigned char, Flags> { | |
c017a39f | 144 | static m256i load(const unsigned char *mem, Flags) |
f22341db | 145 | { |
146 | // the only available streaming load loads 16 bytes - twice as much as we need => can't use | |
147 | // it, or we risk an out-of-bounds read and an unaligned load exception | |
c017a39f | 148 | const m128i epu8 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)); |
149 | const m128i epu16 = _mm_cvtepu8_epi16(epu8); | |
f22341db | 150 | return StaticCastHelper<unsigned short, unsigned int>::cast(epu16); |
151 | } | |
152 | }; | |
153 | template<typename Flags> struct LoadHelper<int, signed char, Flags> { | |
c017a39f | 154 | static m256i load(const signed char *mem, Flags) |
f22341db | 155 | { |
156 | // the only available streaming load loads 16 bytes - twice as much as we need => can't use | |
157 | // it, or we risk an out-of-bounds read and an unaligned load exception | |
c017a39f | 158 | const m128i epi8 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)); |
159 | const m128i epi16 = _mm_cvtepi8_epi16(epi8); | |
f22341db | 160 | return StaticCastHelper<short, int>::cast(epi16); |
161 | } | |
162 | }; | |
163 | ||
164 | // unsigned int {{{2 | |
165 | template<typename Flags> struct LoadHelper<unsigned int, unsigned short, Flags> { | |
c017a39f | 166 | static m256i load(const unsigned short *mem, Flags f) |
f22341db | 167 | { |
c017a39f | 168 | return StaticCastHelper<unsigned short, unsigned int>::cast(VectorHelper<m128i>::load(mem, f)); |
f22341db | 169 | } |
170 | }; | |
171 | template<typename Flags> struct LoadHelper<unsigned int, unsigned char, Flags> { | |
c017a39f | 172 | static m256i load(const unsigned char *mem, Flags) |
f22341db | 173 | { |
174 | // the only available streaming load loads 16 bytes - twice as much as we need => can't use | |
175 | // it, or we risk an out-of-bounds read and an unaligned load exception | |
c017a39f | 176 | const m128i epu8 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)); |
177 | const m128i epu16 = _mm_cvtepu8_epi16(epu8); | |
f22341db | 178 | return StaticCastHelper<unsigned short, unsigned int>::cast(epu16); |
179 | } | |
180 | }; | |
181 | ||
182 | // short {{{2 | |
183 | template<typename Flags> struct LoadHelper<short, unsigned short, Flags> { | |
c017a39f | 184 | static m128i load(const unsigned short *mem, Flags f) |
f22341db | 185 | { |
c017a39f | 186 | return StaticCastHelper<unsigned short, short>::cast(VectorHelper<m128i>::load(mem, f)); |
f22341db | 187 | } |
188 | }; | |
189 | template<typename Flags> struct LoadHelper<short, unsigned char, Flags> { | |
c017a39f | 190 | static m128i load(const unsigned char *mem, Flags) |
f22341db | 191 | { |
192 | // the only available streaming load loads 16 bytes - twice as much as we need => can't use | |
193 | // it, or we risk an out-of-bounds read and an unaligned load exception | |
c017a39f | 194 | const m128i epu8 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)); |
f22341db | 195 | return _mm_cvtepu8_epi16(epu8); |
196 | } | |
197 | }; | |
198 | template<typename Flags> struct LoadHelper<short, signed char, Flags> { | |
c017a39f | 199 | static m128i load(const signed char *mem, Flags) |
f22341db | 200 | { |
201 | // the only available streaming load loads 16 bytes - twice as much as we need => can't use | |
202 | // it, or we risk an out-of-bounds read and an unaligned load exception | |
c017a39f | 203 | const m128i epi8 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)); |
f22341db | 204 | return _mm_cvtepi8_epi16(epi8); |
205 | } | |
206 | }; | |
207 | ||
208 | // unsigned short {{{2 | |
209 | template<typename Flags> struct LoadHelper<unsigned short, unsigned char, Flags> { | |
c017a39f | 210 | static m128i load(const unsigned char *mem, Flags) |
f22341db | 211 | { |
212 | // the only available streaming load loads 16 bytes - twice as much as we need => can't use | |
213 | // it, or we risk an out-of-bounds read and an unaligned load exception | |
c017a39f | 214 | const m128i epu8 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)); |
f22341db | 215 | return _mm_cvtepu8_epi16(epu8); |
216 | } | |
217 | }; | |
218 | ||
219 | // general load, implemented via LoadHelper {{{2 | |
c017a39f | 220 | template<typename DstT> template<typename SrcT, typename Flags> Vc_INTRINSIC void Vector<DstT>::load(const SrcT *x, Flags f) |
f22341db | 221 | { |
222 | d.v() = LoadHelper<DstT, SrcT, Flags>::load(x, f); | |
223 | } | |
224 | ||
225 | /////////////////////////////////////////////////////////////////////////////////////////// | |
226 | // zeroing {{{1 | |
c017a39f | 227 | template<typename T> Vc_INTRINSIC void Vector<T>::setZero() |
f22341db | 228 | { |
229 | data() = HV::zero(); | |
230 | } | |
c017a39f | 231 | template<typename T> Vc_INTRINSIC void Vector<T>::setZero(const Mask &k) |
f22341db | 232 | { |
233 | data() = HV::andnot_(avx_cast<VectorType>(k.data()), data()); | |
234 | } | |
235 | ||
c017a39f | 236 | template<> Vc_INTRINSIC void Vector<double>::setQnan() |
f22341db | 237 | { |
238 | data() = _mm256_setallone_pd(); | |
239 | } | |
c017a39f | 240 | template<> Vc_INTRINSIC void Vector<double>::setQnan(MaskArg k) |
f22341db | 241 | { |
242 | data() = _mm256_or_pd(data(), k.dataD()); | |
243 | } | |
c017a39f | 244 | template<> Vc_INTRINSIC void Vector<float>::setQnan() |
f22341db | 245 | { |
246 | data() = _mm256_setallone_ps(); | |
247 | } | |
c017a39f | 248 | template<> Vc_INTRINSIC void Vector<float>::setQnan(MaskArg k) |
f22341db | 249 | { |
250 | data() = _mm256_or_ps(data(), k.data()); | |
251 | } | |
c017a39f | 252 | template<> Vc_INTRINSIC void Vector<sfloat>::setQnan() |
f22341db | 253 | { |
254 | data() = _mm256_setallone_ps(); | |
255 | } | |
c017a39f | 256 | template<> Vc_INTRINSIC void Vector<sfloat>::setQnan(MaskArg k) |
f22341db | 257 | { |
258 | data() = _mm256_or_ps(data(), k.data()); | |
259 | } | |
260 | ||
261 | /////////////////////////////////////////////////////////////////////////////////////////// | |
262 | // stores {{{1 | |
c017a39f | 263 | template<typename T> Vc_INTRINSIC void Vector<T>::store(EntryType *mem) const |
f22341db | 264 | { |
265 | HV::store(mem, data(), Aligned); | |
266 | } | |
c017a39f | 267 | template<typename T> Vc_INTRINSIC void Vector<T>::store(EntryType *mem, const Mask &mask) const |
f22341db | 268 | { |
269 | HV::store(mem, data(), avx_cast<VectorType>(mask.data()), Aligned); | |
270 | } | |
c017a39f | 271 | template<typename T> template<typename A> Vc_INTRINSIC void Vector<T>::store(EntryType *mem, A align) const |
f22341db | 272 | { |
273 | HV::store(mem, data(), align); | |
274 | } | |
c017a39f | 275 | template<typename T> template<typename A> Vc_INTRINSIC void Vector<T>::store(EntryType *mem, const Mask &mask, A align) const |
f22341db | 276 | { |
277 | HV::store(mem, data(), avx_cast<VectorType>(mask.data()), align); | |
278 | } | |
279 | ||
280 | /////////////////////////////////////////////////////////////////////////////////////////// | |
281 | // expand/merge 1 float_v <=> 2 double_v XXX rationale? remove it for release? XXX {{{1 | |
c017a39f | 282 | template<typename T> Vc_ALWAYS_INLINE Vc_FLATTEN Vector<T>::Vector(const Vector<typename HT::ConcatType> *a) |
f22341db | 283 | : d(a[0]) |
284 | { | |
285 | } | |
c017a39f | 286 | template<> Vc_ALWAYS_INLINE Vc_FLATTEN Vector<float>::Vector(const Vector<HT::ConcatType> *a) |
f22341db | 287 | : d(concat(_mm256_cvtpd_ps(a[0].data()), _mm256_cvtpd_ps(a[1].data()))) |
288 | { | |
289 | } | |
c017a39f | 290 | template<> Vc_ALWAYS_INLINE Vc_FLATTEN Vector<short>::Vector(const Vector<HT::ConcatType> *a) |
f22341db | 291 | : d(_mm_packs_epi32(lo128(a->data()), hi128(a->data()))) |
292 | { | |
293 | } | |
c017a39f | 294 | template<> Vc_ALWAYS_INLINE Vc_FLATTEN Vector<unsigned short>::Vector(const Vector<HT::ConcatType> *a) |
f22341db | 295 | : d(_mm_packus_epi32(lo128(a->data()), hi128(a->data()))) |
296 | { | |
297 | } | |
c017a39f | 298 | template<typename T> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::expand(Vector<typename HT::ConcatType> *x) const |
f22341db | 299 | { |
300 | x[0] = *this; | |
301 | } | |
c017a39f | 302 | template<> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<float>::expand(Vector<HT::ConcatType> *x) const |
f22341db | 303 | { |
304 | x[0].data() = _mm256_cvtps_pd(lo128(d.v())); | |
305 | x[1].data() = _mm256_cvtps_pd(hi128(d.v())); | |
306 | } | |
c017a39f | 307 | template<> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<short>::expand(Vector<HT::ConcatType> *x) const |
f22341db | 308 | { |
309 | x[0].data() = concat(_mm_cvtepi16_epi32(d.v()), | |
310 | _mm_cvtepi16_epi32(_mm_unpackhi_epi64(d.v(), d.v()))); | |
311 | } | |
c017a39f | 312 | template<> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned short>::expand(Vector<HT::ConcatType> *x) const |
f22341db | 313 | { |
314 | x[0].data() = concat(_mm_cvtepu16_epi32(d.v()), | |
315 | _mm_cvtepu16_epi32(_mm_unpackhi_epi64(d.v(), d.v()))); | |
316 | } | |
317 | ||
318 | /////////////////////////////////////////////////////////////////////////////////////////// | |
319 | // swizzles {{{1 | |
c017a39f | 320 | template<typename T> Vc_INTRINSIC const Vector<T> Vc_PURE &Vector<T>::abcd() const { return *this; } |
321 | template<typename T> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::cdab() const { return Mem::permute<X2, X3, X0, X1>(data()); } | |
322 | template<typename T> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::badc() const { return Mem::permute<X1, X0, X3, X2>(data()); } | |
323 | template<typename T> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::aaaa() const { return Mem::permute<X0, X0, X0, X0>(data()); } | |
324 | template<typename T> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::bbbb() const { return Mem::permute<X1, X1, X1, X1>(data()); } | |
325 | template<typename T> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::cccc() const { return Mem::permute<X2, X2, X2, X2>(data()); } | |
326 | template<typename T> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::dddd() const { return Mem::permute<X3, X3, X3, X3>(data()); } | |
327 | template<typename T> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::bcad() const { return Mem::permute<X1, X2, X0, X3>(data()); } | |
328 | template<typename T> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::bcda() const { return Mem::permute<X1, X2, X3, X0>(data()); } | |
329 | template<typename T> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::dabc() const { return Mem::permute<X3, X0, X1, X2>(data()); } | |
330 | template<typename T> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::acbd() const { return Mem::permute<X0, X2, X1, X3>(data()); } | |
331 | template<typename T> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::dbca() const { return Mem::permute<X3, X1, X2, X0>(data()); } | |
332 | template<typename T> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::dcba() const { return Mem::permute<X3, X2, X1, X0>(data()); } | |
f22341db | 333 | |
c017a39f | 334 | template<> Vc_INTRINSIC const double_v Vc_PURE Vector<double>::cdab() const { return Mem::shuffle128<X1, X0>(data(), data()); } |
335 | template<> Vc_INTRINSIC const double_v Vc_PURE Vector<double>::badc() const { return Mem::permute<X1, X0, X3, X2>(data()); } | |
336 | template<> Vc_INTRINSIC const double_v Vc_PURE Vector<double>::aaaa() const { const double &tmp = d.m(0); return _mm256_broadcast_sd(&tmp); } | |
337 | template<> Vc_INTRINSIC const double_v Vc_PURE Vector<double>::bbbb() const { const double &tmp = d.m(1); return _mm256_broadcast_sd(&tmp); } | |
338 | template<> Vc_INTRINSIC const double_v Vc_PURE Vector<double>::cccc() const { const double &tmp = d.m(2); return _mm256_broadcast_sd(&tmp); } | |
339 | template<> Vc_INTRINSIC const double_v Vc_PURE Vector<double>::dddd() const { const double &tmp = d.m(3); return _mm256_broadcast_sd(&tmp); } | |
340 | template<> Vc_INTRINSIC const double_v Vc_PURE Vector<double>::bcad() const { return Mem::shuffle<X1, Y0, X2, Y3>(Mem::shuffle128<X0, X0>(data(), data()), Mem::shuffle128<X1, X1>(data(), data())); } | |
341 | template<> Vc_INTRINSIC const double_v Vc_PURE Vector<double>::bcda() const { return Mem::shuffle<X1, Y0, X3, Y2>(data(), Mem::shuffle128<X1, X0>(data(), data())); } | |
342 | template<> Vc_INTRINSIC const double_v Vc_PURE Vector<double>::dabc() const { return Mem::shuffle<X1, Y0, X3, Y2>(Mem::shuffle128<X1, X0>(data(), data()), data()); } | |
343 | template<> Vc_INTRINSIC const double_v Vc_PURE Vector<double>::acbd() const { return Mem::shuffle<X0, Y0, X3, Y3>(Mem::shuffle128<X0, X0>(data(), data()), Mem::shuffle128<X1, X1>(data(), data())); } | |
344 | template<> Vc_INTRINSIC const double_v Vc_PURE Vector<double>::dbca() const { return Mem::shuffle<X1, Y1, X2, Y2>(Mem::shuffle128<X1, X1>(data(), data()), Mem::shuffle128<X0, X0>(data(), data())); } | |
345 | template<> Vc_INTRINSIC const double_v Vc_PURE Vector<double>::dcba() const { return cdab().badc(); } | |
f22341db | 346 | |
347 | #define VC_SWIZZLES_16BIT_IMPL(T) \ | |
c017a39f | 348 | template<> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::cdab() const { return Mem::permute<X2, X3, X0, X1, X6, X7, X4, X5>(data()); } \ |
349 | template<> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::badc() const { return Mem::permute<X1, X0, X3, X2, X5, X4, X7, X6>(data()); } \ | |
350 | template<> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::aaaa() const { return Mem::permute<X0, X0, X0, X0, X4, X4, X4, X4>(data()); } \ | |
351 | template<> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::bbbb() const { return Mem::permute<X1, X1, X1, X1, X5, X5, X5, X5>(data()); } \ | |
352 | template<> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::cccc() const { return Mem::permute<X2, X2, X2, X2, X6, X6, X6, X6>(data()); } \ | |
353 | template<> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::dddd() const { return Mem::permute<X3, X3, X3, X3, X7, X7, X7, X7>(data()); } \ | |
354 | template<> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::bcad() const { return Mem::permute<X1, X2, X0, X3, X5, X6, X4, X7>(data()); } \ | |
355 | template<> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::bcda() const { return Mem::permute<X1, X2, X3, X0, X5, X6, X7, X4>(data()); } \ | |
356 | template<> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::dabc() const { return Mem::permute<X3, X0, X1, X2, X7, X4, X5, X6>(data()); } \ | |
357 | template<> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::acbd() const { return Mem::permute<X0, X2, X1, X3, X4, X6, X5, X7>(data()); } \ | |
358 | template<> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::dbca() const { return Mem::permute<X3, X1, X2, X0, X7, X5, X6, X4>(data()); } \ | |
359 | template<> Vc_INTRINSIC const Vector<T> Vc_PURE Vector<T>::dcba() const { return Mem::permute<X3, X2, X1, X0, X7, X6, X5, X4>(data()); } | |
f22341db | 360 | VC_SWIZZLES_16BIT_IMPL(short) |
361 | VC_SWIZZLES_16BIT_IMPL(unsigned short) | |
362 | #undef VC_SWIZZLES_16BIT_IMPL | |
363 | ||
364 | /////////////////////////////////////////////////////////////////////////////////////////// | |
365 | // division {{{1 | |
366 | template<typename T> inline Vector<T> &Vector<T>::operator/=(EntryType x) | |
367 | { | |
368 | if (HasVectorDivision) { | |
369 | return operator/=(Vector<T>(x)); | |
370 | } | |
371 | for_all_vector_entries(i, | |
372 | d.m(i) /= x; | |
373 | ); | |
374 | return *this; | |
375 | } | |
c017a39f | 376 | template<typename T> template<typename TT> inline Vc_PURE VC_EXACT_TYPE(TT, typename DetermineEntryType<T>::Type, Vector<T>) Vector<T>::operator/(TT x) const |
f22341db | 377 | { |
378 | if (HasVectorDivision) { | |
379 | return operator/(Vector<T>(x)); | |
380 | } | |
381 | Vector<T> r; | |
382 | for_all_vector_entries(i, | |
383 | r.d.m(i) = d.m(i) / x; | |
384 | ); | |
385 | return r; | |
386 | } | |
387 | // per default fall back to scalar division | |
388 | template<typename T> inline Vector<T> &Vector<T>::operator/=(const Vector<T> &x) | |
389 | { | |
390 | for_all_vector_entries(i, | |
391 | d.m(i) /= x.d.m(i); | |
392 | ); | |
393 | return *this; | |
394 | } | |
395 | ||
c017a39f | 396 | template<typename T> inline Vector<T> Vc_PURE Vector<T>::operator/(const Vector<T> &x) const |
f22341db | 397 | { |
398 | Vector<T> r; | |
399 | for_all_vector_entries(i, | |
400 | r.d.m(i) = d.m(i) / x.d.m(i); | |
401 | ); | |
402 | return r; | |
403 | } | |
404 | // specialize division on type | |
c017a39f | 405 | static Vc_INTRINSIC m256i Vc_CONST divInt(param256i a, param256i b) { |
406 | const m256d lo1 = _mm256_cvtepi32_pd(lo128(a)); | |
407 | const m256d lo2 = _mm256_cvtepi32_pd(lo128(b)); | |
408 | const m256d hi1 = _mm256_cvtepi32_pd(hi128(a)); | |
409 | const m256d hi2 = _mm256_cvtepi32_pd(hi128(b)); | |
f22341db | 410 | return concat( |
411 | _mm256_cvttpd_epi32(_mm256_div_pd(lo1, lo2)), | |
412 | _mm256_cvttpd_epi32(_mm256_div_pd(hi1, hi2)) | |
413 | ); | |
414 | } | |
415 | template<> inline Vector<int> &Vector<int>::operator/=(const Vector<int> &x) | |
416 | { | |
417 | d.v() = divInt(d.v(), x.d.v()); | |
418 | return *this; | |
419 | } | |
c017a39f | 420 | template<> inline Vector<int> Vc_PURE Vector<int>::operator/(const Vector<int> &x) const |
f22341db | 421 | { |
422 | return divInt(d.v(), x.d.v()); | |
423 | } | |
c017a39f | 424 | static inline m256i Vc_CONST divUInt(param256i a, param256i b) { |
425 | m256d loa = _mm256_cvtepi32_pd(lo128(a)); | |
426 | m256d hia = _mm256_cvtepi32_pd(hi128(a)); | |
427 | m256d lob = _mm256_cvtepi32_pd(lo128(b)); | |
428 | m256d hib = _mm256_cvtepi32_pd(hi128(b)); | |
f22341db | 429 | // if a >= 2^31 then after conversion to double it will contain a negative number (i.e. a-2^32) |
430 | // to get the right number back we have to add 2^32 where a >= 2^31 | |
431 | loa = _mm256_add_pd(loa, _mm256_and_pd(_mm256_cmp_pd(loa, _mm256_setzero_pd(), _CMP_LT_OS), _mm256_set1_pd(4294967296.))); | |
432 | hia = _mm256_add_pd(hia, _mm256_and_pd(_mm256_cmp_pd(hia, _mm256_setzero_pd(), _CMP_LT_OS), _mm256_set1_pd(4294967296.))); | |
433 | // we don't do the same for b because division by b >= 2^31 should be a seldom corner case and | |
434 | // we rather want the standard stuff fast | |
435 | // | |
436 | // there is one remaining problem: a >= 2^31 and b == 1 | |
437 | // in that case the return value would be 2^31 | |
c017a39f | 438 | return avx_cast<m256i>(_mm256_blendv_ps(avx_cast<m256>(concat( |
f22341db | 439 | _mm256_cvttpd_epi32(_mm256_div_pd(loa, lob)), |
440 | _mm256_cvttpd_epi32(_mm256_div_pd(hia, hib)) | |
c017a39f | 441 | )), avx_cast<m256>(a), avx_cast<m256>(concat( |
f22341db | 442 | _mm_cmpeq_epi32(lo128(b), _mm_setone_epi32()), |
443 | _mm_cmpeq_epi32(hi128(b), _mm_setone_epi32()))))); | |
444 | } | |
c017a39f | 445 | template<> Vc_ALWAYS_INLINE Vector<unsigned int> &Vector<unsigned int>::operator/=(const Vector<unsigned int> &x) |
f22341db | 446 | { |
447 | d.v() = divUInt(d.v(), x.d.v()); | |
448 | return *this; | |
449 | } | |
c017a39f | 450 | template<> Vc_ALWAYS_INLINE Vector<unsigned int> Vc_PURE Vector<unsigned int>::operator/(const Vector<unsigned int> &x) const |
f22341db | 451 | { |
452 | return divUInt(d.v(), x.d.v()); | |
453 | } | |
c017a39f | 454 | template<typename T> static inline m128i Vc_CONST divShort(param128i a, param128i b) |
f22341db | 455 | { |
c017a39f | 456 | const m256 r = _mm256_div_ps(StaticCastHelper<T, float>::cast(a), |
f22341db | 457 | StaticCastHelper<T, float>::cast(b)); |
458 | return StaticCastHelper<float, T>::cast(r); | |
459 | } | |
c017a39f | 460 | template<> Vc_ALWAYS_INLINE Vector<short> &Vector<short>::operator/=(const Vector<short> &x) |
f22341db | 461 | { |
462 | d.v() = divShort<short>(d.v(), x.d.v()); | |
463 | return *this; | |
464 | } | |
c017a39f | 465 | template<> Vc_ALWAYS_INLINE Vector<short> Vc_PURE Vector<short>::operator/(const Vector<short> &x) const |
f22341db | 466 | { |
467 | return divShort<short>(d.v(), x.d.v()); | |
468 | } | |
c017a39f | 469 | template<> Vc_ALWAYS_INLINE Vector<unsigned short> &Vector<unsigned short>::operator/=(const Vector<unsigned short> &x) |
f22341db | 470 | { |
471 | d.v() = divShort<unsigned short>(d.v(), x.d.v()); | |
472 | return *this; | |
473 | } | |
c017a39f | 474 | template<> Vc_ALWAYS_INLINE Vector<unsigned short> Vc_PURE Vector<unsigned short>::operator/(const Vector<unsigned short> &x) const |
f22341db | 475 | { |
476 | return divShort<unsigned short>(d.v(), x.d.v()); | |
477 | } | |
c017a39f | 478 | template<> Vc_INTRINSIC float_v &float_v::operator/=(const float_v &x) |
f22341db | 479 | { |
480 | d.v() = _mm256_div_ps(d.v(), x.d.v()); | |
481 | return *this; | |
482 | } | |
c017a39f | 483 | template<> Vc_INTRINSIC float_v Vc_PURE float_v::operator/(const float_v &x) const |
f22341db | 484 | { |
485 | return _mm256_div_ps(d.v(), x.d.v()); | |
486 | } | |
c017a39f | 487 | template<> Vc_INTRINSIC sfloat_v &sfloat_v::operator/=(const sfloat_v &x) |
488 | { | |
489 | d.v() = _mm256_div_ps(d.v(), x.d.v()); | |
490 | return *this; | |
491 | } | |
492 | template<> Vc_INTRINSIC sfloat_v Vc_PURE sfloat_v::operator/(const sfloat_v &x) const | |
493 | { | |
494 | return _mm256_div_ps(d.v(), x.d.v()); | |
495 | } | |
496 | template<> Vc_INTRINSIC double_v &double_v::operator/=(const double_v &x) | |
f22341db | 497 | { |
498 | d.v() = _mm256_div_pd(d.v(), x.d.v()); | |
499 | return *this; | |
500 | } | |
c017a39f | 501 | template<> Vc_INTRINSIC double_v Vc_PURE double_v::operator/(const double_v &x) const |
f22341db | 502 | { |
503 | return _mm256_div_pd(d.v(), x.d.v()); | |
504 | } | |
505 | ||
506 | /////////////////////////////////////////////////////////////////////////////////////////// | |
507 | // integer ops {{{1 | |
508 | #define OP_IMPL(T, symbol) \ | |
c017a39f | 509 | template<> Vc_ALWAYS_INLINE Vector<T> &Vector<T>::operator symbol##=(AsArg x) \ |
f22341db | 510 | { \ |
511 | for_all_vector_entries(i, d.m(i) symbol##= x.d.m(i); ); \ | |
512 | return *this; \ | |
513 | } \ | |
c017a39f | 514 | template<> Vc_ALWAYS_INLINE Vc_PURE Vector<T> Vector<T>::operator symbol(AsArg x) const \ |
f22341db | 515 | { \ |
516 | Vector<T> r; \ | |
517 | for_all_vector_entries(i, r.d.m(i) = d.m(i) symbol x.d.m(i); ); \ | |
518 | return r; \ | |
519 | } | |
520 | OP_IMPL(int, <<) | |
521 | OP_IMPL(int, >>) | |
522 | OP_IMPL(unsigned int, <<) | |
523 | OP_IMPL(unsigned int, >>) | |
524 | OP_IMPL(short, <<) | |
525 | OP_IMPL(short, >>) | |
526 | OP_IMPL(unsigned short, <<) | |
527 | OP_IMPL(unsigned short, >>) | |
528 | #undef OP_IMPL | |
529 | ||
c017a39f | 530 | template<typename T> Vc_ALWAYS_INLINE Vector<T> &Vector<T>::operator>>=(int shift) { |
f22341db | 531 | d.v() = VectorHelper<T>::shiftRight(d.v(), shift); |
532 | return *static_cast<Vector<T> *>(this); | |
533 | } | |
c017a39f | 534 | template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T> Vector<T>::operator>>(int shift) const { |
f22341db | 535 | return VectorHelper<T>::shiftRight(d.v(), shift); |
536 | } | |
c017a39f | 537 | template<typename T> Vc_ALWAYS_INLINE Vector<T> &Vector<T>::operator<<=(int shift) { |
f22341db | 538 | d.v() = VectorHelper<T>::shiftLeft(d.v(), shift); |
539 | return *static_cast<Vector<T> *>(this); | |
540 | } | |
c017a39f | 541 | template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T> Vector<T>::operator<<(int shift) const { |
f22341db | 542 | return VectorHelper<T>::shiftLeft(d.v(), shift); |
543 | } | |
544 | ||
545 | #define OP_IMPL(T, symbol, fun) \ | |
c017a39f | 546 | template<> Vc_ALWAYS_INLINE Vector<T> &Vector<T>::operator symbol##=(AsArg x) { d.v() = HV::fun(d.v(), x.d.v()); return *this; } \ |
547 | template<> Vc_ALWAYS_INLINE Vc_PURE Vector<T> Vector<T>::operator symbol(AsArg x) const { return Vector<T>(HV::fun(d.v(), x.d.v())); } | |
f22341db | 548 | OP_IMPL(int, &, and_) |
549 | OP_IMPL(int, |, or_) | |
550 | OP_IMPL(int, ^, xor_) | |
551 | OP_IMPL(unsigned int, &, and_) | |
552 | OP_IMPL(unsigned int, |, or_) | |
553 | OP_IMPL(unsigned int, ^, xor_) | |
554 | OP_IMPL(short, &, and_) | |
555 | OP_IMPL(short, |, or_) | |
556 | OP_IMPL(short, ^, xor_) | |
557 | OP_IMPL(unsigned short, &, and_) | |
558 | OP_IMPL(unsigned short, |, or_) | |
559 | OP_IMPL(unsigned short, ^, xor_) | |
560 | OP_IMPL(float, &, and_) | |
561 | OP_IMPL(float, |, or_) | |
562 | OP_IMPL(float, ^, xor_) | |
563 | OP_IMPL(sfloat, &, and_) | |
564 | OP_IMPL(sfloat, |, or_) | |
565 | OP_IMPL(sfloat, ^, xor_) | |
566 | OP_IMPL(double, &, and_) | |
567 | OP_IMPL(double, |, or_) | |
568 | OP_IMPL(double, ^, xor_) | |
569 | #undef OP_IMPL | |
570 | ||
571 | // operators {{{1 | |
572 | #include "../common/operators.h" | |
c017a39f | 573 | // isNegative {{{1 |
574 | template<> Vc_INTRINSIC Vc_PURE float_m float_v::isNegative() const | |
575 | { | |
576 | return avx_cast<m256>(_mm256_srai_epi32(avx_cast<m256i>(_mm256_and_ps(_mm256_setsignmask_ps(), d.v())), 31)); | |
577 | } | |
578 | template<> Vc_INTRINSIC Vc_PURE sfloat_m sfloat_v::isNegative() const | |
579 | { | |
580 | return avx_cast<m256>(_mm256_srai_epi32(avx_cast<m256i>(_mm256_and_ps(_mm256_setsignmask_ps(), d.v())), 31)); | |
581 | } | |
582 | template<> Vc_INTRINSIC Vc_PURE double_m double_v::isNegative() const | |
583 | { | |
584 | return Mem::permute<X1, X1, X3, X3>(avx_cast<m256>( | |
585 | _mm256_srai_epi32(avx_cast<m256i>(_mm256_and_pd(_mm256_setsignmask_pd(), d.v())), 31) | |
586 | )); | |
587 | } | |
f22341db | 588 | // gathers {{{1 |
589 | // Better implementation (hopefully) with _mm256_set_ | |
590 | //X template<typename T> template<typename Index> Vector<T>::Vector(const EntryType *mem, const Index *indexes) | |
591 | //X { | |
592 | //X for_all_vector_entries(int i, | |
593 | //X d.m(i) = mem[indexes[i]]; | |
594 | //X ); | |
595 | //X } | |
// Gathering constructors. Each one simply forwards to the matching
// gather() overload below. The masked variants value-initialize the
// vector to zero first, so entries not selected by the mask hold a
// defined value instead of garbage.
template<typename T> template<typename IndexT> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *mem, const IndexT *indexes)
{
    gather(mem, indexes);
}
template<typename T> template<typename IndexT> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector<IndexT>) indexes)
{
    gather(mem, indexes);
}

template<typename T> template<typename IndexT> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *mem, const IndexT *indexes, MaskArg mask)
    : d(HT::zero()) // masked-out entries stay zero
{
    gather(mem, indexes, mask);
}

template<typename T> template<typename IndexT> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector<IndexT>) indexes, MaskArg mask)
    : d(HT::zero())
{
    gather(mem, indexes, mask);
}

// Struct-member gathers: read array[indexes[i]].*member1 (optionally
// .*member2 one level deeper) for every entry.
template<typename T> template<typename S1, typename IT> Vc_ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
{
    gather(array, member1, indexes);
}
template<typename T> template<typename S1, typename IT> Vc_ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask)
    : d(HT::zero())
{
    gather(array, member1, indexes, mask);
}
template<typename T> template<typename S1, typename S2, typename IT> Vc_ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
{
    gather(array, member1, member2, indexes);
}
template<typename T> template<typename S1, typename S2, typename IT> Vc_ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask)
    : d(HT::zero())
{
    gather(array, member1, member2, indexes, mask);
}
// Two-level (pointer-member) gathers: read
// (array[outerIndexes[i]].*ptrMember1)[innerIndexes[i]] for every entry.
template<typename T> template<typename S1, typename IT1, typename IT2> Vc_ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
{
    gather(array, ptrMember1, outerIndexes, innerIndexes);
}
template<typename T> template<typename S1, typename IT1, typename IT2> Vc_ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask)
    : d(HT::zero())
{
    gather(array, ptrMember1, outerIndexes, innerIndexes, mask);
}
644 | ||
// Compile-time guard: when the index argument is itself a Vc vector, it
// must provide at least as many entries as this vector needs. For plain
// pointer/array index types the primary template checks nothing.
template<typename T, size_t Size> struct IndexSizeChecker { static void check() {} };
template<typename T, size_t Size> struct IndexSizeChecker<Vector<T>, Size>
{
    static void check() {
        VC_STATIC_ASSERT(Vector<T>::Size >= Size, IndexVector_must_have_greater_or_equal_number_of_entries);
    }
};
// Unmasked gathers from flat memory: load mem[indexes[i]] into entry i.
// Implemented with the _mm(256)_setr_* intrinsics, i.e. scalar loads
// combined into one register (AVX1 has no hardware gather).
template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<double>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
{
    IndexSizeChecker<Index, Size>::check();
    d.v() = _mm256_setr_pd(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]);
}
template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<float>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
{
    IndexSizeChecker<Index, Size>::check();
    d.v() = _mm256_setr_ps(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]],
            mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
}
template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<sfloat>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
{
    IndexSizeChecker<Index, Size>::check();
    d.v() = _mm256_setr_ps(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]],
            mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
}
template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<int>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
{
    IndexSizeChecker<Index, Size>::check();
    d.v() = _mm256_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]],
            mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
}
template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned int>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
{
    IndexSizeChecker<Index, Size>::check();
    d.v() = _mm256_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]],
            mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
}
// (u)short vectors use a 128-bit register, hence the _mm_ intrinsic.
template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<short>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
{
    IndexSizeChecker<Index, Size>::check();
    d.v() = _mm_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]],
            mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
}
template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned short>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
{
    IndexSizeChecker<Index, Size>::check();
    d.v() = _mm_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]],
            mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
}
693 | ||
#ifdef VC_USE_SET_GATHERS
// Alternative masked gather: zero the indexes of inactive lanes and do a
// full (unmasked) gather, then blend the result in through the mask.
// NOTE(review): this assumes mem[0] is readable, since masked-out lanes
// still load from index 0.
template<typename T> template<typename IT> Vc_ALWAYS_INLINE void Vector<T>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector<IT>) indexes, MaskArg mask)
{
    IndexSizeChecker<Vector<IT>, Size>::check();
    Vector<IT> indexesTmp = indexes;
    indexesTmp.setZero(!mask);
    (*this)(mask) = Vector<T>(mem, indexesTmp);
}
#endif
703 | ||
// VC_MASKED_GATHER expands to the loop body of a masked gather. The
// caller defines ith_value(i) as the scalar expression to load for lane
// i. Three strategies, selected at configure time:
#ifdef VC_USE_BSF_GATHERS
// 1) iterate over the set mask bits with bit-scan-forward
#define VC_MASKED_GATHER \
    int bits = mask.toInt(); \
    while (bits) { \
        const int i = _bit_scan_forward(bits); \
        bits &= ~(1 << i); /* btr? */ \
        d.m(i) = ith_value(i); \
    } 
#elif defined(VC_USE_POPCNT_BSF_GATHERS)
// 2) popcount-dispatched unrolled loop, alternating bit-scan-reverse /
//    bit-scan-forward. The case labels fall through DELIBERATELY: a
//    popcount of N executes exactly N load steps.
#define VC_MASKED_GATHER \
    unsigned int bits = mask.toInt(); \
    unsigned int low, high = 0; \
    switch (_mm_popcnt_u32(bits)) { \
    case 8: \
        high = _bit_scan_reverse(bits); \
        d.m(high) = ith_value(high); \
        high = (1 << high); \
    case 7: \
        low = _bit_scan_forward(bits); \
        bits ^= high | (1 << low); \
        d.m(low) = ith_value(low); \
    case 6: \
        high = _bit_scan_reverse(bits); \
        d.m(high) = ith_value(high); \
        high = (1 << high); \
    case 5: \
        low = _bit_scan_forward(bits); \
        bits ^= high | (1 << low); \
        d.m(low) = ith_value(low); \
    case 4: \
        high = _bit_scan_reverse(bits); \
        d.m(high) = ith_value(high); \
        high = (1 << high); \
    case 3: \
        low = _bit_scan_forward(bits); \
        bits ^= high | (1 << low); \
        d.m(low) = ith_value(low); \
    case 2: \
        high = _bit_scan_reverse(bits); \
        d.m(high) = ith_value(high); \
    case 1: \
        low = _bit_scan_forward(bits); \
        d.m(low) = ith_value(low); \
    case 0: \
        break; \
    }
#else
// 3) portable fallback: test every lane (with an early exit for an
//    all-false mask)
#define VC_MASKED_GATHER \
    if (mask.isEmpty()) { \
        return; \
    } \
    for_all_vector_entries(i, \
            if (mask[i]) d.m(i) = ith_value(i); \
            );
#endif
759 | ||
// Masked gather from flat memory: for every lane selected by mask, load
// mem[indexes[i]]; unselected lanes are left untouched.
template<typename T> template<typename Index>
Vc_INTRINSIC void Vector<T>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes, MaskArg mask)
{
    IndexSizeChecker<Index, Size>::check();
// ith_value parameterizes the VC_MASKED_GATHER loop body (see above)
#define ith_value(_i_) (mem[indexes[_i_]])
    VC_MASKED_GATHER
#undef ith_value
}
768 | ||
// Struct-member gathers: load array[indexes[i]].*member1 into lane i.
// One explicit specialization per entry type (scalar loads combined via
// _mm(256)_setr_*), plus a generic masked version at the end.
template<> template<typename S1, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<double>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm256_setr_pd(array[indexes[0]].*(member1), array[indexes[1]].*(member1),
            array[indexes[2]].*(member1), array[indexes[3]].*(member1));
}
template<> template<typename S1, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<float>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm256_setr_ps(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
            array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1),
            array[indexes[6]].*(member1), array[indexes[7]].*(member1));
}
template<> template<typename S1, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<sfloat>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm256_setr_ps(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
            array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1),
            array[indexes[6]].*(member1), array[indexes[7]].*(member1));
}
template<> template<typename S1, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<int>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm256_setr_epi32(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
            array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1),
            array[indexes[6]].*(member1), array[indexes[7]].*(member1));
}
template<> template<typename S1, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned int>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm256_setr_epi32(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
            array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1),
            array[indexes[6]].*(member1), array[indexes[7]].*(member1));
}
template<> template<typename S1, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<short>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm_setr_epi16(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
            array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1),
            array[indexes[6]].*(member1), array[indexes[7]].*(member1));
}
template<> template<typename S1, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned short>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm_setr_epi16(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
            array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1),
            array[indexes[6]].*(member1), array[indexes[7]].*(member1));
}
// Masked struct-member gather: only lanes selected by mask are loaded.
template<typename T> template<typename S1, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask)
{
    IndexSizeChecker<IT, Size>::check();
#define ith_value(_i_) (array[indexes[_i_]].*(member1))
    VC_MASKED_GATHER
#undef ith_value
}
// Two-level struct-member gathers: load
// array[indexes[i]].*member1.*member2 into lane i.
template<> template<typename S1, typename S2, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<double>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm256_setr_pd(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2),
            array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2));
}
template<> template<typename S1, typename S2, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<float>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm256_setr_ps(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
            array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
            array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
}
template<> template<typename S1, typename S2, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<sfloat>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm256_setr_ps(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
            array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
            array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
}
template<> template<typename S1, typename S2, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<int>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm256_setr_epi32(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
            array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
            array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
}
template<> template<typename S1, typename S2, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned int>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm256_setr_epi32(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
            array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
            array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
}
template<> template<typename S1, typename S2, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<short>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm_setr_epi16(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
            array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
            array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
}
template<> template<typename S1, typename S2, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned short>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm_setr_epi16(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
            array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
            array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
}
// Masked two-level gather: only lanes selected by mask are loaded.
template<typename T> template<typename S1, typename S2, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask)
{
    IndexSizeChecker<IT, Size>::check();
#define ith_value(_i_) (array[indexes[_i_]].*(member1).*(member2))
    VC_MASKED_GATHER
#undef ith_value
}
// Pointer-member gathers: load
// (array[outerIndexes[i]].*ptrMember1)[innerIndexes[i]] into lane i,
// i.e. an indirection through a pointer member of the outer struct.
template<> template<typename S1, typename IT1, typename IT2>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<double>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
{
    IndexSizeChecker<IT1, Size>::check();
    IndexSizeChecker<IT2, Size>::check();
    d.v() = _mm256_setr_pd((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
            (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]);
}
template<> template<typename S1, typename IT1, typename IT2>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<float>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
{
    IndexSizeChecker<IT1, Size>::check();
    IndexSizeChecker<IT2, Size>::check();
    d.v() = _mm256_setr_ps((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
            (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]],
            (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
            (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
}
template<> template<typename S1, typename IT1, typename IT2>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<sfloat>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
{
    IndexSizeChecker<IT1, Size>::check();
    IndexSizeChecker<IT2, Size>::check();
    d.v() = _mm256_setr_ps((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
            (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]],
            (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
            (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
}
template<> template<typename S1, typename IT1, typename IT2>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<int>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
{
    IndexSizeChecker<IT1, Size>::check();
    IndexSizeChecker<IT2, Size>::check();
    d.v() = _mm256_setr_epi32((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
            (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]],
            (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
            (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
}
template<> template<typename S1, typename IT1, typename IT2>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned int>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
{
    IndexSizeChecker<IT1, Size>::check();
    IndexSizeChecker<IT2, Size>::check();
    d.v() = _mm256_setr_epi32((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
            (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]],
            (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
            (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
}
template<> template<typename S1, typename IT1, typename IT2>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<short>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
{
    IndexSizeChecker<IT1, Size>::check();
    IndexSizeChecker<IT2, Size>::check();
    d.v() = _mm_setr_epi16((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
            (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]],
            (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
            (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
}
template<> template<typename S1, typename IT1, typename IT2>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned short>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
{
    IndexSizeChecker<IT1, Size>::check();
    IndexSizeChecker<IT2, Size>::check();
    d.v() = _mm_setr_epi16((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
            (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]],
            (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
            (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
}
// Masked pointer-member gather: only lanes selected by mask are loaded.
template<typename T> template<typename S1, typename IT1, typename IT2>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask)
{
    IndexSizeChecker<IT1, Size>::check();
    IndexSizeChecker<IT2, Size>::check();
#define ith_value(_i_) (array[outerIndexes[_i_]].*(ptrMember1))[innerIndexes[_i_]]
    VC_MASKED_GATHER
#undef ith_value
}
972 | ||
#undef VC_MASKED_GATHER
// VC_MASKED_SCATTER mirrors VC_MASKED_GATHER for stores: the caller
// defines ith_value(i) as the scalar lvalue to store lane i into.
#ifdef VC_USE_BSF_SCATTERS
// 1) iterate over the set mask bits with bit-scan-forward
#define VC_MASKED_SCATTER \
    int bits = mask.toInt(); \
    while (bits) { \
        const int i = _bit_scan_forward(bits); \
        bits ^= (1 << i); /* btr? */ \
        ith_value(i) = d.m(i); \
    }
#elif defined(VC_USE_POPCNT_BSF_SCATTERS)
// 2) popcount-dispatched unrolled loop; case labels fall through
//    DELIBERATELY, executing exactly popcount(bits) store steps.
#define VC_MASKED_SCATTER \
    unsigned int bits = mask.toInt(); \
    unsigned int low, high = 0; \
    switch (_mm_popcnt_u32(bits)) { \
    case 8: \
        high = _bit_scan_reverse(bits); \
        ith_value(high) = d.m(high); \
        high = (1 << high); \
    case 7: \
        low = _bit_scan_forward(bits); \
        bits ^= high | (1 << low); \
        ith_value(low) = d.m(low); \
    case 6: \
        high = _bit_scan_reverse(bits); \
        ith_value(high) = d.m(high); \
        high = (1 << high); \
    case 5: \
        low = _bit_scan_forward(bits); \
        bits ^= high | (1 << low); \
        ith_value(low) = d.m(low); \
    case 4: \
        high = _bit_scan_reverse(bits); \
        ith_value(high) = d.m(high); \
        high = (1 << high); \
    case 3: \
        low = _bit_scan_forward(bits); \
        bits ^= high | (1 << low); \
        ith_value(low) = d.m(low); \
    case 2: \
        high = _bit_scan_reverse(bits); \
        ith_value(high) = d.m(high); \
    case 1: \
        low = _bit_scan_forward(bits); \
        ith_value(low) = d.m(low); \
    case 0: \
        break; \
    }
#else
// 3) portable fallback: test every lane (early exit for all-false mask)
#define VC_MASKED_SCATTER \
    if (mask.isEmpty()) { \
        return; \
    } \
    for_all_vector_entries(i, \
            if (mask[i]) ith_value(i) = d.m(i); \
            );
#endif
1029 | ||
// Unmasked scatter to flat memory: store lane i to mem[indexes[i]].
// Indexes must be distinct for the result to be well defined.
template<typename T> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) const
{
    for_all_vector_entries(i,
            mem[indexes[i]] = d.m(i);
            );
}
#if defined(VC_MSVC) && VC_MSVC >= 170000000
// MSVC miscompiles the store mem[indexes[1]] = d.m(1) for T = (u)short
// Workaround: read lanes 0 and 1 together as one 32-bit word via the
// MSVC-specific m128i_u32 accessor, extract the remaining lanes with
// _mm_extract_epi16.
template<> template<typename Index> Vc_ALWAYS_INLINE void short_v::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) const
{
    const unsigned int tmp = d.v()._d.m128i_u32[0];
    mem[indexes[0]] = tmp & 0xffff;
    mem[indexes[1]] = tmp >> 16;
    mem[indexes[2]] = _mm_extract_epi16(d.v(), 2);
    mem[indexes[3]] = _mm_extract_epi16(d.v(), 3);
    mem[indexes[4]] = _mm_extract_epi16(d.v(), 4);
    mem[indexes[5]] = _mm_extract_epi16(d.v(), 5);
    mem[indexes[6]] = _mm_extract_epi16(d.v(), 6);
    mem[indexes[7]] = _mm_extract_epi16(d.v(), 7);
}
template<> template<typename Index> Vc_ALWAYS_INLINE void ushort_v::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) const
{
    const unsigned int tmp = d.v()._d.m128i_u32[0];
    mem[indexes[0]] = tmp & 0xffff;
    mem[indexes[1]] = tmp >> 16;
    mem[indexes[2]] = _mm_extract_epi16(d.v(), 2);
    mem[indexes[3]] = _mm_extract_epi16(d.v(), 3);
    mem[indexes[4]] = _mm_extract_epi16(d.v(), 4);
    mem[indexes[5]] = _mm_extract_epi16(d.v(), 5);
    mem[indexes[6]] = _mm_extract_epi16(d.v(), 6);
    mem[indexes[7]] = _mm_extract_epi16(d.v(), 7);
}
#endif
// Masked scatter to flat memory: only lanes selected by mask are stored.
template<typename T> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes, MaskArg mask) const
{
#define ith_value(_i_) mem[indexes[_i_]]
    VC_MASKED_SCATTER
#undef ith_value
}
// Struct-member scatter: store lane i to array[indexes[i]].*member1.
template<typename T> template<typename S1, typename IT> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(S1 *array, EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) const
{
    for_all_vector_entries(i,
            array[indexes[i]].*(member1) = d.m(i);
            );
}
// Masked struct-member scatter.
template<typename T> template<typename S1, typename IT> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(S1 *array, EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) const
{
#define ith_value(_i_) array[indexes[_i_]].*(member1)
    VC_MASKED_SCATTER
#undef ith_value
}
// Two-level struct-member scatter: store lane i to
// array[indexes[i]].*member1.*member2.
template<typename T> template<typename S1, typename S2, typename IT> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) const
{
    for_all_vector_entries(i,
            array[indexes[i]].*(member1).*(member2) = d.m(i);
            );
}
// Masked two-level struct-member scatter.
template<typename T> template<typename S1, typename S2, typename IT> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) const
{
#define ith_value(_i_) array[indexes[_i_]].*(member1).*(member2)
    VC_MASKED_SCATTER
#undef ith_value
}
c017a39f | 1093 | template<typename T> template<typename S1, typename IT1, typename IT2> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(S1 *array, EntryType *S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) const |
f22341db | 1094 | { |
1095 | for_all_vector_entries(i, | |
1096 | (array[innerIndexes[i]].*(ptrMember1))[outerIndexes[i]] = d.m(i); | |
1097 | ); | |
1098 | } | |
// Masked variant of the pointer-member scatter: lanes selected by 'mask' are
// written to (array[outerIndexes[i]].*ptrMember1)[innerIndexes[i]].
template<typename T> template<typename S1, typename IT1, typename IT2> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(S1 *array, EntryType *S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask) const
{
#define ith_value(_i_) (array[outerIndexes[_i_]].*(ptrMember1))[innerIndexes[_i_]]
    VC_MASKED_SCATTER
#undef ith_value
}
1105 | ||
1106 | /////////////////////////////////////////////////////////////////////////////////////////// | |
1107 | // operator- {{{1 | |
c017a39f | 1108 | template<> Vc_ALWAYS_INLINE Vector<double> Vc_PURE Vc_FLATTEN Vector<double>::operator-() const |
f22341db | 1109 | { |
1110 | return _mm256_xor_pd(d.v(), _mm256_setsignmask_pd()); | |
1111 | } | |
c017a39f | 1112 | template<> Vc_ALWAYS_INLINE Vector<float> Vc_PURE Vc_FLATTEN Vector<float>::operator-() const |
f22341db | 1113 | { |
1114 | return _mm256_xor_ps(d.v(), _mm256_setsignmask_ps()); | |
1115 | } | |
c017a39f | 1116 | template<> Vc_ALWAYS_INLINE Vector<sfloat> Vc_PURE Vc_FLATTEN Vector<sfloat>::operator-() const |
f22341db | 1117 | { |
1118 | return _mm256_xor_ps(d.v(), _mm256_setsignmask_ps()); | |
1119 | } | |
c017a39f | 1120 | template<> Vc_ALWAYS_INLINE Vector<int> Vc_PURE Vc_FLATTEN Vector<int>::operator-() const |
f22341db | 1121 | { |
1122 | return _mm256_sign_epi32(d.v(), _mm256_setallone_si256()); | |
1123 | } | |
c017a39f | 1124 | template<> Vc_ALWAYS_INLINE Vector<int> Vc_PURE Vc_FLATTEN Vector<unsigned int>::operator-() const |
f22341db | 1125 | { |
1126 | return _mm256_sign_epi32(d.v(), _mm256_setallone_si256()); | |
1127 | } | |
c017a39f | 1128 | template<> Vc_ALWAYS_INLINE Vector<short> Vc_PURE Vc_FLATTEN Vector<short>::operator-() const |
f22341db | 1129 | { |
1130 | return _mm_sign_epi16(d.v(), _mm_setallone_si128()); | |
1131 | } | |
c017a39f | 1132 | template<> Vc_ALWAYS_INLINE Vector<short> Vc_PURE Vc_FLATTEN Vector<unsigned short>::operator-() const |
f22341db | 1133 | { |
1134 | return _mm_sign_epi16(d.v(), _mm_setallone_si128()); | |
1135 | } | |
1136 | ||
1137 | /////////////////////////////////////////////////////////////////////////////////////////// | |
1138 | // horizontal ops {{{1 | |
c017a39f | 1139 | template<typename T> Vc_ALWAYS_INLINE typename Vector<T>::EntryType Vector<T>::min(MaskArg m) const |
f22341db | 1140 | { |
1141 | Vector<T> tmp = std::numeric_limits<Vector<T> >::max(); | |
1142 | tmp(m) = *this; | |
1143 | return tmp.min(); | |
1144 | } | |
c017a39f | 1145 | template<typename T> Vc_ALWAYS_INLINE typename Vector<T>::EntryType Vector<T>::max(MaskArg m) const |
f22341db | 1146 | { |
1147 | Vector<T> tmp = std::numeric_limits<Vector<T> >::min(); | |
1148 | tmp(m) = *this; | |
1149 | return tmp.max(); | |
1150 | } | |
c017a39f | 1151 | template<typename T> Vc_ALWAYS_INLINE typename Vector<T>::EntryType Vector<T>::product(MaskArg m) const |
f22341db | 1152 | { |
1153 | Vector<T> tmp(VectorSpecialInitializerOne::One); | |
1154 | tmp(m) = *this; | |
1155 | return tmp.product(); | |
1156 | } | |
c017a39f | 1157 | template<typename T> Vc_ALWAYS_INLINE typename Vector<T>::EntryType Vector<T>::sum(MaskArg m) const |
f22341db | 1158 | { |
1159 | Vector<T> tmp(VectorSpecialInitializerZero::Zero); | |
1160 | tmp(m) = *this; | |
1161 | return tmp.sum(); | |
1162 | }//}}} | |
1163 | // copySign {{{1 | |
c017a39f | 1164 | template<> Vc_INTRINSIC Vector<float> Vector<float>::copySign(Vector<float>::AsArg reference) const |
f22341db | 1165 | { |
1166 | return _mm256_or_ps( | |
1167 | _mm256_and_ps(reference.d.v(), _mm256_setsignmask_ps()), | |
1168 | _mm256_and_ps(d.v(), _mm256_setabsmask_ps()) | |
1169 | ); | |
1170 | } | |
c017a39f | 1171 | template<> Vc_INTRINSIC Vector<sfloat> Vector<sfloat>::copySign(Vector<sfloat>::AsArg reference) const |
f22341db | 1172 | { |
1173 | return _mm256_or_ps( | |
1174 | _mm256_and_ps(reference.d.v(), _mm256_setsignmask_ps()), | |
1175 | _mm256_and_ps(d.v(), _mm256_setabsmask_ps()) | |
1176 | ); | |
1177 | } | |
c017a39f | 1178 | template<> Vc_INTRINSIC Vector<double> Vector<double>::copySign(Vector<double>::AsArg reference) const |
f22341db | 1179 | { |
1180 | return _mm256_or_pd( | |
1181 | _mm256_and_pd(reference.d.v(), _mm256_setsignmask_pd()), | |
1182 | _mm256_and_pd(d.v(), _mm256_setabsmask_pd()) | |
1183 | ); | |
1184 | }//}}}1 | |
1185 | // exponent {{{1 | |
// Per-lane exponent extraction, delegated to Internal::exponent.
// Precondition (asserted in debug builds): every lane must be >= 0.
template<> Vc_INTRINSIC Vector<float> Vector<float>::exponent() const
{
    VC_ASSERT((*this >= 0.f).isFull());
    return Internal::exponent(d.v());
}
// sfloat variant of exponent(); same non-negativity precondition.
template<> Vc_INTRINSIC Vector<sfloat> Vector<sfloat>::exponent() const
{
    VC_ASSERT((*this >= 0.f).isFull());
    return Internal::exponent(d.v());
}
// double variant of exponent(); same non-negativity precondition.
template<> Vc_INTRINSIC Vector<double> Vector<double>::exponent() const
{
    VC_ASSERT((*this >= 0.).isFull());
    return Internal::exponent(d.v());
}
1201 | // }}}1 | |
1202 | // Random {{{1 | |
// Advances the shared RNG state in Vc::RandomState by one step.
// On return, state0 and state1 hold the state values as loaded (i.e. the
// values BEFORE this step); the advanced values are stored back to memory.
// Both halves are advanced with the LCG x*0xdeece66d + 11; the first half
// additionally mixes in the high 16 bits of the old second half.
static Vc_ALWAYS_INLINE void _doRandomStep(Vector<unsigned int> &state0,
        Vector<unsigned int> &state1)
{
    state0.load(&Vc::RandomState[0]);
    state1.load(&Vc::RandomState[uint_v::Size]);
    // advance the second half independently
    (state1 * 0xdeece66du + 11).store(&Vc::RandomState[uint_v::Size]);
    // advance the first half and xor in bits of the old second half
    uint_v(_mm256_xor_si256((state0 * 0xdeece66du + 11).data(), _mm256_srli_epi32(state1.data(), 16))).store(&Vc::RandomState[0]);
}
1211 | ||
c017a39f | 1212 | template<typename T> Vc_ALWAYS_INLINE Vector<T> Vector<T>::Random() |
f22341db | 1213 | { |
1214 | Vector<unsigned int> state0, state1; | |
1215 | _doRandomStep(state0, state1); | |
1216 | return state0.reinterpretCast<Vector<T> >(); | |
1217 | } | |
1218 | ||
// Random floats in [0, 1): shift random bits into the mantissa field, OR
// with 1.0f's bit pattern to get a value in [1, 2), then subtract 1.
template<> Vc_ALWAYS_INLINE Vector<float> Vector<float>::Random()
{
    Vector<unsigned int> state0, state1;
    _doRandomStep(state0, state1);
    return HT::sub(HV::or_(_cast(_mm256_srli_epi32(state0.data(), 2)), HT::one()), HT::one());
}
1225 | ||
// sfloat variant of Random(): identical bit trick to the float version —
// map random bits onto [1, 2) and subtract 1 to get [0, 1).
template<> Vc_ALWAYS_INLINE Vector<sfloat> Vector<sfloat>::Random()
{
    Vector<unsigned int> state0, state1;
    _doRandomStep(state0, state1);
    return HT::sub(HV::or_(_cast(_mm256_srli_epi32(state0.data(), 2)), HT::one()), HT::one());
}
1232 | ||
// Random doubles in [0, 1). Snapshots the current 256-bit state, then
// advances the state as four 64-bit LCG streams (x * 0x5deece66d + 11 —
// the drand48 multiplier), accessing the uint state array through a
// may-alias 64-bit typedef. The snapshot's top bits become the mantissa of
// a double in [1, 2) (OR with 1.0), and subtracting 1 maps onto [0, 1).
template<> Vc_ALWAYS_INLINE Vector<double> Vector<double>::Random()
{
    const m256i state = VectorHelper<m256i>::load(&Vc::RandomState[0], Vc::Aligned);
    for (size_t k = 0; k < 8; k += 2) {
        typedef unsigned long long uint64 Vc_MAY_ALIAS;
        const uint64 stateX = *reinterpret_cast<const uint64 *>(&Vc::RandomState[k]);
        *reinterpret_cast<uint64 *>(&Vc::RandomState[k]) = (stateX * 0x5deece66dull + 11);
    }
    return (Vector<double>(_cast(_mm256_srli_epi64(state, 12))) | One()) - One();
}
1243 | // }}}1 | |
c017a39f | 1244 | // shifted / rotated {{{1 |
1245 | template<size_t SIMDWidth, size_t Size, typename VectorType, typename EntryType> struct VectorShift; | |
1246 | template<> struct VectorShift<32, 4, m256d, double> | |
1247 | { | |
1248 | static Vc_INTRINSIC m256d shifted(param256d v, int amount) | |
1249 | { | |
1250 | switch (amount) { | |
1251 | case 0: return v; | |
1252 | case 1: return avx_cast<m256d>(_mm256_srli_si256(avx_cast<m256i>(v), 1 * sizeof(double))); | |
1253 | case 2: return avx_cast<m256d>(_mm256_srli_si256(avx_cast<m256i>(v), 2 * sizeof(double))); | |
1254 | case 3: return avx_cast<m256d>(_mm256_srli_si256(avx_cast<m256i>(v), 3 * sizeof(double))); | |
1255 | case -1: return avx_cast<m256d>(_mm256_slli_si256(avx_cast<m256i>(v), 1 * sizeof(double))); | |
1256 | case -2: return avx_cast<m256d>(_mm256_slli_si256(avx_cast<m256i>(v), 2 * sizeof(double))); | |
1257 | case -3: return avx_cast<m256d>(_mm256_slli_si256(avx_cast<m256i>(v), 3 * sizeof(double))); | |
1258 | } | |
1259 | return _mm256_setzero_pd(); | |
1260 | } | |
1261 | }; | |
// 32-byte AVX register holding 8 elements (float/sfloat/int/uint).
template<typename VectorType, typename EntryType> struct VectorShift<32, 8, VectorType, EntryType>
{
    typedef typename SseVectorType<VectorType>::Type SmallV;
    // Shift elements by 'amount' positions; positive amounts shift via
    // byte-wise right shifts, negative via left shifts. Vacated positions
    // become zero; |amount| >= 8 yields an all-zero vector.
    static Vc_INTRINSIC VectorType shifted(VC_ALIGNED_PARAMETER(VectorType) v, int amount)
    {
        switch (amount) {
        case 0: return v;
        case 1: return avx_cast<VectorType>(_mm256_srli_si256(avx_cast<m256i>(v), 1 * sizeof(EntryType)));
        case 2: return avx_cast<VectorType>(_mm256_srli_si256(avx_cast<m256i>(v), 2 * sizeof(EntryType)));
        case 3: return avx_cast<VectorType>(_mm256_srli_si256(avx_cast<m256i>(v), 3 * sizeof(EntryType)));
        case 4: return avx_cast<VectorType>(_mm256_srli_si256(avx_cast<m256i>(v), 4 * sizeof(EntryType)));
        case 5: return avx_cast<VectorType>(_mm256_srli_si256(avx_cast<m256i>(v), 5 * sizeof(EntryType)));
        case 6: return avx_cast<VectorType>(_mm256_srli_si256(avx_cast<m256i>(v), 6 * sizeof(EntryType)));
        case 7: return avx_cast<VectorType>(_mm256_srli_si256(avx_cast<m256i>(v), 7 * sizeof(EntryType)));
        case -1: return avx_cast<VectorType>(_mm256_slli_si256(avx_cast<m256i>(v), 1 * sizeof(EntryType)));
        case -2: return avx_cast<VectorType>(_mm256_slli_si256(avx_cast<m256i>(v), 2 * sizeof(EntryType)));
        case -3: return avx_cast<VectorType>(_mm256_slli_si256(avx_cast<m256i>(v), 3 * sizeof(EntryType)));
        case -4: return avx_cast<VectorType>(_mm256_slli_si256(avx_cast<m256i>(v), 4 * sizeof(EntryType)));
        case -5: return avx_cast<VectorType>(_mm256_slli_si256(avx_cast<m256i>(v), 5 * sizeof(EntryType)));
        case -6: return avx_cast<VectorType>(_mm256_slli_si256(avx_cast<m256i>(v), 6 * sizeof(EntryType)));
        case -7: return avx_cast<VectorType>(_mm256_slli_si256(avx_cast<m256i>(v), 7 * sizeof(EntryType)));
        }
        return avx_cast<VectorType>(_mm256_setzero_ps());
    }
};
// 16-byte SSE register holding 8 elements (short/ushort).
template<typename VectorType, typename EntryType> struct VectorShift<16, 8, VectorType, EntryType>
{
    enum {
        EntryTypeSizeof = sizeof(EntryType)  // byte shift per element position
    };
    // Shift elements by 'amount' positions via 128-bit byte shifts; vacated
    // positions become zero and |amount| >= 8 yields an all-zero vector.
    static Vc_INTRINSIC VectorType shifted(VC_ALIGNED_PARAMETER(VectorType) v, int amount)
    {
        switch (amount) {
        case 0: return v;
        case 1: return avx_cast<VectorType>(_mm_srli_si128(avx_cast<m128i>(v), 1 * EntryTypeSizeof));
        case 2: return avx_cast<VectorType>(_mm_srli_si128(avx_cast<m128i>(v), 2 * EntryTypeSizeof));
        case 3: return avx_cast<VectorType>(_mm_srli_si128(avx_cast<m128i>(v), 3 * EntryTypeSizeof));
        case 4: return avx_cast<VectorType>(_mm_srli_si128(avx_cast<m128i>(v), 4 * EntryTypeSizeof));
        case 5: return avx_cast<VectorType>(_mm_srli_si128(avx_cast<m128i>(v), 5 * EntryTypeSizeof));
        case 6: return avx_cast<VectorType>(_mm_srli_si128(avx_cast<m128i>(v), 6 * EntryTypeSizeof));
        case 7: return avx_cast<VectorType>(_mm_srli_si128(avx_cast<m128i>(v), 7 * EntryTypeSizeof));
        case -1: return avx_cast<VectorType>(_mm_slli_si128(avx_cast<m128i>(v), 1 * EntryTypeSizeof));
        case -2: return avx_cast<VectorType>(_mm_slli_si128(avx_cast<m128i>(v), 2 * EntryTypeSizeof));
        case -3: return avx_cast<VectorType>(_mm_slli_si128(avx_cast<m128i>(v), 3 * EntryTypeSizeof));
        case -4: return avx_cast<VectorType>(_mm_slli_si128(avx_cast<m128i>(v), 4 * EntryTypeSizeof));
        case -5: return avx_cast<VectorType>(_mm_slli_si128(avx_cast<m128i>(v), 5 * EntryTypeSizeof));
        case -6: return avx_cast<VectorType>(_mm_slli_si128(avx_cast<m128i>(v), 6 * EntryTypeSizeof));
        case -7: return avx_cast<VectorType>(_mm_slli_si128(avx_cast<m128i>(v), 7 * EntryTypeSizeof));
        }
        // NOTE(review): returns m128i regardless of VectorType — relies on an
        // implicit conversion for the instantiated types; verify if reused.
        return _mm_setzero_si128();
    }
};
// Returns a copy of the vector shifted by 'amount' element positions,
// dispatching to the VectorShift specialization matching this vector's
// register width and element count.
template<typename T> Vc_INTRINSIC Vector<T> Vector<T>::shifted(int amount) const
{
    return VectorShift<sizeof(VectorType), Size, VectorType, EntryType>::shifted(d.v(), amount);
}
// Dispatch helper for Vector<T>::rotated(), specialized like VectorShift.
template<size_t SIMDWidth, size_t Size, typename VectorType, typename EntryType> struct VectorRotate;
// 32-byte register holding 4 elements.
template<typename VectorType, typename EntryType> struct VectorRotate<32, 4, VectorType, EntryType>
{
    typedef typename SseVectorType<VectorType>::Type SmallV;
    enum {
        EntryTypeSizeof = sizeof(EntryType)  // byte offset per element position
    };
    // Rotate elements by 'amount' (mod 4) positions, built from alignr on
    // the two 128-bit halves; rotation by 2 is a plain half swap.
    static Vc_INTRINSIC VectorType rotated(VC_ALIGNED_PARAMETER(VectorType) v, int amount)
    {
        const m128i vLo = avx_cast<m128i>(lo128(v));
        const m128i vHi = avx_cast<m128i>(hi128(v));
        switch (static_cast<unsigned int>(amount) % 4) {
        case 0: return v;
        case 1: return concat(avx_cast<SmallV>(_mm_alignr_epi8(vHi, vLo, 1 * EntryTypeSizeof)), avx_cast<SmallV>(_mm_alignr_epi8(vLo, vHi, 1 * EntryTypeSizeof)));
        case 2: return Mem::permute128<X1, X0>(v);
        case 3: return concat(avx_cast<SmallV>(_mm_alignr_epi8(vLo, vHi, 1 * EntryTypeSizeof)), avx_cast<SmallV>(_mm_alignr_epi8(vHi, vLo, 1 * EntryTypeSizeof)));
        }
        return _mm256_setzero_pd();  // unreachable: amount % 4 is always 0..3
    }
};
// 32-byte register holding 8 elements.
template<typename VectorType, typename EntryType> struct VectorRotate<32, 8, VectorType, EntryType>
{
    typedef typename SseVectorType<VectorType>::Type SmallV;
    enum {
        EntryTypeSizeof = sizeof(EntryType)  // byte offset per element position
    };
    // Rotate elements by 'amount' (mod 8) positions via alignr on the two
    // 128-bit halves; rotation by 4 is a plain half swap.
    static Vc_INTRINSIC VectorType rotated(VC_ALIGNED_PARAMETER(VectorType) v, int amount)
    {
        const m128i vLo = avx_cast<m128i>(lo128(v));
        const m128i vHi = avx_cast<m128i>(hi128(v));
        switch (static_cast<unsigned int>(amount) % 8) {
        case 0: return v;
        case 1: return concat(avx_cast<SmallV>(_mm_alignr_epi8(vHi, vLo, 1 * EntryTypeSizeof)), avx_cast<SmallV>(_mm_alignr_epi8(vLo, vHi, 1 * EntryTypeSizeof)));
        case 2: return concat(avx_cast<SmallV>(_mm_alignr_epi8(vHi, vLo, 2 * EntryTypeSizeof)), avx_cast<SmallV>(_mm_alignr_epi8(vLo, vHi, 2 * EntryTypeSizeof)));
        case 3: return concat(avx_cast<SmallV>(_mm_alignr_epi8(vHi, vLo, 3 * EntryTypeSizeof)), avx_cast<SmallV>(_mm_alignr_epi8(vLo, vHi, 3 * EntryTypeSizeof)));
        case 4: return Mem::permute128<X1, X0>(v);
        case 5: return concat(avx_cast<SmallV>(_mm_alignr_epi8(vLo, vHi, 1 * EntryTypeSizeof)), avx_cast<SmallV>(_mm_alignr_epi8(vHi, vLo, 1 * EntryTypeSizeof)));
        case 6: return concat(avx_cast<SmallV>(_mm_alignr_epi8(vLo, vHi, 2 * EntryTypeSizeof)), avx_cast<SmallV>(_mm_alignr_epi8(vHi, vLo, 2 * EntryTypeSizeof)));
        case 7: return concat(avx_cast<SmallV>(_mm_alignr_epi8(vLo, vHi, 3 * EntryTypeSizeof)), avx_cast<SmallV>(_mm_alignr_epi8(vHi, vLo, 3 * EntryTypeSizeof)));
        }
        return avx_cast<VectorType>(_mm256_setzero_ps());  // unreachable
    }
};
// 16-byte SSE register holding 8 elements (short/ushort).
template<typename VectorType, typename EntryType> struct VectorRotate<16, 8, VectorType, EntryType>
{
    enum {
        EntryTypeSizeof = sizeof(EntryType)  // byte offset per element position
    };
    // Rotate elements by 'amount' (mod 8) positions with a single alignr of
    // the register against itself.
    static Vc_INTRINSIC VectorType rotated(VC_ALIGNED_PARAMETER(VectorType) v, int amount)
    {
        switch (static_cast<unsigned int>(amount) % 8) {
        case 0: return v;
        case 1: return avx_cast<VectorType>(_mm_alignr_epi8(v, v, 1 * EntryTypeSizeof));
        case 2: return avx_cast<VectorType>(_mm_alignr_epi8(v, v, 2 * EntryTypeSizeof));
        case 3: return avx_cast<VectorType>(_mm_alignr_epi8(v, v, 3 * EntryTypeSizeof));
        case 4: return avx_cast<VectorType>(_mm_alignr_epi8(v, v, 4 * EntryTypeSizeof));
        case 5: return avx_cast<VectorType>(_mm_alignr_epi8(v, v, 5 * EntryTypeSizeof));
        case 6: return avx_cast<VectorType>(_mm_alignr_epi8(v, v, 6 * EntryTypeSizeof));
        case 7: return avx_cast<VectorType>(_mm_alignr_epi8(v, v, 7 * EntryTypeSizeof));
        }
        return _mm_setzero_si128();  // unreachable: amount % 8 is always 0..7
    }
};
// Returns a copy of the vector rotated by 'amount' element positions
// (mod Size), dispatching to the matching VectorRotate specialization.
// The commented-out code below is the pre-dispatch implementation, kept
// for reference.
template<typename T> Vc_INTRINSIC Vector<T> Vector<T>::rotated(int amount) const
{
    return VectorRotate<sizeof(VectorType), Size, VectorType, EntryType>::rotated(d.v(), amount);
    /*
    const m128i v0 = avx_cast<m128i>(d.v()[0]);
    const m128i v1 = avx_cast<m128i>(d.v()[1]);
    switch (static_cast<unsigned int>(amount) % Size) {
    case 0: return *this;
    case 1: return concat(avx_cast<m128>(_mm_alignr_epi8(v1, v0, 1 * sizeof(EntryType))), avx_cast<m128>(_mm_alignr_epi8(v0, v1, 1 * sizeof(EntryType))));
    case 2: return concat(avx_cast<m128>(_mm_alignr_epi8(v1, v0, 2 * sizeof(EntryType))), avx_cast<m128>(_mm_alignr_epi8(v0, v1, 2 * sizeof(EntryType))));
    case 3: return concat(avx_cast<m128>(_mm_alignr_epi8(v1, v0, 3 * sizeof(EntryType))), avx_cast<m128>(_mm_alignr_epi8(v0, v1, 3 * sizeof(EntryType))));
    case 4: return concat(d.v()[1], d.v()[0]);
    case 5: return concat(avx_cast<m128>(_mm_alignr_epi8(v0, v1, 1 * sizeof(EntryType))), avx_cast<m128>(_mm_alignr_epi8(v1, v0, 1 * sizeof(EntryType))));
    case 6: return concat(avx_cast<m128>(_mm_alignr_epi8(v0, v1, 2 * sizeof(EntryType))), avx_cast<m128>(_mm_alignr_epi8(v1, v0, 2 * sizeof(EntryType))));
    case 7: return concat(avx_cast<m128>(_mm_alignr_epi8(v0, v1, 3 * sizeof(EntryType))), avx_cast<m128>(_mm_alignr_epi8(v1, v0, 3 * sizeof(EntryType))));
    }
    */
}
1399 | // }}}1 | |
f22341db | 1400 | } // namespace AVX |
1401 | } // namespace Vc | |
c017a39f | 1402 | } // namespace AliRoot |
f22341db | 1403 | |
1404 | #include "undomacros.h" | |
1405 | ||
1406 | // vim: foldmethod=marker |