]>
Commit | Line | Data |
---|---|---|
f22341db | 1 | /* This file is part of the Vc library. |
2 | ||
3 | Copyright (C) 2010-2012 Matthias Kretz <kretz@kde.org> | |
4 | ||
5 | Vc is free software: you can redistribute it and/or modify | |
6 | it under the terms of the GNU Lesser General Public License as | |
7 | published by the Free Software Foundation, either version 3 of | |
8 | the License, or (at your option) any later version. | |
9 | ||
10 | Vc is distributed in the hope that it will be useful, but | |
11 | WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 | GNU Lesser General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU Lesser General Public | |
16 | License along with Vc. If not, see <http://www.gnu.org/licenses/>. | |
17 | ||
18 | */ | |
19 | ||
20 | #include "limits.h" | |
21 | #include "../common/bitscanintrinsics.h" | |
22 | #include "macros.h" | |
23 | ||
c017a39f | 24 | namespace AliRoot { |
f22341db | 25 | namespace Vc |
26 | { | |
27 | ALIGN(64) extern unsigned int RandomState[16]; | |
28 | ||
29 | namespace SSE | |
30 | { | |
31 | ||
c017a39f | 32 | template<typename T, int Size> static Vc_ALWAYS_INLINE Vc_CONST const T *_IndexesFromZero() { |
f22341db | 33 | if (Size == 4) { |
34 | return reinterpret_cast<const T *>(_IndexesFromZero4); | |
35 | } else if (Size == 8) { | |
36 | return reinterpret_cast<const T *>(_IndexesFromZero8); | |
37 | } else if (Size == 16) { | |
38 | return reinterpret_cast<const T *>(_IndexesFromZero16); | |
39 | } | |
40 | return 0; | |
41 | } | |
42 | ||
43 | /////////////////////////////////////////////////////////////////////////////////////////// | |
44 | // constants {{{1 | |
c017a39f | 45 | template<typename T> Vc_INTRINSIC Vector<T>::Vector(VectorSpecialInitializerZero::ZEnum) |
f22341db | 46 | : d(VectorHelper<VectorType>::zero()) |
47 | { | |
48 | } | |
49 | ||
c017a39f | 50 | template<typename T> Vc_INTRINSIC Vector<T>::Vector(VectorSpecialInitializerOne::OEnum) |
f22341db | 51 | : d(VectorHelper<T>::one()) |
52 | { | |
53 | } | |
54 | ||
c017a39f | 55 | template<typename T> Vc_INTRINSIC Vector<T>::Vector(VectorSpecialInitializerIndexesFromZero::IEnum) |
f22341db | 56 | : d(VectorHelper<VectorType>::load(_IndexesFromZero<EntryType, Size>(), Aligned)) |
57 | { | |
58 | } | |
59 | ||
c017a39f | 60 | template<typename T> Vc_INTRINSIC Vc_CONST Vector<T> Vector<T>::Zero() |
f22341db | 61 | { |
62 | return VectorHelper<VectorType>::zero(); | |
63 | } | |
64 | ||
c017a39f | 65 | template<typename T> Vc_INTRINSIC Vc_CONST Vector<T> Vector<T>::One() |
f22341db | 66 | { |
67 | return VectorHelper<T>::one(); | |
68 | } | |
69 | ||
c017a39f | 70 | template<typename T> Vc_INTRINSIC Vc_CONST Vector<T> Vector<T>::IndexesFromZero() |
f22341db | 71 | { |
72 | return VectorHelper<VectorType>::load(_IndexesFromZero<EntryType, Size>(), Aligned); | |
73 | } | |
74 | ||
75 | // conversion/casts {{{1 | |
c017a39f | 76 | template<typename T> template<typename OtherT> Vc_INTRINSIC Vector<T>::Vector(const Vector<OtherT> &x) |
f22341db | 77 | : d(StaticCastHelper<OtherT, T>::cast(x.data())) |
78 | { | |
79 | } | |
80 | ||
c017a39f | 81 | template<> template<> Vc_INTRINSIC short_v &Vector<short>::operator=(const ushort_v &x) { |
f22341db | 82 | data() = StaticCastHelper<unsigned short, short>::cast(x.data()); return *this; |
83 | } | |
c017a39f | 84 | template<> template<> Vc_INTRINSIC ushort_v &Vector<unsigned short>::operator=(const short_v &x) { |
f22341db | 85 | data() = StaticCastHelper<short, unsigned short>::cast(x.data()); return *this; |
86 | } | |
c017a39f | 87 | template<> template<> Vc_INTRINSIC int_v &Vector<int>::operator=(const uint_v &x) { |
f22341db | 88 | data() = StaticCastHelper<unsigned int, int>::cast(x.data()); return *this; |
89 | } | |
c017a39f | 90 | template<> template<> Vc_INTRINSIC uint_v &Vector<unsigned int>::operator=(const int_v &x) { |
f22341db | 91 | data() = StaticCastHelper<int, unsigned int>::cast(x.data()); return *this; |
92 | } | |
93 | ||
94 | // broadcasts {{{1 | |
c017a39f | 95 | template<typename T> Vc_INTRINSIC Vector<T>::Vector(EntryType a) |
f22341db | 96 | : d(VectorHelper<T>::set(a)) |
97 | { | |
98 | } | |
99 | ||
100 | /////////////////////////////////////////////////////////////////////////////////////////// | |
101 | // load ctors {{{1 | |
c017a39f | 102 | template<typename T> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *x) { load(x); } |
103 | template<typename T> template<typename A> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *x, A a) { load(x, a); } | |
104 | template<typename T> template<typename OtherT> Vc_ALWAYS_INLINE Vector<T>::Vector(const OtherT *x) { load(x); } | |
105 | template<typename T> template<typename OtherT, typename A> Vc_ALWAYS_INLINE Vector<T>::Vector(const OtherT *x, A a) { load(x, a); } | |
f22341db | 106 | |
107 | /////////////////////////////////////////////////////////////////////////////////////////// | |
108 | // load member functions {{{1 | |
c017a39f | 109 | template<typename T> Vc_INTRINSIC void Vector<T>::load(const EntryType *mem) |
f22341db | 110 | { |
111 | load(mem, Aligned); | |
112 | } | |
113 | ||
c017a39f | 114 | template<typename T> template<typename A> Vc_INTRINSIC void Vector<T>::load(const EntryType *mem, A align) |
f22341db | 115 | { |
116 | d.v() = VectorHelper<VectorType>::load(mem, align); | |
117 | } | |
118 | ||
c017a39f | 119 | template<typename T> template<typename OtherT> Vc_INTRINSIC void Vector<T>::load(const OtherT *mem) |
f22341db | 120 | { |
121 | load(mem, Aligned); | |
122 | } | |
123 | ||
124 | // float8: simply use the float implementation twice {{{2 | |
c017a39f | 125 | template<> template<typename OtherT, typename A> Vc_INTRINSIC void Vector<float8>::load(const OtherT *x, A a) |
f22341db | 126 | { |
127 | d.v() = M256::create( | |
128 | Vector<float>(&x[0], a).data(), | |
129 | Vector<float>(&x[4], a).data() | |
130 | ); | |
131 | } | |
132 | ||
133 | // LoadHelper {{{2 | |
134 | template<typename DstT, typename SrcT, typename Flags> struct LoadHelper; | |
135 | ||
136 | // float {{{2 | |
137 | template<typename Flags> struct LoadHelper<float, double, Flags> { | |
c017a39f | 138 | static Vc_ALWAYS_INLINE Vc_PURE __m128 load(const double *mem, Flags f) |
f22341db | 139 | { |
140 | return _mm_movelh_ps(_mm_cvtpd_ps(VectorHelper<__m128d>::load(&mem[0], f)), | |
141 | _mm_cvtpd_ps(VectorHelper<__m128d>::load(&mem[2], f))); | |
142 | } | |
143 | }; | |
144 | template<typename Flags> struct LoadHelper<float, unsigned int, Flags> { | |
c017a39f | 145 | static Vc_ALWAYS_INLINE Vc_PURE __m128 load(const unsigned int *mem, Flags f) |
f22341db | 146 | { |
147 | return StaticCastHelper<unsigned int, float>::cast(VectorHelper<__m128i>::load(mem, f)); | |
148 | } | |
149 | }; | |
150 | template<typename Flags> struct LoadHelper<float, int, Flags> { | |
c017a39f | 151 | static Vc_ALWAYS_INLINE Vc_PURE __m128 load(const int *mem, Flags f) |
f22341db | 152 | { |
153 | return StaticCastHelper<int, float>::cast(VectorHelper<__m128i>::load(mem, f)); | |
154 | } | |
155 | }; | |
156 | template<typename Flags> struct LoadHelper<float, unsigned short, Flags> { | |
c017a39f | 157 | static Vc_ALWAYS_INLINE Vc_PURE __m128 load(const unsigned short *mem, Flags f) |
f22341db | 158 | { |
159 | return _mm_cvtepi32_ps(LoadHelper<int, unsigned short, Flags>::load(mem, f)); | |
160 | } | |
161 | }; | |
162 | template<typename Flags> struct LoadHelper<float, short, Flags> { | |
c017a39f | 163 | static Vc_ALWAYS_INLINE Vc_PURE __m128 load(const short *mem, Flags f) |
f22341db | 164 | { |
165 | return _mm_cvtepi32_ps(LoadHelper<int, short, Flags>::load(mem, f)); | |
166 | } | |
167 | }; | |
168 | template<typename Flags> struct LoadHelper<float, unsigned char, Flags> { | |
c017a39f | 169 | static Vc_ALWAYS_INLINE Vc_PURE __m128 load(const unsigned char *mem, Flags f) |
f22341db | 170 | { |
171 | return _mm_cvtepi32_ps(LoadHelper<int, unsigned char, Flags>::load(mem, f)); | |
172 | } | |
173 | }; | |
174 | template<typename Flags> struct LoadHelper<float, signed char, Flags> { | |
c017a39f | 175 | static Vc_ALWAYS_INLINE Vc_PURE __m128 load(const signed char *mem, Flags f) |
f22341db | 176 | { |
177 | return _mm_cvtepi32_ps(LoadHelper<int, signed char, Flags>::load(mem, f)); | |
178 | } | |
179 | }; | |
180 | ||
181 | // int {{{2 | |
182 | template<typename Flags> struct LoadHelper<int, unsigned int, Flags> { | |
c017a39f | 183 | static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned int *mem, Flags f) |
f22341db | 184 | { |
185 | return VectorHelper<__m128i>::load(mem, f); | |
186 | } | |
187 | }; | |
188 | // no difference between streaming and alignment, because the | |
189 | // 32/64 bit loads are not available as streaming loads, and can always be unaligned | |
190 | template<typename Flags> struct LoadHelper<int, unsigned short, Flags> { | |
c017a39f | 191 | static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned short *mem, Flags) |
f22341db | 192 | { |
79c86c14 | 193 | return mm_cvtepu16_epi32( _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem))); |
f22341db | 194 | } |
195 | }; | |
196 | template<typename Flags> struct LoadHelper<int, short, Flags> { | |
c017a39f | 197 | static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const short *mem, Flags) |
f22341db | 198 | { |
79c86c14 | 199 | return mm_cvtepi16_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem))); |
f22341db | 200 | } |
201 | }; | |
202 | template<typename Flags> struct LoadHelper<int, unsigned char, Flags> { | |
c017a39f | 203 | static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned char *mem, Flags) |
f22341db | 204 | { |
79c86c14 | 205 | return mm_cvtepu8_epi32(_mm_cvtsi32_si128(*reinterpret_cast<const int *>(mem))); |
f22341db | 206 | } |
207 | }; | |
208 | template<typename Flags> struct LoadHelper<int, signed char, Flags> { | |
c017a39f | 209 | static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const signed char *mem, Flags) |
f22341db | 210 | { |
79c86c14 | 211 | return mm_cvtepi8_epi32(_mm_cvtsi32_si128(*reinterpret_cast<const int *>(mem))); |
f22341db | 212 | } |
213 | }; | |
214 | ||
215 | // unsigned int {{{2 | |
216 | template<typename Flags> struct LoadHelper<unsigned int, unsigned short, Flags> { | |
c017a39f | 217 | static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned short *mem, Flags) |
f22341db | 218 | { |
79c86c14 | 219 | return mm_cvtepu16_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem))); |
f22341db | 220 | } |
221 | }; | |
222 | template<typename Flags> struct LoadHelper<unsigned int, unsigned char, Flags> { | |
c017a39f | 223 | static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned char *mem, Flags) |
f22341db | 224 | { |
79c86c14 | 225 | return mm_cvtepu8_epi32(_mm_cvtsi32_si128(*reinterpret_cast<const int *>(mem))); |
f22341db | 226 | } |
227 | }; | |
228 | ||
229 | // short {{{2 | |
230 | template<typename Flags> struct LoadHelper<short, unsigned short, Flags> { | |
c017a39f | 231 | static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned short *mem, Flags f) |
f22341db | 232 | { |
233 | return VectorHelper<__m128i>::load(mem, f); | |
234 | } | |
235 | }; | |
236 | template<typename Flags> struct LoadHelper<short, unsigned char, Flags> { | |
c017a39f | 237 | static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned char *mem, Flags) |
f22341db | 238 | { |
79c86c14 | 239 | return mm_cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem))); |
f22341db | 240 | } |
241 | }; | |
242 | template<typename Flags> struct LoadHelper<short, signed char, Flags> { | |
c017a39f | 243 | static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const signed char *mem, Flags) |
f22341db | 244 | { |
79c86c14 | 245 | return mm_cvtepi8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem))); |
f22341db | 246 | } |
247 | }; | |
248 | ||
249 | // unsigned short {{{2 | |
250 | template<typename Flags> struct LoadHelper<unsigned short, unsigned char, Flags> { | |
c017a39f | 251 | static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned char *mem, Flags) |
f22341db | 252 | { |
79c86c14 | 253 | return mm_cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem))); |
f22341db | 254 | } |
255 | }; | |
256 | ||
257 | // general load, implemented via LoadHelper {{{2 | |
c017a39f | 258 | template<typename DstT> template<typename SrcT, typename Flags> Vc_INTRINSIC void Vector<DstT>::load(const SrcT *x, Flags f) |
f22341db | 259 | { |
260 | d.v() = LoadHelper<DstT, SrcT, Flags>::load(x, f); | |
261 | } | |
262 | ||
263 | /////////////////////////////////////////////////////////////////////////////////////////// | |
264 | // expand/combine {{{1 | |
c017a39f | 265 | template<typename T> Vc_INTRINSIC Vector<T>::Vector(const Vector<typename CtorTypeHelper<T>::Type> *a) |
f22341db | 266 | : d(VectorHelper<T>::concat(a[0].data(), a[1].data())) |
267 | { | |
268 | } | |
269 | ||
270 | template<typename T> inline void Vector<T>::expand(Vector<typename ExpandTypeHelper<T>::Type> *x) const | |
271 | { | |
272 | if (Size == 8u) { | |
273 | x[0].data() = VectorHelper<T>::expand0(data()); | |
274 | x[1].data() = VectorHelper<T>::expand1(data()); | |
275 | } | |
276 | } | |
277 | ||
278 | /////////////////////////////////////////////////////////////////////////////////////////// | |
279 | // zeroing {{{1 | |
c017a39f | 280 | template<typename T> Vc_INTRINSIC void Vector<T>::setZero() |
f22341db | 281 | { |
282 | data() = VectorHelper<VectorType>::zero(); | |
283 | } | |
284 | ||
c017a39f | 285 | template<typename T> Vc_INTRINSIC void Vector<T>::setZero(const Mask &k) |
f22341db | 286 | { |
287 | data() = VectorHelper<VectorType>::andnot_(mm128_reinterpret_cast<VectorType>(k.data()), data()); | |
288 | } | |
289 | ||
c017a39f | 290 | template<> Vc_INTRINSIC void Vector<double>::setQnan() |
f22341db | 291 | { |
292 | data() = _mm_setallone_pd(); | |
293 | } | |
c017a39f | 294 | template<> Vc_INTRINSIC void Vector<double>::setQnan(Mask::Argument k) |
f22341db | 295 | { |
296 | data() = _mm_or_pd(data(), k.dataD()); | |
297 | } | |
c017a39f | 298 | template<> Vc_INTRINSIC void Vector<float>::setQnan() |
f22341db | 299 | { |
300 | data() = _mm_setallone_ps(); | |
301 | } | |
c017a39f | 302 | template<> Vc_INTRINSIC void Vector<float>::setQnan(Mask::Argument k) |
f22341db | 303 | { |
304 | data() = _mm_or_ps(data(), k.data()); | |
305 | } | |
c017a39f | 306 | template<> Vc_INTRINSIC void Vector<float8>::setQnan() |
f22341db | 307 | { |
308 | d.v()[0] = _mm_setallone_ps(); | |
309 | d.v()[1] = _mm_setallone_ps(); | |
310 | } | |
c017a39f | 311 | template<> Vc_INTRINSIC void Vector<float8>::setQnan(Mask::Argument k) |
f22341db | 312 | { |
313 | d.v()[0] = _mm_or_ps(d.v()[0], k.data()[0]); | |
314 | d.v()[1] = _mm_or_ps(d.v()[1], k.data()[1]); | |
315 | } | |
316 | ||
317 | /////////////////////////////////////////////////////////////////////////////////////////// | |
318 | // stores {{{1 | |
c017a39f | 319 | template<typename T> Vc_INTRINSIC void Vector<T>::store(EntryType *mem) const |
f22341db | 320 | { |
321 | VectorHelper<VectorType>::store(mem, data(), Aligned); | |
322 | } | |
323 | ||
c017a39f | 324 | template<typename T> Vc_INTRINSIC void Vector<T>::store(EntryType *mem, const Mask &mask) const |
f22341db | 325 | { |
326 | VectorHelper<VectorType>::store(mem, data(), mm128_reinterpret_cast<VectorType>(mask.data()), Aligned); | |
327 | } | |
328 | ||
c017a39f | 329 | template<typename T> template<typename A> Vc_INTRINSIC void Vector<T>::store(EntryType *mem, A align) const |
f22341db | 330 | { |
331 | VectorHelper<VectorType>::store(mem, data(), align); | |
332 | } | |
333 | ||
c017a39f | 334 | template<typename T> template<typename A> Vc_INTRINSIC void Vector<T>::store(EntryType *mem, const Mask &mask, A align) const |
f22341db | 335 | { |
336 | HV::store(mem, data(), mm128_reinterpret_cast<VectorType>(mask.data()), align); | |
337 | } | |
338 | ||
339 | /////////////////////////////////////////////////////////////////////////////////////////// | |
340 | // division {{{1 | |
c017a39f | 341 | template<typename T> Vc_INTRINSIC Vector<T> &WriteMaskedVector<T>::operator/=(const Vector<T> &x) |
f22341db | 342 | { |
343 | return operator=(*vec / x); | |
344 | } | |
c017a39f | 345 | template<> Vc_INTRINSIC int_v &WriteMaskedVector<int>::operator/=(const int_v &x) |
f22341db | 346 | { |
347 | Vc_foreach_bit (int i, mask) { | |
348 | vec->d.m(i) /= x.d.m(i); | |
349 | } | |
350 | return *vec; | |
351 | } | |
c017a39f | 352 | template<> Vc_INTRINSIC uint_v &WriteMaskedVector<unsigned int>::operator/=(const uint_v &x) |
f22341db | 353 | { |
354 | Vc_foreach_bit (int i, mask) { | |
355 | vec->d.m(i) /= x.d.m(i); | |
356 | } | |
357 | return *vec; | |
358 | } | |
c017a39f | 359 | template<> Vc_INTRINSIC short_v &WriteMaskedVector<short>::operator/=(const short_v &x) |
f22341db | 360 | { |
361 | Vc_foreach_bit (int i, mask) { | |
362 | vec->d.m(i) /= x.d.m(i); | |
363 | } | |
364 | return *vec; | |
365 | } | |
c017a39f | 366 | template<> Vc_INTRINSIC ushort_v &WriteMaskedVector<unsigned short>::operator/=(const ushort_v &x) |
f22341db | 367 | { |
368 | Vc_foreach_bit (int i, mask) { | |
369 | vec->d.m(i) /= x.d.m(i); | |
370 | } | |
371 | return *vec; | |
372 | } | |
373 | ||
374 | template<typename T> inline Vector<T> &Vector<T>::operator/=(EntryType x) | |
375 | { | |
376 | if (VectorTraits<T>::HasVectorDivision) { | |
377 | return operator/=(Vector<T>(x)); | |
378 | } | |
379 | for_all_vector_entries(i, | |
380 | d.m(i) /= x; | |
381 | ); | |
382 | return *this; | |
383 | } | |
384 | ||
c017a39f | 385 | template<typename T> template<typename TT> Vc_INTRINSIC Vc_PURE VC_EXACT_TYPE(TT, typename DetermineEntryType<T>::Type, Vector<T>) Vector<T>::operator/(TT x) const |
f22341db | 386 | { |
387 | if (VectorTraits<T>::HasVectorDivision) { | |
388 | return operator/(Vector<T>(x)); | |
389 | } | |
390 | Vector<T> r; | |
391 | for_all_vector_entries(i, | |
392 | r.d.m(i) = d.m(i) / x; | |
393 | ); | |
394 | return r; | |
395 | } | |
396 | ||
397 | template<typename T> inline Vector<T> &Vector<T>::operator/=(const Vector<T> &x) | |
398 | { | |
399 | for_all_vector_entries(i, | |
400 | d.m(i) /= x.d.m(i); | |
401 | ); | |
402 | return *this; | |
403 | } | |
404 | ||
c017a39f | 405 | template<typename T> inline Vc_PURE Vector<T> Vector<T>::operator/(const Vector<T> &x) const |
f22341db | 406 | { |
407 | Vector<T> r; | |
408 | for_all_vector_entries(i, | |
409 | r.d.m(i) = d.m(i) / x.d.m(i); | |
410 | ); | |
411 | return r; | |
412 | } | |
413 | ||
414 | template<> inline Vector<short> &Vector<short>::operator/=(const Vector<short> &x) | |
415 | { | |
416 | __m128 lo = _mm_cvtepi32_ps(VectorHelper<short>::expand0(d.v())); | |
417 | __m128 hi = _mm_cvtepi32_ps(VectorHelper<short>::expand1(d.v())); | |
418 | lo = _mm_div_ps(lo, _mm_cvtepi32_ps(VectorHelper<short>::expand0(x.d.v()))); | |
419 | hi = _mm_div_ps(hi, _mm_cvtepi32_ps(VectorHelper<short>::expand1(x.d.v()))); | |
420 | d.v() = _mm_packs_epi32(_mm_cvtps_epi32(lo), _mm_cvtps_epi32(hi)); | |
421 | return *this; | |
422 | } | |
423 | ||
c017a39f | 424 | template<> inline Vc_PURE Vector<short> Vector<short>::operator/(const Vector<short> &x) const |
f22341db | 425 | { |
426 | __m128 lo = _mm_cvtepi32_ps(VectorHelper<short>::expand0(d.v())); | |
427 | __m128 hi = _mm_cvtepi32_ps(VectorHelper<short>::expand1(d.v())); | |
428 | lo = _mm_div_ps(lo, _mm_cvtepi32_ps(VectorHelper<short>::expand0(x.d.v()))); | |
429 | hi = _mm_div_ps(hi, _mm_cvtepi32_ps(VectorHelper<short>::expand1(x.d.v()))); | |
430 | return _mm_packs_epi32(_mm_cvtps_epi32(lo), _mm_cvtps_epi32(hi)); | |
431 | } | |
432 | ||
433 | template<> inline Vector<unsigned short> &Vector<unsigned short>::operator/=(const Vector<unsigned short> &x) | |
434 | { | |
435 | __m128 lo = _mm_cvtepi32_ps(VectorHelper<short>::expand0(d.v())); | |
436 | __m128 hi = _mm_cvtepi32_ps(VectorHelper<short>::expand1(d.v())); | |
437 | lo = _mm_div_ps(lo, _mm_cvtepi32_ps(VectorHelper<short>::expand0(x.d.v()))); | |
438 | hi = _mm_div_ps(hi, _mm_cvtepi32_ps(VectorHelper<short>::expand1(x.d.v()))); | |
439 | d.v() = _mm_packs_epi32(_mm_cvtps_epi32(lo), _mm_cvtps_epi32(hi)); | |
440 | return *this; | |
441 | } | |
442 | ||
c017a39f | 443 | template<> Vc_ALWAYS_INLINE Vc_PURE Vector<unsigned short> Vector<unsigned short>::operator/(const Vector<unsigned short> &x) const |
f22341db | 444 | { |
445 | __m128 lo = _mm_cvtepi32_ps(VectorHelper<short>::expand0(d.v())); | |
446 | __m128 hi = _mm_cvtepi32_ps(VectorHelper<short>::expand1(d.v())); | |
447 | lo = _mm_div_ps(lo, _mm_cvtepi32_ps(VectorHelper<short>::expand0(x.d.v()))); | |
448 | hi = _mm_div_ps(hi, _mm_cvtepi32_ps(VectorHelper<short>::expand1(x.d.v()))); | |
449 | return _mm_packs_epi32(_mm_cvtps_epi32(lo), _mm_cvtps_epi32(hi)); | |
450 | } | |
451 | ||
c017a39f | 452 | template<> Vc_ALWAYS_INLINE Vector<float> &Vector<float>::operator/=(const Vector<float> &x) |
f22341db | 453 | { |
454 | d.v() = _mm_div_ps(d.v(), x.d.v()); | |
455 | return *this; | |
456 | } | |
457 | ||
c017a39f | 458 | template<> Vc_ALWAYS_INLINE Vc_PURE Vector<float> Vector<float>::operator/(const Vector<float> &x) const |
f22341db | 459 | { |
460 | return _mm_div_ps(d.v(), x.d.v()); | |
461 | } | |
462 | ||
c017a39f | 463 | template<> Vc_ALWAYS_INLINE Vector<float8> &Vector<float8>::operator/=(const Vector<float8> &x) |
f22341db | 464 | { |
465 | d.v()[0] = _mm_div_ps(d.v()[0], x.d.v()[0]); | |
466 | d.v()[1] = _mm_div_ps(d.v()[1], x.d.v()[1]); | |
467 | return *this; | |
468 | } | |
469 | ||
c017a39f | 470 | template<> Vc_ALWAYS_INLINE Vc_PURE Vector<float8> Vector<float8>::operator/(const Vector<float8> &x) const |
f22341db | 471 | { |
472 | Vector<float8> r; | |
473 | r.d.v()[0] = _mm_div_ps(d.v()[0], x.d.v()[0]); | |
474 | r.d.v()[1] = _mm_div_ps(d.v()[1], x.d.v()[1]); | |
475 | return r; | |
476 | } | |
477 | ||
c017a39f | 478 | template<> Vc_ALWAYS_INLINE Vector<double> &Vector<double>::operator/=(const Vector<double> &x) |
f22341db | 479 | { |
480 | d.v() = _mm_div_pd(d.v(), x.d.v()); | |
481 | return *this; | |
482 | } | |
483 | ||
c017a39f | 484 | template<> Vc_ALWAYS_INLINE Vc_PURE Vector<double> Vector<double>::operator/(const Vector<double> &x) const |
f22341db | 485 | { |
486 | return _mm_div_pd(d.v(), x.d.v()); | |
487 | } | |
488 | ||
489 | /////////////////////////////////////////////////////////////////////////////////////////// | |
490 | // operator- {{{1 | |
c017a39f | 491 | template<> Vc_ALWAYS_INLINE Vector<double> Vc_PURE Vc_FLATTEN Vector<double>::operator-() const |
f22341db | 492 | { |
493 | return _mm_xor_pd(d.v(), _mm_setsignmask_pd()); | |
494 | } | |
c017a39f | 495 | template<> Vc_ALWAYS_INLINE Vector<float> Vc_PURE Vc_FLATTEN Vector<float>::operator-() const |
f22341db | 496 | { |
497 | return _mm_xor_ps(d.v(), _mm_setsignmask_ps()); | |
498 | } | |
c017a39f | 499 | template<> Vc_ALWAYS_INLINE Vector<float8> Vc_PURE Vc_FLATTEN Vector<float8>::operator-() const |
f22341db | 500 | { |
501 | return M256::create( | |
502 | _mm_xor_ps(d.v()[0], _mm_setsignmask_ps()), | |
503 | _mm_xor_ps(d.v()[1], _mm_setsignmask_ps())); | |
504 | } | |
c017a39f | 505 | template<> Vc_ALWAYS_INLINE Vector<int> Vc_PURE Vc_FLATTEN Vector<int>::operator-() const |
f22341db | 506 | { |
507 | #ifdef VC_IMPL_SSSE3 | |
508 | return _mm_sign_epi32(d.v(), _mm_setallone_si128()); | |
509 | #else | |
510 | return _mm_add_epi32(_mm_xor_si128(d.v(), _mm_setallone_si128()), _mm_setone_epi32()); | |
511 | #endif | |
512 | } | |
c017a39f | 513 | template<> Vc_ALWAYS_INLINE Vector<int> Vc_PURE Vc_FLATTEN Vector<unsigned int>::operator-() const |
f22341db | 514 | { |
515 | #ifdef VC_IMPL_SSSE3 | |
516 | return _mm_sign_epi32(d.v(), _mm_setallone_si128()); | |
517 | #else | |
518 | return _mm_add_epi32(_mm_xor_si128(d.v(), _mm_setallone_si128()), _mm_setone_epi32()); | |
519 | #endif | |
520 | } | |
c017a39f | 521 | template<> Vc_ALWAYS_INLINE Vector<short> Vc_PURE Vc_FLATTEN Vector<short>::operator-() const |
f22341db | 522 | { |
523 | #ifdef VC_IMPL_SSSE3 | |
524 | return _mm_sign_epi16(d.v(), _mm_setallone_si128()); | |
525 | #else | |
526 | return _mm_mullo_epi16(d.v(), _mm_setallone_si128()); | |
527 | #endif | |
528 | } | |
c017a39f | 529 | template<> Vc_ALWAYS_INLINE Vector<short> Vc_PURE Vc_FLATTEN Vector<unsigned short>::operator-() const |
f22341db | 530 | { |
531 | #ifdef VC_IMPL_SSSE3 | |
532 | return _mm_sign_epi16(d.v(), _mm_setallone_si128()); | |
533 | #else | |
534 | return _mm_mullo_epi16(d.v(), _mm_setallone_si128()); | |
535 | #endif | |
536 | } | |
537 | ||
538 | /////////////////////////////////////////////////////////////////////////////////////////// | |
539 | // integer ops {{{1 | |
540 | #define OP_IMPL(T, symbol, fun) \ | |
c017a39f | 541 | template<> Vc_ALWAYS_INLINE Vector<T> &Vector<T>::operator symbol##=(const Vector<T> &x) \ |
f22341db | 542 | { \ |
543 | d.v() = VectorHelper<T>::fun(d.v(), x.d.v()); \ | |
544 | return *this; \ | |
545 | } \ | |
c017a39f | 546 | template<> Vc_ALWAYS_INLINE Vc_PURE Vector<T> Vector<T>::operator symbol(const Vector<T> &x) const \ |
f22341db | 547 | { \ |
548 | return VectorHelper<T>::fun(d.v(), x.d.v()); \ | |
549 | } | |
550 | OP_IMPL(int, &, and_) | |
551 | OP_IMPL(int, |, or_) | |
552 | OP_IMPL(int, ^, xor_) | |
553 | OP_IMPL(unsigned int, &, and_) | |
554 | OP_IMPL(unsigned int, |, or_) | |
555 | OP_IMPL(unsigned int, ^, xor_) | |
556 | OP_IMPL(short, &, and_) | |
557 | OP_IMPL(short, |, or_) | |
558 | OP_IMPL(short, ^, xor_) | |
559 | OP_IMPL(unsigned short, &, and_) | |
560 | OP_IMPL(unsigned short, |, or_) | |
561 | OP_IMPL(unsigned short, ^, xor_) | |
562 | OP_IMPL(float, &, and_) | |
563 | OP_IMPL(float, |, or_) | |
564 | OP_IMPL(float, ^, xor_) | |
565 | OP_IMPL(float8, &, and_) | |
566 | OP_IMPL(float8, |, or_) | |
567 | OP_IMPL(float8, ^, xor_) | |
568 | OP_IMPL(double, &, and_) | |
569 | OP_IMPL(double, |, or_) | |
570 | OP_IMPL(double, ^, xor_) | |
571 | #undef OP_IMPL | |
572 | ||
573 | #ifdef VC_IMPL_XOP | |
c017a39f | 574 | static Vc_INTRINSIC Vc_CONST __m128i shiftLeft (const int_v &value, const int_v &count) { return _mm_sha_epi32(value.data(), count.data()); } |
575 | static Vc_INTRINSIC Vc_CONST __m128i shiftLeft (const uint_v &value, const uint_v &count) { return _mm_shl_epi32(value.data(), count.data()); } | |
576 | static Vc_INTRINSIC Vc_CONST __m128i shiftLeft (const short_v &value, const short_v &count) { return _mm_sha_epi16(value.data(), count.data()); } | |
577 | static Vc_INTRINSIC Vc_CONST __m128i shiftLeft (const ushort_v &value, const ushort_v &count) { return _mm_shl_epi16(value.data(), count.data()); } | |
578 | static Vc_INTRINSIC Vc_CONST __m128i shiftRight(const int_v &value, const int_v &count) { return shiftLeft(value, -count ); } | |
579 | static Vc_INTRINSIC Vc_CONST __m128i shiftRight(const uint_v &value, const uint_v &count) { return shiftLeft(value, uint_v(-count)); } | |
580 | static Vc_INTRINSIC Vc_CONST __m128i shiftRight(const short_v &value, const short_v &count) { return shiftLeft(value, -count ); } | |
581 | static Vc_INTRINSIC Vc_CONST __m128i shiftRight(const ushort_v &value, const ushort_v &count) { return shiftLeft(value, ushort_v(-count)); } | |
f22341db | 582 | |
583 | #define _VC_OP(T, symbol, impl) \ | |
c017a39f | 584 | template<> Vc_INTRINSIC T &T::operator symbol##=(T::AsArg shift) \ |
f22341db | 585 | { \ |
586 | d.v() = impl(*this, shift); \ | |
587 | return *this; \ | |
588 | } \ | |
c017a39f | 589 | template<> Vc_INTRINSIC Vc_PURE T T::operator symbol (T::AsArg shift) const \ |
f22341db | 590 | { \ |
591 | return impl(*this, shift); \ | |
592 | } | |
593 | VC_APPLY_2(VC_LIST_INT_VECTOR_TYPES, _VC_OP, <<, shiftLeft) | |
594 | VC_APPLY_2(VC_LIST_INT_VECTOR_TYPES, _VC_OP, >>, shiftRight) | |
595 | #undef _VC_OP | |
596 | #else | |
c017a39f | 597 | #if defined(VC_GCC) && VC_GCC == 0x40600 && defined(VC_IMPL_XOP) |
f22341db | 598 | #define VC_WORKAROUND __attribute__((optimize("no-tree-vectorize"),weak)) |
599 | #else | |
c017a39f | 600 | #define VC_WORKAROUND Vc_INTRINSIC |
f22341db | 601 | #endif |
602 | ||
603 | #define OP_IMPL(T, symbol) \ | |
c017a39f | 604 | template<> VC_WORKAROUND Vector<T> &Vector<T>::operator symbol##=(Vector<T>::AsArg x) \ |
f22341db | 605 | { \ |
606 | for_all_vector_entries(i, \ | |
607 | d.m(i) symbol##= x.d.m(i); \ | |
608 | ); \ | |
609 | return *this; \ | |
610 | } \ | |
c017a39f | 611 | template<> inline Vc_PURE Vector<T> Vector<T>::operator symbol(Vector<T>::AsArg x) const \ |
f22341db | 612 | { \ |
613 | Vector<T> r; \ | |
614 | for_all_vector_entries(i, \ | |
615 | r.d.m(i) = d.m(i) symbol x.d.m(i); \ | |
616 | ); \ | |
617 | return r; \ | |
618 | } | |
619 | OP_IMPL(int, <<) | |
620 | OP_IMPL(int, >>) | |
621 | OP_IMPL(unsigned int, <<) | |
622 | OP_IMPL(unsigned int, >>) | |
623 | OP_IMPL(short, <<) | |
624 | OP_IMPL(short, >>) | |
625 | OP_IMPL(unsigned short, <<) | |
626 | OP_IMPL(unsigned short, >>) | |
627 | #undef OP_IMPL | |
628 | #undef VC_WORKAROUND | |
f22341db | 629 | #endif |
630 | ||
c017a39f | 631 | template<typename T> Vc_ALWAYS_INLINE Vector<T> &Vector<T>::operator>>=(int shift) { |
f22341db | 632 | d.v() = VectorHelper<T>::shiftRight(d.v(), shift); |
633 | return *this; | |
634 | } | |
c017a39f | 635 | template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T> Vector<T>::operator>>(int shift) const { |
f22341db | 636 | return VectorHelper<T>::shiftRight(d.v(), shift); |
637 | } | |
c017a39f | 638 | template<typename T> Vc_ALWAYS_INLINE Vector<T> &Vector<T>::operator<<=(int shift) { |
f22341db | 639 | d.v() = VectorHelper<T>::shiftLeft(d.v(), shift); |
640 | return *this; | |
641 | } | |
c017a39f | 642 | template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T> Vector<T>::operator<<(int shift) const { |
f22341db | 643 | return VectorHelper<T>::shiftLeft(d.v(), shift); |
644 | } | |
645 | ||
646 | /////////////////////////////////////////////////////////////////////////////////////////// | |
647 | // swizzles {{{1 | |
c017a39f | 648 | template<typename T> Vc_INTRINSIC Vc_PURE const Vector<T> &Vector<T>::abcd() const { return *this; } |
649 | template<typename T> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::cdab() const { return Mem::permute<X2, X3, X0, X1>(data()); } | |
650 | template<typename T> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::badc() const { return Mem::permute<X1, X0, X3, X2>(data()); } | |
651 | template<typename T> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::aaaa() const { return Mem::permute<X0, X0, X0, X0>(data()); } | |
652 | template<typename T> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::bbbb() const { return Mem::permute<X1, X1, X1, X1>(data()); } | |
653 | template<typename T> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::cccc() const { return Mem::permute<X2, X2, X2, X2>(data()); } | |
654 | template<typename T> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::dddd() const { return Mem::permute<X3, X3, X3, X3>(data()); } | |
655 | template<typename T> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::bcad() const { return Mem::permute<X1, X2, X0, X3>(data()); } | |
656 | template<typename T> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::bcda() const { return Mem::permute<X1, X2, X3, X0>(data()); } | |
657 | template<typename T> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::dabc() const { return Mem::permute<X3, X0, X1, X2>(data()); } | |
658 | template<typename T> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::acbd() const { return Mem::permute<X0, X2, X1, X3>(data()); } | |
659 | template<typename T> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::dbca() const { return Mem::permute<X3, X1, X2, X0>(data()); } | |
660 | template<typename T> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::dcba() const { return Mem::permute<X3, X2, X1, X0>(data()); } | |
661 | ||
662 | template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector<sfloat>::cdab() const { return M256::create(Mem::permute<X2, X3, X0, X1>(d.v()[0]), Mem::permute<X2, X3, X0, X1>(d.v()[1])); } | |
663 | template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector<sfloat>::badc() const { return M256::create(Mem::permute<X1, X0, X3, X2>(d.v()[0]), Mem::permute<X1, X0, X3, X2>(d.v()[1])); } | |
664 | template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector<sfloat>::aaaa() const { return M256::create(Mem::permute<X0, X0, X0, X0>(d.v()[0]), Mem::permute<X0, X0, X0, X0>(d.v()[1])); } | |
665 | template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector<sfloat>::bbbb() const { return M256::create(Mem::permute<X1, X1, X1, X1>(d.v()[0]), Mem::permute<X1, X1, X1, X1>(d.v()[1])); } | |
666 | template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector<sfloat>::cccc() const { return M256::create(Mem::permute<X2, X2, X2, X2>(d.v()[0]), Mem::permute<X2, X2, X2, X2>(d.v()[1])); } | |
667 | template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector<sfloat>::dddd() const { return M256::create(Mem::permute<X3, X3, X3, X3>(d.v()[0]), Mem::permute<X3, X3, X3, X3>(d.v()[1])); } | |
668 | template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector<sfloat>::bcad() const { return M256::create(Mem::permute<X1, X2, X0, X3>(d.v()[0]), Mem::permute<X1, X2, X0, X3>(d.v()[1])); } | |
669 | template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector<sfloat>::bcda() const { return M256::create(Mem::permute<X1, X2, X3, X0>(d.v()[0]), Mem::permute<X1, X2, X3, X0>(d.v()[1])); } | |
670 | template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector<sfloat>::dabc() const { return M256::create(Mem::permute<X3, X0, X1, X2>(d.v()[0]), Mem::permute<X3, X0, X1, X2>(d.v()[1])); } | |
671 | template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector<sfloat>::acbd() const { return M256::create(Mem::permute<X0, X2, X1, X3>(d.v()[0]), Mem::permute<X0, X2, X1, X3>(d.v()[1])); } | |
672 | template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector<sfloat>::dbca() const { return M256::create(Mem::permute<X3, X1, X2, X0>(d.v()[0]), Mem::permute<X3, X1, X2, X0>(d.v()[1])); } | |
673 | template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector<sfloat>::dcba() const { return M256::create(Mem::permute<X3, X2, X1, X0>(d.v()[0]), Mem::permute<X3, X2, X1, X0>(d.v()[1])); } | |
f22341db | 674 | |
// Swizzles for the 8-entry 16-bit integer vectors (short_v, ushort_v): the
// __m128i is treated as two groups of four 16-bit lanes and the 4-lane
// permutation is applied within each group (indices X0..X3 and X4..X7),
// mirroring the per-half behavior of the sfloat_v swizzles above.
#define VC_SWIZZLES_16BIT_IMPL(T) \
template<> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::cdab() const { return Mem::permute<X2, X3, X0, X1, X6, X7, X4, X5>(data()); } \
template<> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::badc() const { return Mem::permute<X1, X0, X3, X2, X5, X4, X7, X6>(data()); } \
template<> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::aaaa() const { return Mem::permute<X0, X0, X0, X0, X4, X4, X4, X4>(data()); } \
template<> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::bbbb() const { return Mem::permute<X1, X1, X1, X1, X5, X5, X5, X5>(data()); } \
template<> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::cccc() const { return Mem::permute<X2, X2, X2, X2, X6, X6, X6, X6>(data()); } \
template<> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::dddd() const { return Mem::permute<X3, X3, X3, X3, X7, X7, X7, X7>(data()); } \
template<> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::bcad() const { return Mem::permute<X1, X2, X0, X3, X5, X6, X4, X7>(data()); } \
template<> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::bcda() const { return Mem::permute<X1, X2, X3, X0, X5, X6, X7, X4>(data()); } \
template<> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::dabc() const { return Mem::permute<X3, X0, X1, X2, X7, X4, X5, X6>(data()); } \
template<> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::acbd() const { return Mem::permute<X0, X2, X1, X3, X4, X6, X5, X7>(data()); } \
template<> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::dbca() const { return Mem::permute<X3, X1, X2, X0, X7, X5, X6, X4>(data()); } \
template<> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::dcba() const { return Mem::permute<X3, X2, X1, X0, X7, X6, X5, X4>(data()); }
VC_SWIZZLES_16BIT_IMPL(short)
VC_SWIZZLES_16BIT_IMPL(unsigned short)
#undef VC_SWIZZLES_16BIT_IMPL
691 | ||
692 | // operators {{{1 | |
693 | #include "../common/operators.h" | |
// isNegative {{{1
// Returns a mask that is all-ones in every lane whose IEEE sign bit is set
// (note: this is true for -0.0 as well, unlike a `< 0` comparison).
// Technique: isolate the sign bit with a bitwise AND against the sign-bit
// constant, then arithmetic-shift right by 31 to smear it across the lane.
template<> Vc_INTRINSIC Vc_PURE float_m float_v::isNegative() const
{
    return sse_cast<__m128>(_mm_srai_epi32(sse_cast<__m128i>(_mm_and_ps(_mm_setsignmask_ps(), d.v())), 31));
}
template<> Vc_INTRINSIC Vc_PURE sfloat_m sfloat_v::isNegative() const
{
    // Same as float_v, applied to both __m128 halves of the M256.
    return M256::create(
            sse_cast<__m128>(_mm_srai_epi32(sse_cast<__m128i>(_mm_and_ps(_mm_setsignmask_ps(), d.v()[0])), 31)),
            sse_cast<__m128>(_mm_srai_epi32(sse_cast<__m128i>(_mm_and_ps(_mm_setsignmask_ps(), d.v()[1])), 31))
            );
}
template<> Vc_INTRINSIC Vc_PURE double_m double_v::isNegative() const
{
    // SSE2 has no 64-bit arithmetic shift, so shift the 32-bit halves and
    // broadcast the high (sign-carrying) half of each double with the permute.
    return Mem::permute<X1, X1, X3, X3>(sse_cast<__m128>(
        _mm_srai_epi32(sse_cast<__m128i>(_mm_and_pd(_mm_setsignmask_pd(), d.v())), 31)
        ));
}
// gathers {{{1
// Gathering constructors: each one simply forwards to the matching gather()
// overload below. The masked variants zero-initialize the vector first
// (d(HT::zero())) so that lanes not selected by the mask hold a well-defined
// value (zero) instead of garbage.
template<typename T> template<typename IndexT> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *mem, const IndexT *indexes)
{
    gather(mem, indexes);
}
template<typename T> template<typename IndexT> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector<IndexT>) indexes)
{
    gather(mem, indexes);
}

template<typename T> template<typename IndexT> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *mem, const IndexT *indexes, MaskArg mask)
    : d(HT::zero())
{
    gather(mem, indexes, mask);
}

template<typename T> template<typename IndexT> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector<IndexT>) indexes, MaskArg mask)
    : d(HT::zero())
{
    gather(mem, indexes, mask);
}

// Struct-member gathers: load array[indexes[i]].*member1 (optionally through
// a second nested member, or through a pointer member with a second index).
template<typename T> template<typename S1, typename IT> Vc_ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
{
    gather(array, member1, indexes);
}
template<typename T> template<typename S1, typename IT> Vc_ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask)
    : d(HT::zero())
{
    gather(array, member1, indexes, mask);
}
template<typename T> template<typename S1, typename S2, typename IT> Vc_ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
{
    gather(array, member1, member2, indexes);
}
template<typename T> template<typename S1, typename S2, typename IT> Vc_ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask)
    : d(HT::zero())
{
    gather(array, member1, member2, indexes, mask);
}
template<typename T> template<typename S1, typename IT1, typename IT2> Vc_ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
{
    gather(array, ptrMember1, outerIndexes, innerIndexes);
}
template<typename T> template<typename S1, typename IT1, typename IT2> Vc_ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask)
    : d(HT::zero())
{
    gather(array, ptrMember1, outerIndexes, innerIndexes, mask);
}
761 | ||
// Compile-time guard: when the index argument is itself a Vc vector, it must
// provide at least as many entries as the vector being gathered/scattered.
// For plain pointer/array index types nothing can be checked statically, so
// the primary template's check() is a no-op.
template<typename T, size_t Size> struct IndexSizeChecker { static void check() {} };
template<typename T, size_t Size> struct IndexSizeChecker<Vector<T>, Size>
{
    static void check() {
        VC_STATIC_ASSERT(Vector<T>::Size >= Size, IndexVector_must_have_greater_or_equal_number_of_entries);
    }
};
// Unmasked gathers from a flat array: lane i is loaded from mem[indexes[i]].
// SSE has no gather instruction, so each specialization assembles the vector
// with the _mm_setr_* intrinsic matching its entry type; float8 (sfloat_v)
// fills its two __m128 halves separately.
template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<double>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
{
    IndexSizeChecker<Index, Size>::check();
    d.v() = _mm_setr_pd(mem[indexes[0]], mem[indexes[1]]);
}
template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<float>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
{
    IndexSizeChecker<Index, Size>::check();
    d.v() = _mm_setr_ps(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]);
}
template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<float8>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
{
    IndexSizeChecker<Index, Size>::check();
    d.v()[0] = _mm_setr_ps(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]);
    d.v()[1] = _mm_setr_ps(mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
}
template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<int>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
{
    IndexSizeChecker<Index, Size>::check();
    d.v() = _mm_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]);
}
template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned int>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
{
    IndexSizeChecker<Index, Size>::check();
    d.v() = _mm_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]);
}
template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<short>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
{
    IndexSizeChecker<Index, Size>::check();
    d.v() = _mm_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]],
            mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
}
template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned short>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
{
    IndexSizeChecker<Index, Size>::check();
    d.v() = _mm_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]],
            mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
}
807 | ||
#ifdef VC_USE_SET_GATHERS
// Masked gather implemented as a full (unmasked) gather: the indexes of
// inactive lanes are first zeroed so every lane reads a valid address
// (mem[0]), then the gathered vector is blended into *this through the
// write-mask, leaving inactive lanes untouched.
template<typename T> template<typename IT> Vc_ALWAYS_INLINE void Vector<T>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector<IT>) indexes, MaskArg mask)
{
    IndexSizeChecker<Vector<IT>, Size>::check();
    Vector<IT> indexesTmp = indexes;
    indexesTmp.setZero(!static_cast<typename Vector<IT>::Mask>(mask));
    (*this)(mask) = Vector<T>(mem, indexesTmp);
}
#endif
817 | ||
// Masked-gather kernels. The including function must #define ith_value(i)
// to the expression that loads element i (and #undef it afterwards); the
// kernel assigns d.m(i) = ith_value(i) for every lane selected by `mask`.
#ifdef VC_USE_BSF_GATHERS
// Variant 1: iterate over the set mask bits with bit-scan-forward, clearing
// each bit as it is consumed.
#define VC_MASKED_GATHER \
    int bits = mask.toInt(); \
    while (bits) { \
        const int i = _bit_scan_forward(bits); \
        bits &= ~(1 << i); /* btr? */ \
        d.m(i) = ith_value(i); \
    }
#elif defined(VC_USE_POPCNT_BSF_GATHERS)
// Variant 2: branch once on the popcount of the mask, then peel bits
// alternately from the top (bit-scan-reverse) and bottom (bit-scan-forward).
// Every switch case intentionally falls through to the next lower count.
#define VC_MASKED_GATHER \
    unsigned int bits = mask.toInt(); \
    unsigned int low, high = 0; \
    switch (mask.count()) { \
    case 8: \
        high = _bit_scan_reverse(bits); \
        d.m(high) = ith_value(high); \
        high = (1 << high); \
    case 7: \
        low = _bit_scan_forward(bits); \
        bits ^= high | (1 << low); \
        d.m(low) = ith_value(low); \
    case 6: \
        high = _bit_scan_reverse(bits); \
        d.m(high) = ith_value(high); \
        high = (1 << high); \
    case 5: \
        low = _bit_scan_forward(bits); \
        bits ^= high | (1 << low); \
        d.m(low) = ith_value(low); \
    case 4: \
        high = _bit_scan_reverse(bits); \
        d.m(high) = ith_value(high); \
        high = (1 << high); \
    case 3: \
        low = _bit_scan_forward(bits); \
        bits ^= high | (1 << low); \
        d.m(low) = ith_value(low); \
    case 2: \
        high = _bit_scan_reverse(bits); \
        d.m(high) = ith_value(high); \
    case 1: \
        low = _bit_scan_forward(bits); \
        d.m(low) = ith_value(low); \
    case 0: \
        break; \
    }
#else
// Variant 3 (default): simple per-entry loop with an early-out for an
// all-false mask.
#define VC_MASKED_GATHER \
    if (mask.isEmpty()) { \
        return; \
    } \
    for_all_vector_entries(i, \
        if (mask[i]) d.m(i) = ith_value(i); \
    );
#endif
873 | ||
// Masked gather from a flat array: lane i becomes mem[indexes[i]] wherever
// the mask is set; the per-lane loop is delegated to the VC_MASKED_GATHER
// kernel selected above.
template<typename T> template<typename Index>
Vc_INTRINSIC void Vector<T>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes, MaskArg mask)
{
    IndexSizeChecker<Index, Size>::check();
#define ith_value(_i_) (mem[indexes[_i_]])
    VC_MASKED_GATHER
#undef ith_value
}
882 | ||
// Struct-member gathers: lane i is loaded from array[indexes[i]].*member1.
// One specialization per entry type (same _mm_setr_* pattern as the flat
// array gathers above), plus a mask-aware generic version at the end.
template<> template<typename S1, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<double>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm_setr_pd(array[indexes[0]].*(member1), array[indexes[1]].*(member1));
}
template<> template<typename S1, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<float>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm_setr_ps(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
            array[indexes[3]].*(member1));
}
template<> template<typename S1, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<float8>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v()[0] = _mm_setr_ps(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
            array[indexes[3]].*(member1));
    d.v()[1] = _mm_setr_ps(array[indexes[4]].*(member1), array[indexes[5]].*(member1), array[indexes[6]].*(member1),
            array[indexes[7]].*(member1));
}
template<> template<typename S1, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<int>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm_setr_epi32(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
            array[indexes[3]].*(member1));
}
template<> template<typename S1, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned int>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm_setr_epi32(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
            array[indexes[3]].*(member1));
}
template<> template<typename S1, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<short>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm_setr_epi16(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
            array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1),
            array[indexes[6]].*(member1), array[indexes[7]].*(member1));
}
template<> template<typename S1, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned short>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm_setr_epi16(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
            array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1),
            array[indexes[6]].*(member1), array[indexes[7]].*(member1));
}
// Masked variant: generic over T, delegates to the VC_MASKED_GATHER kernel.
template<typename T> template<typename S1, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask)
{
    IndexSizeChecker<IT, Size>::check();
#define ith_value(_i_) (array[indexes[_i_]].*(member1))
    VC_MASKED_GATHER
#undef ith_value
}
// Nested-member gathers: lane i is loaded from
// array[indexes[i]].*member1.*member2 (one level of struct nesting deeper
// than the member1 gathers above). Masked generic version at the end.
template<> template<typename S1, typename S2, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<double>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm_setr_pd(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2));
}
template<> template<typename S1, typename S2, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<float>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm_setr_ps(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
            array[indexes[3]].*(member1).*(member2));
}
template<> template<typename S1, typename S2, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<float8>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v()[0] = _mm_setr_ps(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2),
            array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2));
    d.v()[1] = _mm_setr_ps(array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
            array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
}
template<> template<typename S1, typename S2, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<int>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm_setr_epi32(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2),
            array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2));
}
template<> template<typename S1, typename S2, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned int>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm_setr_epi32(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2),
            array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2));
}
template<> template<typename S1, typename S2, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<short>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm_setr_epi16(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
            array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
            array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
}
template<> template<typename S1, typename S2, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned short>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm_setr_epi16(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
            array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
            array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
}
// Masked variant: generic over T, delegates to the VC_MASKED_GATHER kernel.
template<typename T> template<typename S1, typename S2, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask)
{
    IndexSizeChecker<IT, Size>::check();
#define ith_value(_i_) (array[indexes[_i_]].*(member1).*(member2))
    VC_MASKED_GATHER
#undef ith_value
}
// Pointer-member gathers (two-level indirection): lane i is loaded from
// (array[outerIndexes[i]].*ptrMember1)[innerIndexes[i]], i.e. outerIndexes
// selects the struct and innerIndexes the element in the buffer it points to.
// Both index vectors must provide Size entries. Masked generic version last.
template<> template<typename S1, typename IT1, typename IT2>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<double>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
{
    IndexSizeChecker<IT1, Size>::check();
    IndexSizeChecker<IT2, Size>::check();
    d.v() = _mm_setr_pd((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]],
            (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]]);
}
template<> template<typename S1, typename IT1, typename IT2>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<float>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
{
    IndexSizeChecker<IT1, Size>::check();
    IndexSizeChecker<IT2, Size>::check();
    d.v() = _mm_setr_ps((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
            (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]);
}
template<> template<typename S1, typename IT1, typename IT2>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<float8>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
{
    IndexSizeChecker<IT1, Size>::check();
    IndexSizeChecker<IT2, Size>::check();
    d.v()[0] = _mm_setr_ps((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
            (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]);
    d.v()[1] = _mm_setr_ps((array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
            (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
}
template<> template<typename S1, typename IT1, typename IT2>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<int>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
{
    IndexSizeChecker<IT1, Size>::check();
    IndexSizeChecker<IT2, Size>::check();
    d.v() = _mm_setr_epi32((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
            (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]);
}
template<> template<typename S1, typename IT1, typename IT2>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned int>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
{
    IndexSizeChecker<IT1, Size>::check();
    IndexSizeChecker<IT2, Size>::check();
    d.v() = _mm_setr_epi32((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
            (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]);
}
template<> template<typename S1, typename IT1, typename IT2>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<short>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
{
    IndexSizeChecker<IT1, Size>::check();
    IndexSizeChecker<IT2, Size>::check();
    d.v() = _mm_setr_epi16((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
            (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]],
            (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
            (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
}
template<> template<typename S1, typename IT1, typename IT2>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned short>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
{
    IndexSizeChecker<IT1, Size>::check();
    IndexSizeChecker<IT2, Size>::check();
    d.v() = _mm_setr_epi16((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
            (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]],
            (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
            (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
}
// Masked variant: generic over T, delegates to the VC_MASKED_GATHER kernel.
template<typename T> template<typename S1, typename IT1, typename IT2>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask)
{
    IndexSizeChecker<IT1, Size>::check();
    IndexSizeChecker<IT2, Size>::check();
#define ith_value(_i_) (array[outerIndexes[_i_]].*(ptrMember1))[innerIndexes[_i_]]
    VC_MASKED_GATHER
#undef ith_value
}
// scatters {{{1
#undef VC_MASKED_GATHER
// Masked-scatter kernels, mirroring the gather kernels above but writing
// ith_value(i) = d.m(i). ith_value(i) must be #defined by the caller to the
// lvalue that receives element i, and #undef'ed afterwards.
#ifdef VC_USE_BSF_SCATTERS
// Variant 1: iterate over set mask bits with bit-scan-forward.
#define VC_MASKED_SCATTER \
    int bits = mask.toInt(); \
    while (bits) { \
        const int i = _bit_scan_forward(bits); \
        bits ^= (1 << i); /* btr? */ \
        ith_value(i) = d.m(i); \
    }
#elif defined(VC_USE_POPCNT_BSF_SCATTERS)
// Variant 2: branch once on popcount, then peel bits alternately from the
// top (bsr) and bottom (bsf). All switch cases fall through intentionally.
#define VC_MASKED_SCATTER \
    unsigned int bits = mask.toInt(); \
    unsigned int low, high = 0; \
    switch (mask.count()) { \
    case 8: \
        high = _bit_scan_reverse(bits); \
        ith_value(high) = d.m(high); \
        high = (1 << high); \
    case 7: \
        low = _bit_scan_forward(bits); \
        bits ^= high | (1 << low); \
        ith_value(low) = d.m(low); \
    case 6: \
        high = _bit_scan_reverse(bits); \
        ith_value(high) = d.m(high); \
        high = (1 << high); \
    case 5: \
        low = _bit_scan_forward(bits); \
        bits ^= high | (1 << low); \
        ith_value(low) = d.m(low); \
    case 4: \
        high = _bit_scan_reverse(bits); \
        ith_value(high) = d.m(high); \
        high = (1 << high); \
    case 3: \
        low = _bit_scan_forward(bits); \
        bits ^= high | (1 << low); \
        ith_value(low) = d.m(low); \
    case 2: \
        high = _bit_scan_reverse(bits); \
        ith_value(high) = d.m(high); \
    case 1: \
        low = _bit_scan_forward(bits); \
        ith_value(low) = d.m(low); \
    case 0: \
        break; \
    }
#else
// Variant 3 (default): simple per-entry loop with an early-out for an
// all-false mask.
#define VC_MASKED_SCATTER \
    if (mask.isEmpty()) { \
        return; \
    } \
    for_all_vector_entries(i, \
        if (mask[i]) ith_value(i) = d.m(i); \
    );
#endif
1131 | ||
c017a39f | 1132 | template<typename T> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) const |
f22341db | 1133 | { |
1134 | for_all_vector_entries(i, | |
1135 | mem[indexes[i]] = d.m(i); | |
1136 | ); | |
1137 | } | |
// Masked scatter to a flat array: only lanes whose mask bit is set are
// written; the loop is delegated to the VC_MASKED_SCATTER kernel above.
template<typename T> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes, MaskArg mask) const
{
#define ith_value(_i_) mem[indexes[_i_]]
    VC_MASKED_SCATTER
#undef ith_value
}
c017a39f | 1144 | template<typename T> template<typename S1, typename IT> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(S1 *array, EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) const |
f22341db | 1145 | { |
1146 | for_all_vector_entries(i, | |
1147 | array[indexes[i]].*(member1) = d.m(i); | |
1148 | ); | |
1149 | } | |
// Masked struct-member scatter: writes array[indexes[i]].*member1 only where
// the mask is set.
template<typename T> template<typename S1, typename IT> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(S1 *array, EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) const
{
#define ith_value(_i_) array[indexes[_i_]].*(member1)
    VC_MASKED_SCATTER
#undef ith_value
}
// Scatter through two levels of member pointers:
// (array[indexes[i]].*member1).*member2 receives entry i.
template<typename T> template<typename S1, typename S2, typename IT> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) const
{
    for_all_vector_entries(i,
            array[indexes[i]].*(member1).*(member2) = d.m(i);
            );
}
// Masked variant of the two-level member scatter: only lanes with mask[i]
// set perform the store.
template<typename T> template<typename S1, typename S2, typename IT> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) const
{
#define ith_value(_i_) array[indexes[_i_]].*(member1).*(member2)
    VC_MASKED_SCATTER
#undef ith_value
}
c017a39f | 1168 | template<typename T> template<typename S1, typename IT1, typename IT2> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(S1 *array, EntryType *S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) const |
f22341db | 1169 | { |
1170 | for_all_vector_entries(i, | |
1171 | (array[innerIndexes[i]].*(ptrMember1))[outerIndexes[i]] = d.m(i); | |
1172 | ); | |
1173 | } | |
// Masked pointer-member scatter: where mask[i] is set, entry i is stored to
// (array[outerIndexes[i]].*ptrMember1)[innerIndexes[i]].
template<typename T> template<typename S1, typename IT1, typename IT2> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(S1 *array, EntryType *S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask) const
{
#define ith_value(_i_) (array[outerIndexes[_i_]].*(ptrMember1))[innerIndexes[_i_]]
    VC_MASKED_SCATTER
#undef ith_value
}
1180 | ||
1181 | /////////////////////////////////////////////////////////////////////////////////////////// | |
1182 | // operator[] {{{1 | |
// Generic element access: read entry `index` through the memory view of the
// vector's data member.  Specializations below provide faster
// register-extract paths for GCC.
template<typename T> Vc_INTRINSIC typename Vector<T>::EntryType Vc_PURE Vector<T>::operator[](size_t index) const
{
    return d.m(index);
}
1187 | #ifdef VC_GCC | |
// double specialization: when the index is a compile-time constant, extract
// directly from the register with an immediate-operand helper; otherwise
// fall back to the memory view.
template<> Vc_INTRINSIC double Vc_PURE Vector<double>::operator[](size_t index) const
{
    if (__builtin_constant_p(index)) {
        return extract_double_imm(d.v(), index);
    }
    return d.m(index);
}
// float specialization: always delegates to the extract_float helper
// (which handles both constant and variable indexes).
template<> Vc_INTRINSIC float Vc_PURE Vector<float>::operator[](size_t index) const
{
    return extract_float(d.v(), index);
}
// float8 (two __m128 halves) specialization: for constant indexes pick the
// half (0-3 -> low, 4-7 -> high) and extract with an immediate; otherwise
// read through memory.
template<> Vc_INTRINSIC float Vc_PURE Vector<float8>::operator[](size_t index) const
{
    if (__builtin_constant_p(index)) {
        if (index < 4) {
            return extract_float_imm(d.v()[0], index);
        }
        return extract_float_imm(d.v()[1], index - 4);
    }
    return d.m(index);
}
// int specialization: for constant indexes use the cheapest available
// register extraction; variable indexes go through memory.
template<> Vc_INTRINSIC int Vc_PURE Vector<int>::operator[](size_t index) const
{
    if (__builtin_constant_p(index)) {
#if VC_GCC >= 0x40601 || !defined(VC_USE_VEX_CODING) // GCC < 4.6.1 incorrectly uses vmovq instead of movq for the following
#ifdef __x86_64__
        // indexes 0/1 fit in the low 64 bit; one movq plus mask/shift.
        if (index == 0) return _mm_cvtsi128_si64(d.v()) & 0xFFFFFFFFull;
        if (index == 1) return _mm_cvtsi128_si64(d.v()) >> 32;
#else
        if (index == 0) return _mm_cvtsi128_si32(d.v());
#endif
#endif
#ifdef VC_IMPL_SSE4_1
        return _mm_extract_epi32(d.v(), index);
#else
        // pre-SSE4.1: shift the wanted element into lane 0, then move to GPR.
        return _mm_cvtsi128_si32(_mm_srli_si128(d.v(), index * 4));
#endif
    }
    return d.m(index);
}
// unsigned int specialization: identical strategy to the int specialization
// above (constant-index register extraction, else memory read).
template<> Vc_INTRINSIC unsigned int Vc_PURE Vector<unsigned int>::operator[](size_t index) const
{
    if (__builtin_constant_p(index)) {
#if VC_GCC >= 0x40601 || !defined(VC_USE_VEX_CODING) // GCC < 4.6.1 incorrectly uses vmovq instead of movq for the following
#ifdef __x86_64__
        if (index == 0) return _mm_cvtsi128_si64(d.v()) & 0xFFFFFFFFull;
        if (index == 1) return _mm_cvtsi128_si64(d.v()) >> 32;
#else
        if (index == 0) return _mm_cvtsi128_si32(d.v());
#endif
#endif
#ifdef VC_IMPL_SSE4_1
        return _mm_extract_epi32(d.v(), index);
#else
        return _mm_cvtsi128_si32(_mm_srli_si128(d.v(), index * 4));
#endif
    }
    return d.m(index);
}
// short specialization: _mm_extract_epi16 requires an immediate index, hence
// the constant-index guard; otherwise read through memory.
template<> Vc_INTRINSIC short Vc_PURE Vector<short>::operator[](size_t index) const
{
    if (__builtin_constant_p(index)) {
        return _mm_extract_epi16(d.v(), index);
    }
    return d.m(index);
}
// unsigned short specialization: same immediate-extract path as short.
template<> Vc_INTRINSIC unsigned short Vc_PURE Vector<unsigned short>::operator[](size_t index) const
{
    if (__builtin_constant_p(index)) {
        return _mm_extract_epi16(d.v(), index);
    }
    return d.m(index);
}
1261 | #endif // GCC | |
1262 | /////////////////////////////////////////////////////////////////////////////////////////// | |
1263 | // horizontal ops {{{1 | |
1264 | #ifndef VC_IMPL_SSE4_1 | |
1265 | // without SSE4.1 integer multiplication is slow and we rather multiply the scalars | |
c017a39f | 1266 | template<> Vc_INTRINSIC Vc_PURE int Vector<int>::product() const |
f22341db | 1267 | { |
1268 | return (d.m(0) * d.m(1)) * (d.m(2) * d.m(3)); | |
1269 | } | |
c017a39f | 1270 | template<> Vc_INTRINSIC Vc_PURE unsigned int Vector<unsigned int>::product() const |
f22341db | 1271 | { |
1272 | return (d.m(0) * d.m(1)) * (d.m(2) * d.m(3)); | |
1273 | } | |
1274 | #endif | |
// Masked horizontal minimum: lanes with the mask clear are filled with the
// type's maximum (the identity for min) so they cannot win, then the
// full-vector min() reduction runs.
template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T>::EntryType Vector<T>::min(MaskArg m) const
{
    Vector<T> tmp = std::numeric_limits<Vector<T> >::max();
    tmp(m) = *this;
    return tmp.min();
}
// Masked horizontal maximum: unmasked lanes are filled with the type's
// minimum (the identity for max) before reducing.
template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T>::EntryType Vector<T>::max(MaskArg m) const
{
    Vector<T> tmp = std::numeric_limits<Vector<T> >::min();
    tmp(m) = *this;
    return tmp.max();
}
// Masked horizontal product: unmasked lanes become 1 (the multiplicative
// identity) before the full-vector product() reduction.
template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T>::EntryType Vector<T>::product(MaskArg m) const
{
    Vector<T> tmp(VectorSpecialInitializerOne::One);
    tmp(m) = *this;
    return tmp.product();
}
// Masked horizontal sum: unmasked lanes become 0 (the additive identity)
// before the full-vector sum() reduction.
template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T>::EntryType Vector<T>::sum(MaskArg m) const
{
    Vector<T> tmp(VectorSpecialInitializerZero::Zero);
    tmp(m) = *this;
    return tmp.sum();
}
1299 | ||
1300 | /////////////////////////////////////////////////////////////////////////////////////////// | |
1301 | // copySign {{{1 | |
// copySign: combine the sign bit of `reference` with the magnitude bits of
// *this, using precomputed sign/abs bit masks.
template<> Vc_INTRINSIC Vc_PURE Vector<float> Vector<float>::copySign(Vector<float>::AsArg reference) const
{
    return _mm_or_ps(
            _mm_and_ps(reference.d.v(), _mm_setsignmask_ps()),
            _mm_and_ps(d.v(), _mm_setabsmask_ps())
            );
}
// copySign for float8: apply the sign/magnitude combination independently to
// both __m128 halves of the M256.
template<> Vc_INTRINSIC Vc_PURE Vector<float8> Vector<float8>::copySign(Vector<float8>::AsArg reference) const
{
    return M256::create( _mm_or_ps(
                _mm_and_ps(reference.d.v()[0], _mm_setsignmask_ps()),
                _mm_and_ps(d.v()[0], _mm_setabsmask_ps())
                ), _mm_or_ps(
                _mm_and_ps(reference.d.v()[1], _mm_setsignmask_ps()),
                _mm_and_ps(d.v()[1], _mm_setabsmask_ps())
                )
            );
}
// copySign for double: same bit-mask combination with the double-precision
// sign/abs masks.
template<> Vc_INTRINSIC Vc_PURE Vector<double> Vector<double>::copySign(Vector<double>::AsArg reference) const
{
    return _mm_or_pd(
            _mm_and_pd(reference.d.v(), _mm_setsignmask_pd()),
            _mm_and_pd(d.v(), _mm_setabsmask_pd())
            );
}//}}}1
1327 | // exponent {{{1 | |
// exponent: extract the floating-point exponent of each entry.  Asserts all
// entries are non-negative, then delegates to Internal::exponent.
template<> Vc_INTRINSIC Vc_PURE Vector<float> Vector<float>::exponent() const
{
    VC_ASSERT((*this >= 0.f).isFull());
    return Internal::exponent(d.v());
}
// exponent for float8: same precondition and delegation; Internal::exponent
// receives the two-register M256 data.
template<> Vc_INTRINSIC Vc_PURE Vector<float8> Vector<float8>::exponent() const
{
    VC_ASSERT((*this >= 0.f).isFull());
    return Internal::exponent(d.v());
}
// exponent for double: same precondition (all entries >= 0) and delegation.
template<> Vc_INTRINSIC Vc_PURE Vector<double> Vector<double>::exponent() const
{
    VC_ASSERT((*this >= 0.).isFull());
    return Internal::exponent(d.v());
}
1343 | // }}}1 | |
1344 | // Random {{{1 | |
// Advance the global RNG state by one step and hand the *previous* state to
// the caller.  Loads two uint vectors from Vc::RandomState, advances both
// with an LCG (x * 0xdeece66d + 11), and stores them back; the first half is
// additionally mixed with (state1 >> 16) via XOR.  Store order matters:
// callers read state0/state1 as loaded here, before the update.
static void _doRandomStep(Vector<unsigned int> &state0,
        Vector<unsigned int> &state1)
{
    state0.load(&Vc::RandomState[0]);
    state1.load(&Vc::RandomState[uint_v::Size]);
    (state1 * 0xdeece66du + 11).store(&Vc::RandomState[uint_v::Size]);
    uint_v(_mm_xor_si128((state0 * 0xdeece66du + 11).data(), _mm_srli_epi32(state1.data(), 16))).store(&Vc::RandomState[0]);
}
1353 | ||
// Generic Random(): step the global RNG and reinterpret the raw 32-bit state
// bits as the target vector type (used for the integer Vector<T> types;
// float/double have dedicated specializations below).
template<typename T> Vc_ALWAYS_INLINE Vector<T> Vector<T>::Random()
{
    Vector<unsigned int> state0, state1;
    _doRandomStep(state0, state1);
    return state0.reinterpretCast<Vector<T> >();
}
1360 | ||
// float Random(): build a float in [1, 2) by OR-ing random mantissa bits
// (state0 >> 2) with the bit pattern of 1.0f, then subtract 1.0f to map the
// result into [0, 1).
template<> Vc_ALWAYS_INLINE Vector<float> Vector<float>::Random()
{
    Vector<unsigned int> state0, state1;
    _doRandomStep(state0, state1);
    return _mm_sub_ps(_mm_or_ps(_mm_castsi128_ps(_mm_srli_epi32(state0.data(), 2)), HT::one()), HT::one());
}
1367 | ||
// float8 Random(): needs eight random floats, so both state vectors are
// consumed; state1 is first mixed with (state0 >> 16) so the two halves do
// not correlate.  Each half uses the same [1,2) - 1 construction as the
// float specialization.
template<> Vc_ALWAYS_INLINE Vector<float8> Vector<float8>::Random()
{
    Vector<unsigned int> state0, state1;
    _doRandomStep(state0, state1);
    state1 ^= state0 >> 16;
    return M256::create(
            _mm_sub_ps(_mm_or_ps(_mm_castsi128_ps(_mm_srli_epi32(state0.data(), 2)), VectorHelper<float>::one()), VectorHelper<float>::one()),
            _mm_sub_ps(_mm_or_ps(_mm_castsi128_ps(_mm_srli_epi32(state1.data(), 2)), VectorHelper<float>::one()), VectorHelper<float>::one())
            );
}
1378 | ||
// double Random(): runs a 64-bit LCG (x * 0x5deece66d + 11) on two 64-bit
// words of RandomState (Vc_MAY_ALIAS keeps the uint64 views legal under
// strict aliasing).  The pre-update state, shifted right by 12, provides
// mantissa bits; OR with 1.0's bit pattern yields [1, 2), minus One() maps
// to [0, 1).
template<> Vc_ALWAYS_INLINE Vector<double> Vector<double>::Random()
{
    typedef unsigned long long uint64 Vc_MAY_ALIAS;
    uint64 state0 = *reinterpret_cast<const uint64 *>(&Vc::RandomState[8]);
    uint64 state1 = *reinterpret_cast<const uint64 *>(&Vc::RandomState[10]);
    const __m128i state = _mm_load_si128(reinterpret_cast<const __m128i *>(&Vc::RandomState[8]));
    *reinterpret_cast<uint64 *>(&Vc::RandomState[ 8]) = (state0 * 0x5deece66dull + 11);
    *reinterpret_cast<uint64 *>(&Vc::RandomState[10]) = (state1 * 0x5deece66dull + 11);
    return (Vector<double>(_mm_castsi128_pd(_mm_srli_epi64(state, 12))) | One()) - One();
}
c017a39f | 1389 | // shifted / rotated {{{1 |
// shifted: return a copy with entries moved by `amount` positions (positive
// amounts shift towards index 0); vacated lanes are zero-filled.  The
// _mm_srli/slli_si128 intrinsics need immediate byte counts, hence the
// switch with one case per supported amount, scaled by sizeof(EntryType).
// Amounts beyond the handled range return Zero().
template<typename T> Vc_INTRINSIC Vc_PURE Vector<T> Vector<T>::shifted(int amount) const
{
    enum {
        EntryTypeSizeof = sizeof(EntryType)
    };
    switch (amount) {
    case 0: return *this;
    case 1: return mm128_reinterpret_cast<VectorType>(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 1 * EntryTypeSizeof));
    case 2: return mm128_reinterpret_cast<VectorType>(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 2 * EntryTypeSizeof));
    case 3: return mm128_reinterpret_cast<VectorType>(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 3 * EntryTypeSizeof));
    case 4: return mm128_reinterpret_cast<VectorType>(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 4 * EntryTypeSizeof));
    case 5: return mm128_reinterpret_cast<VectorType>(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 5 * EntryTypeSizeof));
    case 6: return mm128_reinterpret_cast<VectorType>(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 6 * EntryTypeSizeof));
    case 7: return mm128_reinterpret_cast<VectorType>(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 7 * EntryTypeSizeof));
    case 8: return mm128_reinterpret_cast<VectorType>(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 8 * EntryTypeSizeof));
    case -1: return mm128_reinterpret_cast<VectorType>(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 1 * EntryTypeSizeof));
    case -2: return mm128_reinterpret_cast<VectorType>(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 2 * EntryTypeSizeof));
    case -3: return mm128_reinterpret_cast<VectorType>(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 3 * EntryTypeSizeof));
    case -4: return mm128_reinterpret_cast<VectorType>(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 4 * EntryTypeSizeof));
    case -5: return mm128_reinterpret_cast<VectorType>(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 5 * EntryTypeSizeof));
    case -6: return mm128_reinterpret_cast<VectorType>(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 6 * EntryTypeSizeof));
    case -7: return mm128_reinterpret_cast<VectorType>(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 7 * EntryTypeSizeof));
    case -8: return mm128_reinterpret_cast<VectorType>(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 8 * EntryTypeSizeof));
    }
    return Zero();
}
// sfloat_v (two-register float8) shifted: entries cross the half boundary,
// so each case stitches the two __m128 halves with slli/srli (zero fill at
// the ends) and mm_alignr_epi8 (concatenate-and-extract for the middle).
// |amount| == 4 is a pure half swap with one half zeroed.
template<> Vc_INTRINSIC Vc_PURE sfloat_v sfloat_v::shifted(int amount) const
{
    enum {
        EntryTypeSizeof = sizeof(EntryType)
    };
    switch (amount) {
    case -7: return M256::create(_mm_setzero_ps(), _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 3 * EntryTypeSizeof)));
    case -6: return M256::create(_mm_setzero_ps(), _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 2 * EntryTypeSizeof)));
    case -5: return M256::create(_mm_setzero_ps(), _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 1 * EntryTypeSizeof)));
    case -4: return M256::create(_mm_setzero_ps(), d.v()[0]);
    case -3: return M256::create(_mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 3 * EntryTypeSizeof)), _mm_castsi128_ps(mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 1 * EntryTypeSizeof)));
    case -2: return M256::create(_mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 2 * EntryTypeSizeof)), _mm_castsi128_ps(mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 2 * EntryTypeSizeof)));
    case -1: return M256::create(_mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 1 * EntryTypeSizeof)), _mm_castsi128_ps(mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 3 * EntryTypeSizeof)));
    case 0: return *this;
    case 1: return M256::create(_mm_castsi128_ps(mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 1 * EntryTypeSizeof)), _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 1 * EntryTypeSizeof)));
    case 2: return M256::create(_mm_castsi128_ps(mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 2 * EntryTypeSizeof)), _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 2 * EntryTypeSizeof)));
    case 3: return M256::create(_mm_castsi128_ps(mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 3 * EntryTypeSizeof)), _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 3 * EntryTypeSizeof)));
    case 4: return M256::create(d.v()[1], _mm_setzero_ps());
    case 5: return M256::create(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 1 * EntryTypeSizeof)), _mm_setzero_ps());
    case 6: return M256::create(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 2 * EntryTypeSizeof)), _mm_setzero_ps());
    case 7: return M256::create(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 3 * EntryTypeSizeof)), _mm_setzero_ps());
    }
    return Zero();
}
// rotated: cyclic shift of the entries by `amount` (reduced modulo Size).
// mm_alignr_epi8 on (v, v) performs the rotation; it needs immediate byte
// counts, hence the per-amount cases scaled by sizeof(EntryType).
template<typename T> Vc_INTRINSIC Vc_PURE Vector<T> Vector<T>::rotated(int amount) const
{
    enum {
        EntryTypeSizeof = sizeof(EntryType)
    };
    const __m128i v = mm128_reinterpret_cast<__m128i>(d.v());
    switch (static_cast<unsigned int>(amount) % Size) {
    case 0: return *this;
    case 1: return mm128_reinterpret_cast<VectorType>(mm_alignr_epi8(v, v, 1 * EntryTypeSizeof));
    case 2: return mm128_reinterpret_cast<VectorType>(mm_alignr_epi8(v, v, 2 * EntryTypeSizeof));
    case 3: return mm128_reinterpret_cast<VectorType>(mm_alignr_epi8(v, v, 3 * EntryTypeSizeof));
    // warning "Immediate parameter to intrinsic call too large" disabled in VcMacros.cmake.
    // ICC fails to see that the modulo operation (Size == sizeof(VectorType) / sizeof(EntryType))
    // disables the following four calls unless sizeof(EntryType) == 2.
    case 4: return mm128_reinterpret_cast<VectorType>(mm_alignr_epi8(v, v, 4 * EntryTypeSizeof));
    case 5: return mm128_reinterpret_cast<VectorType>(mm_alignr_epi8(v, v, 5 * EntryTypeSizeof));
    case 6: return mm128_reinterpret_cast<VectorType>(mm_alignr_epi8(v, v, 6 * EntryTypeSizeof));
    case 7: return mm128_reinterpret_cast<VectorType>(mm_alignr_epi8(v, v, 7 * EntryTypeSizeof));
    }
    return Zero();
}
// sfloat_v rotated: cyclic shift across the two __m128 halves.  Each result
// half is an alignr concatenation of the two input halves; rotation by 4
// simply swaps the halves, and amounts 5-7 reuse the 1-3 patterns with the
// halves exchanged.
template<> Vc_INTRINSIC Vc_PURE sfloat_v sfloat_v::rotated(int amount) const
{
    enum {
        EntryTypeSizeof = sizeof(EntryType)
    };
    const __m128i v0 = sse_cast<__m128i>(d.v()[0]);
    const __m128i v1 = sse_cast<__m128i>(d.v()[1]);
    switch (static_cast<unsigned int>(amount) % Size) {
    case 0: return *this;
    case 1: return M256::create(sse_cast<__m128>(mm_alignr_epi8(v1, v0, 1 * EntryTypeSizeof)), sse_cast<__m128>(mm_alignr_epi8(v0, v1, 1 * EntryTypeSizeof)));
    case 2: return M256::create(sse_cast<__m128>(mm_alignr_epi8(v1, v0, 2 * EntryTypeSizeof)), sse_cast<__m128>(mm_alignr_epi8(v0, v1, 2 * EntryTypeSizeof)));
    case 3: return M256::create(sse_cast<__m128>(mm_alignr_epi8(v1, v0, 3 * EntryTypeSizeof)), sse_cast<__m128>(mm_alignr_epi8(v0, v1, 3 * EntryTypeSizeof)));
    case 4: return M256::create(d.v()[1], d.v()[0]);
    case 5: return M256::create(sse_cast<__m128>(mm_alignr_epi8(v0, v1, 1 * EntryTypeSizeof)), sse_cast<__m128>(mm_alignr_epi8(v1, v0, 1 * EntryTypeSizeof)));
    case 6: return M256::create(sse_cast<__m128>(mm_alignr_epi8(v0, v1, 2 * EntryTypeSizeof)), sse_cast<__m128>(mm_alignr_epi8(v1, v0, 2 * EntryTypeSizeof)));
    case 7: return M256::create(sse_cast<__m128>(mm_alignr_epi8(v0, v1, 3 * EntryTypeSizeof)), sse_cast<__m128>(mm_alignr_epi8(v1, v0, 3 * EntryTypeSizeof)));
    }
    return Zero();
}
1480 | // }}}1 | |
1481 | // sorted specializations {{{1 | |
// sorted for uint_v: in-register sorting network built from unsigned
// min/max and unpack shuffles.  First pairs are sorted, then the pairs are
// merged (note _mm_unpackhi_epi32(h, l) deliberately swaps h/l to produce
// the descending sequence the merge step needs), and a final min/max +
// unpack pass produces the fully sorted ascending result.  Statement order
// is integral to the network — do not reorder.
template<> inline Vc_PURE uint_v uint_v::sorted() const
{
    __m128i x = data();
    __m128i y = _mm_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1));
    __m128i l = mm_min_epu32(x, y);
    __m128i h = mm_max_epu32(x, y);
    x = _mm_unpacklo_epi32(l, h);
    y = _mm_unpackhi_epi32(h, l);

    // sort quads
    l = mm_min_epu32(x, y);
    h = mm_max_epu32(x, y);
    x = _mm_unpacklo_epi32(l, h);
    y = _mm_unpackhi_epi64(x, x);

    l = mm_min_epu32(x, y);
    h = mm_max_epu32(x, y);
    return _mm_unpacklo_epi32(l, h);
}
// sorted for ushort_v: eight-element sorting network from unsigned 16-bit
// min/max, permutes, and blends.  Stages: sort adjacent pairs, merge pairs
// into sorted quads, then merge the two quads (one reversed via the
// shuffle/shufflelo pair) into the final ascending octet.  Statement order
// is integral to the network — do not reorder.
template<> inline Vc_PURE ushort_v ushort_v::sorted() const
{
    __m128i lo, hi, y, x = data();
    // sort pairs
    y = Mem::permute<X1, X0, X3, X2, X5, X4, X7, X6>(x);
    lo = mm_min_epu16(x, y);
    hi = mm_max_epu16(x, y);
    x = mm_blend_epi16(lo, hi, 0xaa);

    // merge left and right quads
    y = Mem::permute<X3, X2, X1, X0, X7, X6, X5, X4>(x);
    lo = mm_min_epu16(x, y);
    hi = mm_max_epu16(x, y);
    x = mm_blend_epi16(lo, hi, 0xcc);
    y = _mm_srli_si128(x, 2);
    lo = mm_min_epu16(x, y);
    hi = mm_max_epu16(x, y);
    x = mm_blend_epi16(lo, _mm_slli_si128(hi, 2), 0xaa);

    // merge quads into octs
    y = _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2));
    y = _mm_shufflelo_epi16(y, _MM_SHUFFLE(0, 1, 2, 3));
    lo = mm_min_epu16(x, y);
    hi = mm_max_epu16(x, y);

    x = _mm_unpacklo_epi16(lo, hi);
    y = _mm_srli_si128(x, 8);
    lo = mm_min_epu16(x, y);
    hi = mm_max_epu16(x, y);

    x = _mm_unpacklo_epi16(lo, hi);
    y = _mm_srli_si128(x, 8);
    lo = mm_min_epu16(x, y);
    hi = mm_max_epu16(x, y);

    return _mm_unpacklo_epi16(lo, hi);
}
f22341db | 1538 | // }}}1 |
1539 | } // namespace SSE | |
1540 | } // namespace Vc | |
c017a39f | 1541 | } // namespace AliRoot |
f22341db | 1542 | |
1543 | #include "undomacros.h" | |
1544 | ||
1545 | // vim: foldmethod=marker |