/*  This file is part of the Vc library.

    Copyright (C) 2010-2012 Matthias Kretz <kretz@kde.org>

    Vc is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as
    published by the Free Software Foundation, either version 3 of
    the License, or (at your option) any later version.

    Vc is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with Vc. If not, see <http://www.gnu.org/licenses/>.

*/

#include "limits.h"
#include "../common/bitscanintrinsics.h"
#include "macros.h"
namespace AliRoot {
namespace Vc
{
ALIGN(64) extern unsigned int RandomState[16];

namespace SSE
{
template<typename T, int Size> static Vc_ALWAYS_INLINE Vc_CONST const T *_IndexesFromZero() {
    if (Size == 4) {
        return reinterpret_cast<const T *>(_IndexesFromZero4);
    } else if (Size == 8) {
        return reinterpret_cast<const T *>(_IndexesFromZero8);
    } else if (Size == 16) {
        return reinterpret_cast<const T *>(_IndexesFromZero16);
    }
    return 0;
}
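// Size is a compile-time constant here, so the branch chain above collapses to
// a single constant table lookup. Hedged usage sketch (hypothetical client
// code):
//   int_v idx = int_v::IndexesFromZero();  // {0, 1, 2, 3}, loaded from _IndexesFromZero4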

///////////////////////////////////////////////////////////////////////////////////////////
// constants {{{1
template<typename T> Vc_INTRINSIC Vector<T>::Vector(VectorSpecialInitializerZero::ZEnum)
    : d(VectorHelper<VectorType>::zero())
{
}

template<typename T> Vc_INTRINSIC Vector<T>::Vector(VectorSpecialInitializerOne::OEnum)
    : d(VectorHelper<T>::one())
{
}

template<typename T> Vc_INTRINSIC Vector<T>::Vector(VectorSpecialInitializerIndexesFromZero::IEnum)
    : d(VectorHelper<VectorType>::load(_IndexesFromZero<EntryType, Size>(), Aligned))
{
}

template<typename T> Vc_INTRINSIC Vc_CONST Vector<T> Vector<T>::Zero()
{
    return VectorHelper<VectorType>::zero();
}

template<typename T> Vc_INTRINSIC Vc_CONST Vector<T> Vector<T>::One()
{
    return VectorHelper<T>::one();
}

template<typename T> Vc_INTRINSIC Vc_CONST Vector<T> Vector<T>::IndexesFromZero()
{
    return VectorHelper<VectorType>::load(_IndexesFromZero<EntryType, Size>(), Aligned);
}

// conversion/casts {{{1
template<typename T> template<typename OtherT> Vc_INTRINSIC Vector<T>::Vector(const Vector<OtherT> &x)
    : d(StaticCastHelper<OtherT, T>::cast(x.data()))
{
}

template<> template<> Vc_INTRINSIC short_v &Vector<short>::operator=(const ushort_v &x) {
    data() = StaticCastHelper<unsigned short, short>::cast(x.data()); return *this;
}
template<> template<> Vc_INTRINSIC ushort_v &Vector<unsigned short>::operator=(const short_v &x) {
    data() = StaticCastHelper<short, unsigned short>::cast(x.data()); return *this;
}
template<> template<> Vc_INTRINSIC int_v &Vector<int>::operator=(const uint_v &x) {
    data() = StaticCastHelper<unsigned int, int>::cast(x.data()); return *this;
}
template<> template<> Vc_INTRINSIC uint_v &Vector<unsigned int>::operator=(const int_v &x) {
    data() = StaticCastHelper<int, unsigned int>::cast(x.data()); return *this;
}

// broadcasts {{{1
template<typename T> Vc_INTRINSIC Vector<T>::Vector(EntryType a)
    : d(VectorHelper<T>::set(a))
{
}

///////////////////////////////////////////////////////////////////////////////////////////
// load ctors {{{1
template<typename T> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *x) { load(x); }
template<typename T> template<typename A> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *x, A a) { load(x, a); }
template<typename T> template<typename OtherT> Vc_ALWAYS_INLINE Vector<T>::Vector(const OtherT *x) { load(x); }
template<typename T> template<typename OtherT, typename A> Vc_ALWAYS_INLINE Vector<T>::Vector(const OtherT *x, A a) { load(x, a); }

///////////////////////////////////////////////////////////////////////////////////////////
// load member functions {{{1
template<typename T> Vc_INTRINSIC void Vector<T>::load(const EntryType *mem)
{
    load(mem, Aligned);
}

template<typename T> template<typename A> Vc_INTRINSIC void Vector<T>::load(const EntryType *mem, A align)
{
    d.v() = VectorHelper<VectorType>::load(mem, align);
}

template<typename T> template<typename OtherT> Vc_INTRINSIC void Vector<T>::load(const OtherT *mem)
{
    load(mem, Aligned);
}

// float8: simply use the float implementation twice {{{2
template<> template<typename OtherT, typename A> Vc_INTRINSIC void Vector<float8>::load(const OtherT *x, A a)
{
    d.v() = M256::create(
            Vector<float>(&x[0], a).data(),
            Vector<float>(&x[4], a).data()
            );
}
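// float8 (sfloat_v) is emulated as a pair of __m128 halves, so this load runs
// the plain float path on x[0..3] and x[4..7]. Hedged usage sketch
// (hypothetical client code):
//   float mem[8];
//   sfloat_v v(mem, Vc::Unaligned);  // dispatches to the specialization above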

// LoadHelper {{{2
template<typename DstT, typename SrcT, typename Flags> struct LoadHelper;

// float {{{2
template<typename Flags> struct LoadHelper<float, double, Flags> {
    static Vc_ALWAYS_INLINE Vc_PURE __m128 load(const double *mem, Flags f)
    {
        return _mm_movelh_ps(_mm_cvtpd_ps(VectorHelper<__m128d>::load(&mem[0], f)),
                             _mm_cvtpd_ps(VectorHelper<__m128d>::load(&mem[2], f)));
    }
};
template<typename Flags> struct LoadHelper<float, unsigned int, Flags> {
    static Vc_ALWAYS_INLINE Vc_PURE __m128 load(const unsigned int *mem, Flags f)
    {
        return StaticCastHelper<unsigned int, float>::cast(VectorHelper<__m128i>::load(mem, f));
    }
};
template<typename Flags> struct LoadHelper<float, int, Flags> {
    static Vc_ALWAYS_INLINE Vc_PURE __m128 load(const int *mem, Flags f)
    {
        return StaticCastHelper<int, float>::cast(VectorHelper<__m128i>::load(mem, f));
    }
};
template<typename Flags> struct LoadHelper<float, unsigned short, Flags> {
    static Vc_ALWAYS_INLINE Vc_PURE __m128 load(const unsigned short *mem, Flags f)
    {
        return _mm_cvtepi32_ps(LoadHelper<int, unsigned short, Flags>::load(mem, f));
    }
};
template<typename Flags> struct LoadHelper<float, short, Flags> {
    static Vc_ALWAYS_INLINE Vc_PURE __m128 load(const short *mem, Flags f)
    {
        return _mm_cvtepi32_ps(LoadHelper<int, short, Flags>::load(mem, f));
    }
};
template<typename Flags> struct LoadHelper<float, unsigned char, Flags> {
    static Vc_ALWAYS_INLINE Vc_PURE __m128 load(const unsigned char *mem, Flags f)
    {
        return _mm_cvtepi32_ps(LoadHelper<int, unsigned char, Flags>::load(mem, f));
    }
};
template<typename Flags> struct LoadHelper<float, signed char, Flags> {
    static Vc_ALWAYS_INLINE Vc_PURE __m128 load(const signed char *mem, Flags f)
    {
        return _mm_cvtepi32_ps(LoadHelper<int, signed char, Flags>::load(mem, f));
    }
};

// int {{{2
template<typename Flags> struct LoadHelper<int, unsigned int, Flags> {
    static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned int *mem, Flags f)
    {
        return VectorHelper<__m128i>::load(mem, f);
    }
};
// no difference between streaming and alignment, because the
// 32/64 bit loads are not available as streaming loads, and can always be unaligned
template<typename Flags> struct LoadHelper<int, unsigned short, Flags> {
    static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned short *mem, Flags)
    {
        return mm_cvtepu16_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
    }
};
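// Hedged note: the mm_cvtepu16_epi32/mm_cvtepi8_epi16-style calls used by these
// helpers are Vc-internal wrappers, presumably mapping to the SSE4.1
// _mm_cvtep*_epi* sign/zero-extension intrinsics when available, with
// unpack-based emulation otherwise. Only the 8 (or 4) bytes actually needed are
// read, via movq/movd-style loads, which is why the Flags argument is ignored.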
template<typename Flags> struct LoadHelper<int, short, Flags> {
    static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const short *mem, Flags)
    {
        return mm_cvtepi16_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
    }
};
template<typename Flags> struct LoadHelper<int, unsigned char, Flags> {
    static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned char *mem, Flags)
    {
        return mm_cvtepu8_epi32(_mm_cvtsi32_si128(*reinterpret_cast<const int *>(mem)));
    }
};
template<typename Flags> struct LoadHelper<int, signed char, Flags> {
    static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const signed char *mem, Flags)
    {
        return mm_cvtepi8_epi32(_mm_cvtsi32_si128(*reinterpret_cast<const int *>(mem)));
    }
};

// unsigned int {{{2
template<typename Flags> struct LoadHelper<unsigned int, unsigned short, Flags> {
    static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned short *mem, Flags)
    {
        return mm_cvtepu16_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
    }
};
template<typename Flags> struct LoadHelper<unsigned int, unsigned char, Flags> {
    static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned char *mem, Flags)
    {
        return mm_cvtepu8_epi32(_mm_cvtsi32_si128(*reinterpret_cast<const int *>(mem)));
    }
};

// short {{{2
template<typename Flags> struct LoadHelper<short, unsigned short, Flags> {
    static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned short *mem, Flags f)
    {
        return VectorHelper<__m128i>::load(mem, f);
    }
};
template<typename Flags> struct LoadHelper<short, unsigned char, Flags> {
    static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned char *mem, Flags)
    {
        return mm_cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
    }
};
template<typename Flags> struct LoadHelper<short, signed char, Flags> {
    static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const signed char *mem, Flags)
    {
        return mm_cvtepi8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
    }
};

// unsigned short {{{2
template<typename Flags> struct LoadHelper<unsigned short, unsigned char, Flags> {
    static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned char *mem, Flags)
    {
        return mm_cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
    }
};

// general load, implemented via LoadHelper {{{2
template<typename DstT> template<typename SrcT, typename Flags> Vc_INTRINSIC void Vector<DstT>::load(const SrcT *x, Flags f)
{
    d.v() = LoadHelper<DstT, SrcT, Flags>::load(x, f);
}

///////////////////////////////////////////////////////////////////////////////////////////
// expand/combine {{{1
template<typename T> Vc_INTRINSIC Vector<T>::Vector(const Vector<typename CtorTypeHelper<T>::Type> *a)
    : d(VectorHelper<T>::concat(a[0].data(), a[1].data()))
{
}

template<typename T> inline void Vector<T>::expand(Vector<typename ExpandTypeHelper<T>::Type> *x) const
{
    if (Size == 8u) {
        x[0].data() = VectorHelper<T>::expand0(data());
        x[1].data() = VectorHelper<T>::expand1(data());
    }
}
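// Combine/expand convert between one 8-entry 16-bit vector and two 4-entry
// 32-bit vectors. Hedged usage sketch (hypothetical client code):
//   short_v s = ...;
//   int_v parts[2];
//   s.expand(parts);      // parts[0] = s[0..3], parts[1] = s[4..7]
//   short_v back(parts);  // the combining constructor above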

///////////////////////////////////////////////////////////////////////////////////////////
// zeroing {{{1
template<typename T> Vc_INTRINSIC void Vector<T>::setZero()
{
    data() = VectorHelper<VectorType>::zero();
}

template<typename T> Vc_INTRINSIC void Vector<T>::setZero(const Mask &k)
{
    data() = VectorHelper<VectorType>::andnot_(mm128_reinterpret_cast<VectorType>(k.data()), data());
}

template<> Vc_INTRINSIC void Vector<double>::setQnan()
{
    data() = _mm_setallone_pd();
}
template<> Vc_INTRINSIC void Vector<double>::setQnan(Mask::Argument k)
{
    data() = _mm_or_pd(data(), k.dataD());
}
template<> Vc_INTRINSIC void Vector<float>::setQnan()
{
    data() = _mm_setallone_ps();
}
template<> Vc_INTRINSIC void Vector<float>::setQnan(Mask::Argument k)
{
    data() = _mm_or_ps(data(), k.data());
}
template<> Vc_INTRINSIC void Vector<float8>::setQnan()
{
    d.v()[0] = _mm_setallone_ps();
    d.v()[1] = _mm_setallone_ps();
}
template<> Vc_INTRINSIC void Vector<float8>::setQnan(Mask::Argument k)
{
    d.v()[0] = _mm_or_ps(d.v()[0], k.data()[0]);
    d.v()[1] = _mm_or_ps(d.v()[1], k.data()[1]);
}

///////////////////////////////////////////////////////////////////////////////////////////
// stores {{{1
template<typename T> Vc_INTRINSIC void Vector<T>::store(EntryType *mem) const
{
    VectorHelper<VectorType>::store(mem, data(), Aligned);
}

template<typename T> Vc_INTRINSIC void Vector<T>::store(EntryType *mem, const Mask &mask) const
{
    VectorHelper<VectorType>::store(mem, data(), mm128_reinterpret_cast<VectorType>(mask.data()), Aligned);
}

template<typename T> template<typename A> Vc_INTRINSIC void Vector<T>::store(EntryType *mem, A align) const
{
    VectorHelper<VectorType>::store(mem, data(), align);
}

template<typename T> template<typename A> Vc_INTRINSIC void Vector<T>::store(EntryType *mem, const Mask &mask, A align) const
{
    HV::store(mem, data(), mm128_reinterpret_cast<VectorType>(mask.data()), align);
}

///////////////////////////////////////////////////////////////////////////////////////////
// division {{{1
template<typename T> Vc_INTRINSIC Vector<T> &WriteMaskedVector<T>::operator/=(const Vector<T> &x)
{
    return operator=(*vec / x);
}
template<> Vc_INTRINSIC int_v &WriteMaskedVector<int>::operator/=(const int_v &x)
{
    Vc_foreach_bit (int i, mask) {
        vec->d.m(i) /= x.d.m(i);
    }
    return *vec;
}
template<> Vc_INTRINSIC uint_v &WriteMaskedVector<unsigned int>::operator/=(const uint_v &x)
{
    Vc_foreach_bit (int i, mask) {
        vec->d.m(i) /= x.d.m(i);
    }
    return *vec;
}
template<> Vc_INTRINSIC short_v &WriteMaskedVector<short>::operator/=(const short_v &x)
{
    Vc_foreach_bit (int i, mask) {
        vec->d.m(i) /= x.d.m(i);
    }
    return *vec;
}
template<> Vc_INTRINSIC ushort_v &WriteMaskedVector<unsigned short>::operator/=(const ushort_v &x)
{
    Vc_foreach_bit (int i, mask) {
        vec->d.m(i) /= x.d.m(i);
    }
    return *vec;
}

template<typename T> inline Vector<T> &Vector<T>::operator/=(EntryType x)
{
    if (VectorTraits<T>::HasVectorDivision) {
        return operator/=(Vector<T>(x));
    }
    for_all_vector_entries(i,
            d.m(i) /= x;
            );
    return *this;
}

template<typename T> template<typename TT> Vc_INTRINSIC Vc_PURE VC_EXACT_TYPE(TT, typename DetermineEntryType<T>::Type, Vector<T>) Vector<T>::operator/(TT x) const
{
    if (VectorTraits<T>::HasVectorDivision) {
        return operator/(Vector<T>(x));
    }
    Vector<T> r;
    for_all_vector_entries(i,
            r.d.m(i) = d.m(i) / x;
            );
    return r;
}

template<typename T> inline Vector<T> &Vector<T>::operator/=(const Vector<T> &x)
{
    for_all_vector_entries(i,
            d.m(i) /= x.d.m(i);
            );
    return *this;
}

template<typename T> inline Vc_PURE Vector<T> Vector<T>::operator/(const Vector<T> &x) const
{
    Vector<T> r;
    for_all_vector_entries(i,
            r.d.m(i) = d.m(i) / x.d.m(i);
            );
    return r;
}

template<> inline Vector<short> &Vector<short>::operator/=(const Vector<short> &x)
{
    __m128 lo = _mm_cvtepi32_ps(VectorHelper<short>::expand0(d.v()));
    __m128 hi = _mm_cvtepi32_ps(VectorHelper<short>::expand1(d.v()));
    lo = _mm_div_ps(lo, _mm_cvtepi32_ps(VectorHelper<short>::expand0(x.d.v())));
    hi = _mm_div_ps(hi, _mm_cvtepi32_ps(VectorHelper<short>::expand1(x.d.v())));
    d.v() = _mm_packs_epi32(_mm_cvtps_epi32(lo), _mm_cvtps_epi32(hi));
    return *this;
}

template<> inline Vc_PURE Vector<short> Vector<short>::operator/(const Vector<short> &x) const
{
    __m128 lo = _mm_cvtepi32_ps(VectorHelper<short>::expand0(d.v()));
    __m128 hi = _mm_cvtepi32_ps(VectorHelper<short>::expand1(d.v()));
    lo = _mm_div_ps(lo, _mm_cvtepi32_ps(VectorHelper<short>::expand0(x.d.v())));
    hi = _mm_div_ps(hi, _mm_cvtepi32_ps(VectorHelper<short>::expand1(x.d.v())));
    return _mm_packs_epi32(_mm_cvtps_epi32(lo), _mm_cvtps_epi32(hi));
}
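// SSE has no packed 16-bit integer division, so the short/ushort operators
// widen each 4-entry half to 32 bit, convert to float (the 24-bit mantissa
// represents every 16-bit quotient exactly), divide with _mm_div_ps, and pack
// the rounded results back into one vector.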

template<> inline Vector<unsigned short> &Vector<unsigned short>::operator/=(const Vector<unsigned short> &x)
{
    __m128 lo = _mm_cvtepi32_ps(VectorHelper<short>::expand0(d.v()));
    __m128 hi = _mm_cvtepi32_ps(VectorHelper<short>::expand1(d.v()));
    lo = _mm_div_ps(lo, _mm_cvtepi32_ps(VectorHelper<short>::expand0(x.d.v())));
    hi = _mm_div_ps(hi, _mm_cvtepi32_ps(VectorHelper<short>::expand1(x.d.v())));
    d.v() = _mm_packs_epi32(_mm_cvtps_epi32(lo), _mm_cvtps_epi32(hi));
    return *this;
}

template<> Vc_ALWAYS_INLINE Vc_PURE Vector<unsigned short> Vector<unsigned short>::operator/(const Vector<unsigned short> &x) const
{
    __m128 lo = _mm_cvtepi32_ps(VectorHelper<short>::expand0(d.v()));
    __m128 hi = _mm_cvtepi32_ps(VectorHelper<short>::expand1(d.v()));
    lo = _mm_div_ps(lo, _mm_cvtepi32_ps(VectorHelper<short>::expand0(x.d.v())));
    hi = _mm_div_ps(hi, _mm_cvtepi32_ps(VectorHelper<short>::expand1(x.d.v())));
    return _mm_packs_epi32(_mm_cvtps_epi32(lo), _mm_cvtps_epi32(hi));
}

template<> Vc_ALWAYS_INLINE Vector<float> &Vector<float>::operator/=(const Vector<float> &x)
{
    d.v() = _mm_div_ps(d.v(), x.d.v());
    return *this;
}

template<> Vc_ALWAYS_INLINE Vc_PURE Vector<float> Vector<float>::operator/(const Vector<float> &x) const
{
    return _mm_div_ps(d.v(), x.d.v());
}

template<> Vc_ALWAYS_INLINE Vector<float8> &Vector<float8>::operator/=(const Vector<float8> &x)
{
    d.v()[0] = _mm_div_ps(d.v()[0], x.d.v()[0]);
    d.v()[1] = _mm_div_ps(d.v()[1], x.d.v()[1]);
    return *this;
}

template<> Vc_ALWAYS_INLINE Vc_PURE Vector<float8> Vector<float8>::operator/(const Vector<float8> &x) const
{
    Vector<float8> r;
    r.d.v()[0] = _mm_div_ps(d.v()[0], x.d.v()[0]);
    r.d.v()[1] = _mm_div_ps(d.v()[1], x.d.v()[1]);
    return r;
}

template<> Vc_ALWAYS_INLINE Vector<double> &Vector<double>::operator/=(const Vector<double> &x)
{
    d.v() = _mm_div_pd(d.v(), x.d.v());
    return *this;
}

template<> Vc_ALWAYS_INLINE Vc_PURE Vector<double> Vector<double>::operator/(const Vector<double> &x) const
{
    return _mm_div_pd(d.v(), x.d.v());
}

///////////////////////////////////////////////////////////////////////////////////////////
// operator- {{{1
template<> Vc_ALWAYS_INLINE Vector<double> Vc_PURE Vc_FLATTEN Vector<double>::operator-() const
{
    return _mm_xor_pd(d.v(), _mm_setsignmask_pd());
}
template<> Vc_ALWAYS_INLINE Vector<float> Vc_PURE Vc_FLATTEN Vector<float>::operator-() const
{
    return _mm_xor_ps(d.v(), _mm_setsignmask_ps());
}
template<> Vc_ALWAYS_INLINE Vector<float8> Vc_PURE Vc_FLATTEN Vector<float8>::operator-() const
{
    return M256::create(
            _mm_xor_ps(d.v()[0], _mm_setsignmask_ps()),
            _mm_xor_ps(d.v()[1], _mm_setsignmask_ps()));
}
template<> Vc_ALWAYS_INLINE Vector<int> Vc_PURE Vc_FLATTEN Vector<int>::operator-() const
{
#ifdef VC_IMPL_SSSE3
    return _mm_sign_epi32(d.v(), _mm_setallone_si128());
#else
    return _mm_add_epi32(_mm_xor_si128(d.v(), _mm_setallone_si128()), _mm_setone_epi32());
#endif
}
template<> Vc_ALWAYS_INLINE Vector<int> Vc_PURE Vc_FLATTEN Vector<unsigned int>::operator-() const
{
#ifdef VC_IMPL_SSSE3
    return _mm_sign_epi32(d.v(), _mm_setallone_si128());
#else
    return _mm_add_epi32(_mm_xor_si128(d.v(), _mm_setallone_si128()), _mm_setone_epi32());
#endif
}
template<> Vc_ALWAYS_INLINE Vector<short> Vc_PURE Vc_FLATTEN Vector<short>::operator-() const
{
#ifdef VC_IMPL_SSSE3
    return _mm_sign_epi16(d.v(), _mm_setallone_si128());
#else
    return _mm_mullo_epi16(d.v(), _mm_setallone_si128());
#endif
}
template<> Vc_ALWAYS_INLINE Vector<short> Vc_PURE Vc_FLATTEN Vector<unsigned short>::operator-() const
{
#ifdef VC_IMPL_SSSE3
    return _mm_sign_epi16(d.v(), _mm_setallone_si128());
#else
    return _mm_mullo_epi16(d.v(), _mm_setallone_si128());
#endif
}
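// Negation tricks used above: floats flip only the IEEE sign bit (xor with the
// sign mask). Without SSSE3's _mm_sign_* the 32-bit case computes the two's
// complement as (~x) + 1, and the 16-bit case multiplies by the all-ones
// pattern, which is -1 in every lane. Note that negating an unsigned vector
// deliberately yields the corresponding signed vector type.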

///////////////////////////////////////////////////////////////////////////////////////////
// integer ops {{{1
#define OP_IMPL(T, symbol, fun) \
template<> Vc_ALWAYS_INLINE Vector<T> &Vector<T>::operator symbol##=(const Vector<T> &x) \
{ \
    d.v() = VectorHelper<T>::fun(d.v(), x.d.v()); \
    return *this; \
} \
template<> Vc_ALWAYS_INLINE Vc_PURE Vector<T> Vector<T>::operator symbol(const Vector<T> &x) const \
{ \
    return VectorHelper<T>::fun(d.v(), x.d.v()); \
}
OP_IMPL(int, &, and_)
OP_IMPL(int, |, or_)
OP_IMPL(int, ^, xor_)
OP_IMPL(unsigned int, &, and_)
OP_IMPL(unsigned int, |, or_)
OP_IMPL(unsigned int, ^, xor_)
OP_IMPL(short, &, and_)
OP_IMPL(short, |, or_)
OP_IMPL(short, ^, xor_)
OP_IMPL(unsigned short, &, and_)
OP_IMPL(unsigned short, |, or_)
OP_IMPL(unsigned short, ^, xor_)
OP_IMPL(float, &, and_)
OP_IMPL(float, |, or_)
OP_IMPL(float, ^, xor_)
OP_IMPL(float8, &, and_)
OP_IMPL(float8, |, or_)
OP_IMPL(float8, ^, xor_)
OP_IMPL(double, &, and_)
OP_IMPL(double, |, or_)
OP_IMPL(double, ^, xor_)
#undef OP_IMPL
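// Note: the bitwise operators are instantiated for float, float8 and double as
// well; they operate on the raw IEEE bit pattern, which is occasionally useful
// for sign- and mask-manipulation tricks.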

#ifdef VC_IMPL_XOP
static Vc_INTRINSIC Vc_CONST __m128i shiftLeft (const int_v &value, const int_v &count) { return _mm_sha_epi32(value.data(), count.data()); }
static Vc_INTRINSIC Vc_CONST __m128i shiftLeft (const uint_v &value, const uint_v &count) { return _mm_shl_epi32(value.data(), count.data()); }
static Vc_INTRINSIC Vc_CONST __m128i shiftLeft (const short_v &value, const short_v &count) { return _mm_sha_epi16(value.data(), count.data()); }
static Vc_INTRINSIC Vc_CONST __m128i shiftLeft (const ushort_v &value, const ushort_v &count) { return _mm_shl_epi16(value.data(), count.data()); }
static Vc_INTRINSIC Vc_CONST __m128i shiftRight(const int_v &value, const int_v &count) { return shiftLeft(value, -count); }
static Vc_INTRINSIC Vc_CONST __m128i shiftRight(const uint_v &value, const uint_v &count) { return shiftLeft(value, uint_v(-count)); }
static Vc_INTRINSIC Vc_CONST __m128i shiftRight(const short_v &value, const short_v &count) { return shiftLeft(value, -count); }
static Vc_INTRINSIC Vc_CONST __m128i shiftRight(const ushort_v &value, const ushort_v &count) { return shiftLeft(value, ushort_v(-count)); }

#define _VC_OP(T, symbol, impl) \
template<> Vc_INTRINSIC T &T::operator symbol##=(T::AsArg shift) \
{ \
    d.v() = impl(*this, shift); \
    return *this; \
} \
template<> Vc_INTRINSIC Vc_PURE T T::operator symbol (T::AsArg shift) const \
{ \
    return impl(*this, shift); \
}
VC_APPLY_2(VC_LIST_INT_VECTOR_TYPES, _VC_OP, <<, shiftLeft)
VC_APPLY_2(VC_LIST_INT_VECTOR_TYPES, _VC_OP, >>, shiftRight)
#undef _VC_OP
#else
#if defined(VC_GCC) && VC_GCC == 0x40600 && defined(VC_IMPL_XOP)
#define VC_WORKAROUND __attribute__((optimize("no-tree-vectorize"),weak))
#else
#define VC_WORKAROUND Vc_INTRINSIC
#endif

#define OP_IMPL(T, symbol) \
template<> VC_WORKAROUND Vector<T> &Vector<T>::operator symbol##=(Vector<T>::AsArg x) \
{ \
    for_all_vector_entries(i, \
            d.m(i) symbol##= x.d.m(i); \
            ); \
    return *this; \
} \
template<> inline Vc_PURE Vector<T> Vector<T>::operator symbol(Vector<T>::AsArg x) const \
{ \
    Vector<T> r; \
    for_all_vector_entries(i, \
            r.d.m(i) = d.m(i) symbol x.d.m(i); \
            ); \
    return r; \
}
OP_IMPL(int, <<)
OP_IMPL(int, >>)
OP_IMPL(unsigned int, <<)
OP_IMPL(unsigned int, >>)
OP_IMPL(short, <<)
OP_IMPL(short, >>)
OP_IMPL(unsigned short, <<)
OP_IMPL(unsigned short, >>)
#undef OP_IMPL
#undef VC_WORKAROUND
#endif

template<typename T> Vc_ALWAYS_INLINE Vector<T> &Vector<T>::operator>>=(int shift) {
    d.v() = VectorHelper<T>::shiftRight(d.v(), shift);
    return *this;
}
template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T> Vector<T>::operator>>(int shift) const {
    return VectorHelper<T>::shiftRight(d.v(), shift);
}
template<typename T> Vc_ALWAYS_INLINE Vector<T> &Vector<T>::operator<<=(int shift) {
    d.v() = VectorHelper<T>::shiftLeft(d.v(), shift);
    return *this;
}
template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T> Vector<T>::operator<<(int shift) const {
    return VectorHelper<T>::shiftLeft(d.v(), shift);
}

///////////////////////////////////////////////////////////////////////////////////////////
// swizzles {{{1
template<typename T> Vc_INTRINSIC Vc_PURE const Vector<T> &Vector<T>::abcd() const { return *this; }
template<typename T> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::cdab() const { return Mem::permute<X2, X3, X0, X1>(data()); }
template<typename T> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::badc() const { return Mem::permute<X1, X0, X3, X2>(data()); }
template<typename T> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::aaaa() const { return Mem::permute<X0, X0, X0, X0>(data()); }
template<typename T> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::bbbb() const { return Mem::permute<X1, X1, X1, X1>(data()); }
template<typename T> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::cccc() const { return Mem::permute<X2, X2, X2, X2>(data()); }
template<typename T> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::dddd() const { return Mem::permute<X3, X3, X3, X3>(data()); }
template<typename T> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::bcad() const { return Mem::permute<X1, X2, X0, X3>(data()); }
template<typename T> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::bcda() const { return Mem::permute<X1, X2, X3, X0>(data()); }
template<typename T> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::dabc() const { return Mem::permute<X3, X0, X1, X2>(data()); }
template<typename T> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::acbd() const { return Mem::permute<X0, X2, X1, X3>(data()); }
template<typename T> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::dbca() const { return Mem::permute<X3, X1, X2, X0>(data()); }
template<typename T> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::dcba() const { return Mem::permute<X3, X2, X1, X0>(data()); }

template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector<sfloat>::cdab() const { return M256::create(Mem::permute<X2, X3, X0, X1>(d.v()[0]), Mem::permute<X2, X3, X0, X1>(d.v()[1])); }
template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector<sfloat>::badc() const { return M256::create(Mem::permute<X1, X0, X3, X2>(d.v()[0]), Mem::permute<X1, X0, X3, X2>(d.v()[1])); }
template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector<sfloat>::aaaa() const { return M256::create(Mem::permute<X0, X0, X0, X0>(d.v()[0]), Mem::permute<X0, X0, X0, X0>(d.v()[1])); }
template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector<sfloat>::bbbb() const { return M256::create(Mem::permute<X1, X1, X1, X1>(d.v()[0]), Mem::permute<X1, X1, X1, X1>(d.v()[1])); }
template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector<sfloat>::cccc() const { return M256::create(Mem::permute<X2, X2, X2, X2>(d.v()[0]), Mem::permute<X2, X2, X2, X2>(d.v()[1])); }
template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector<sfloat>::dddd() const { return M256::create(Mem::permute<X3, X3, X3, X3>(d.v()[0]), Mem::permute<X3, X3, X3, X3>(d.v()[1])); }
template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector<sfloat>::bcad() const { return M256::create(Mem::permute<X1, X2, X0, X3>(d.v()[0]), Mem::permute<X1, X2, X0, X3>(d.v()[1])); }
template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector<sfloat>::bcda() const { return M256::create(Mem::permute<X1, X2, X3, X0>(d.v()[0]), Mem::permute<X1, X2, X3, X0>(d.v()[1])); }
template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector<sfloat>::dabc() const { return M256::create(Mem::permute<X3, X0, X1, X2>(d.v()[0]), Mem::permute<X3, X0, X1, X2>(d.v()[1])); }
template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector<sfloat>::acbd() const { return M256::create(Mem::permute<X0, X2, X1, X3>(d.v()[0]), Mem::permute<X0, X2, X1, X3>(d.v()[1])); }
template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector<sfloat>::dbca() const { return M256::create(Mem::permute<X3, X1, X2, X0>(d.v()[0]), Mem::permute<X3, X1, X2, X0>(d.v()[1])); }
template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector<sfloat>::dcba() const { return M256::create(Mem::permute<X3, X2, X1, X0>(d.v()[0]), Mem::permute<X3, X2, X1, X0>(d.v()[1])); }

#define VC_SWIZZLES_16BIT_IMPL(T) \
template<> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::cdab() const { return Mem::permute<X2, X3, X0, X1, X6, X7, X4, X5>(data()); } \
template<> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::badc() const { return Mem::permute<X1, X0, X3, X2, X5, X4, X7, X6>(data()); } \
template<> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::aaaa() const { return Mem::permute<X0, X0, X0, X0, X4, X4, X4, X4>(data()); } \
template<> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::bbbb() const { return Mem::permute<X1, X1, X1, X1, X5, X5, X5, X5>(data()); } \
template<> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::cccc() const { return Mem::permute<X2, X2, X2, X2, X6, X6, X6, X6>(data()); } \
template<> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::dddd() const { return Mem::permute<X3, X3, X3, X3, X7, X7, X7, X7>(data()); } \
template<> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::bcad() const { return Mem::permute<X1, X2, X0, X3, X5, X6, X4, X7>(data()); } \
template<> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::bcda() const { return Mem::permute<X1, X2, X3, X0, X5, X6, X7, X4>(data()); } \
template<> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::dabc() const { return Mem::permute<X3, X0, X1, X2, X7, X4, X5, X6>(data()); } \
template<> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::acbd() const { return Mem::permute<X0, X2, X1, X3, X4, X6, X5, X7>(data()); } \
template<> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::dbca() const { return Mem::permute<X3, X1, X2, X0, X7, X5, X6, X4>(data()); } \
template<> Vc_INTRINSIC Vc_PURE const Vector<T> Vector<T>::dcba() const { return Mem::permute<X3, X2, X1, X0, X7, X6, X5, X4>(data()); }
VC_SWIZZLES_16BIT_IMPL(short)
VC_SWIZZLES_16BIT_IMPL(unsigned short)
#undef VC_SWIZZLES_16BIT_IMPL

// operators {{{1
#include "../common/operators.h"
// isNegative {{{1
template<> Vc_INTRINSIC Vc_PURE float_m float_v::isNegative() const
{
    return sse_cast<__m128>(_mm_srai_epi32(sse_cast<__m128i>(_mm_and_ps(_mm_setsignmask_ps(), d.v())), 31));
}
template<> Vc_INTRINSIC Vc_PURE sfloat_m sfloat_v::isNegative() const
{
    return M256::create(
            sse_cast<__m128>(_mm_srai_epi32(sse_cast<__m128i>(_mm_and_ps(_mm_setsignmask_ps(), d.v()[0])), 31)),
            sse_cast<__m128>(_mm_srai_epi32(sse_cast<__m128i>(_mm_and_ps(_mm_setsignmask_ps(), d.v()[1])), 31))
            );
}
template<> Vc_INTRINSIC Vc_PURE double_m double_v::isNegative() const
{
    return Mem::permute<X1, X1, X3, X3>(sse_cast<__m128>(
        _mm_srai_epi32(sse_cast<__m128i>(_mm_and_pd(_mm_setsignmask_pd(), d.v())), 31)
        ));
}
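// Sign extraction above: mask out everything but the IEEE sign bit, then an
// arithmetic right shift by 31 smears it across the 32-bit lane, yielding
// all-ones (true) for negative entries and zero otherwise. For double the sign
// bit sits in the upper 32-bit half of each 64-bit lane, hence the X1/X3
// permute that copies the shifted halves over the full lanes.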
// gathers {{{1
template<typename T> template<typename IndexT> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *mem, const IndexT *indexes)
{
    gather(mem, indexes);
}
template<typename T> template<typename IndexT> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector<IndexT>) indexes)
{
    gather(mem, indexes);
}

template<typename T> template<typename IndexT> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *mem, const IndexT *indexes, MaskArg mask)
    : d(HT::zero())
{
    gather(mem, indexes, mask);
}

template<typename T> template<typename IndexT> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector<IndexT>) indexes, MaskArg mask)
    : d(HT::zero())
{
    gather(mem, indexes, mask);
}

template<typename T> template<typename S1, typename IT> Vc_ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
{
    gather(array, member1, indexes);
}
template<typename T> template<typename S1, typename IT> Vc_ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask)
    : d(HT::zero())
{
    gather(array, member1, indexes, mask);
}
template<typename T> template<typename S1, typename S2, typename IT> Vc_ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
{
    gather(array, member1, member2, indexes);
}
template<typename T> template<typename S1, typename S2, typename IT> Vc_ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask)
    : d(HT::zero())
{
    gather(array, member1, member2, indexes, mask);
}
template<typename T> template<typename S1, typename IT1, typename IT2> Vc_ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
{
    gather(array, ptrMember1, outerIndexes, innerIndexes);
}
template<typename T> template<typename S1, typename IT1, typename IT2> Vc_ALWAYS_INLINE Vector<T>::Vector(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask)
    : d(HT::zero())
{
    gather(array, ptrMember1, outerIndexes, innerIndexes, mask);
}

template<typename T, size_t Size> struct IndexSizeChecker { static void check() {} };
template<typename T, size_t Size> struct IndexSizeChecker<Vector<T>, Size>
{
    static void check() {
        VC_STATIC_ASSERT(Vector<T>::Size >= Size, IndexVector_must_have_greater_or_equal_number_of_entries);
    }
};
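// Hedged usage sketch for the gather constructors above (hypothetical client
// code, assuming a struct S with a float member x):
//   S data[100];
//   uint_v idx = ...;              // one index per vector entry
//   float_v v(data, &S::x, idx);   // v[i] = data[idx[i]].x
// IndexSizeChecker statically rejects index vectors that have fewer entries
// than the vector being filled.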
template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<double>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
{
    IndexSizeChecker<Index, Size>::check();
    d.v() = _mm_setr_pd(mem[indexes[0]], mem[indexes[1]]);
}
template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<float>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
{
    IndexSizeChecker<Index, Size>::check();
    d.v() = _mm_setr_ps(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]);
}
template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<float8>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
{
    IndexSizeChecker<Index, Size>::check();
    d.v()[0] = _mm_setr_ps(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]);
    d.v()[1] = _mm_setr_ps(mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
}
template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<int>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
{
    IndexSizeChecker<Index, Size>::check();
    d.v() = _mm_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]);
}
template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned int>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
{
    IndexSizeChecker<Index, Size>::check();
    d.v() = _mm_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]);
}
template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<short>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
{
    IndexSizeChecker<Index, Size>::check();
    d.v() = _mm_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]],
            mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
}
template<> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned short>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes)
{
    IndexSizeChecker<Index, Size>::check();
    d.v() = _mm_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]],
            mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]);
}

#ifdef VC_USE_SET_GATHERS
template<typename T> template<typename IT> Vc_ALWAYS_INLINE void Vector<T>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector<IT>) indexes, MaskArg mask)
{
    IndexSizeChecker<Vector<IT>, Size>::check();
    Vector<IT> indexesTmp = indexes;
    indexesTmp.setZero(!static_cast<typename Vector<IT>::Mask>(mask));
    (*this)(mask) = Vector<T>(mem, indexesTmp);
}
#endif

#ifdef VC_USE_BSF_GATHERS
#define VC_MASKED_GATHER \
    int bits = mask.toInt(); \
    while (bits) { \
        const int i = _bit_scan_forward(bits); \
        bits &= ~(1 << i); /* btr? */ \
        d.m(i) = ith_value(i); \
    }
#elif defined(VC_USE_POPCNT_BSF_GATHERS)
#define VC_MASKED_GATHER \
    unsigned int bits = mask.toInt(); \
    unsigned int low, high = 0; \
    switch (mask.count()) { \
    case 8: \
        high = _bit_scan_reverse(bits); \
        d.m(high) = ith_value(high); \
        high = (1 << high); \
    case 7: \
        low = _bit_scan_forward(bits); \
        bits ^= high | (1 << low); \
        d.m(low) = ith_value(low); \
    case 6: \
        high = _bit_scan_reverse(bits); \
        d.m(high) = ith_value(high); \
        high = (1 << high); \
    case 5: \
        low = _bit_scan_forward(bits); \
        bits ^= high | (1 << low); \
        d.m(low) = ith_value(low); \
    case 4: \
        high = _bit_scan_reverse(bits); \
        d.m(high) = ith_value(high); \
        high = (1 << high); \
    case 3: \
        low = _bit_scan_forward(bits); \
        bits ^= high | (1 << low); \
        d.m(low) = ith_value(low); \
    case 2: \
        high = _bit_scan_reverse(bits); \
        d.m(high) = ith_value(high); \
    case 1: \
        low = _bit_scan_forward(bits); \
        d.m(low) = ith_value(low); \
    case 0: \
        break; \
    }
#else
#define VC_MASKED_GATHER \
    if (mask.isEmpty()) { \
        return; \
    } \
    for_all_vector_entries(i, \
            if (mask[i]) d.m(i) = ith_value(i); \
            );
#endif
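// The VC_USE_POPCNT_BSF_GATHERS variant dispatches on the popcount of the mask
// and falls through the switch deliberately (Duff's-device style): each case
// consumes one set bit, alternating between the highest (_bit_scan_reverse)
// and lowest (_bit_scan_forward) remaining bit until all are handled.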

template<typename T> template<typename Index>
Vc_INTRINSIC void Vector<T>::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes, MaskArg mask)
{
    IndexSizeChecker<Index, Size>::check();
#define ith_value(_i_) (mem[indexes[_i_]])
    VC_MASKED_GATHER
#undef ith_value
}

template<> template<typename S1, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<double>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm_setr_pd(array[indexes[0]].*(member1), array[indexes[1]].*(member1));
}
template<> template<typename S1, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<float>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm_setr_ps(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
            array[indexes[3]].*(member1));
}
template<> template<typename S1, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<float8>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v()[0] = _mm_setr_ps(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
            array[indexes[3]].*(member1));
    d.v()[1] = _mm_setr_ps(array[indexes[4]].*(member1), array[indexes[5]].*(member1), array[indexes[6]].*(member1),
            array[indexes[7]].*(member1));
}
template<> template<typename S1, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<int>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm_setr_epi32(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
            array[indexes[3]].*(member1));
}
template<> template<typename S1, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned int>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm_setr_epi32(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
            array[indexes[3]].*(member1));
}
template<> template<typename S1, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<short>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm_setr_epi16(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
            array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1),
            array[indexes[6]].*(member1), array[indexes[7]].*(member1));
}
template<> template<typename S1, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned short>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm_setr_epi16(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1),
            array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1),
            array[indexes[6]].*(member1), array[indexes[7]].*(member1));
}
template<typename T> template<typename S1, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask)
{
    IndexSizeChecker<IT, Size>::check();
#define ith_value(_i_) (array[indexes[_i_]].*(member1))
    VC_MASKED_GATHER
#undef ith_value
}
template<> template<typename S1, typename S2, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<double>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm_setr_pd(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2));
}
template<> template<typename S1, typename S2, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<float>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm_setr_ps(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
            array[indexes[3]].*(member1).*(member2));
}
template<> template<typename S1, typename S2, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<float8>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v()[0] = _mm_setr_ps(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2),
            array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2));
    d.v()[1] = _mm_setr_ps(array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
            array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
}
template<> template<typename S1, typename S2, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<int>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm_setr_epi32(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2),
            array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2));
}
template<> template<typename S1, typename S2, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned int>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm_setr_epi32(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2),
            array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2));
}
template<> template<typename S1, typename S2, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<short>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm_setr_epi16(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
            array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
            array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
}
template<> template<typename S1, typename S2, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned short>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes)
{
    IndexSizeChecker<IT, Size>::check();
    d.v() = _mm_setr_epi16(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2),
            array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2),
            array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2));
}
template<typename T> template<typename S1, typename S2, typename IT>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask)
{
    IndexSizeChecker<IT, Size>::check();
#define ith_value(_i_) (array[indexes[_i_]].*(member1).*(member2))
    VC_MASKED_GATHER
#undef ith_value
}
template<> template<typename S1, typename IT1, typename IT2>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<double>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
{
    IndexSizeChecker<IT1, Size>::check();
    IndexSizeChecker<IT2, Size>::check();
    d.v() = _mm_setr_pd((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]],
            (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]]);
}
template<> template<typename S1, typename IT1, typename IT2>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<float>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
{
    IndexSizeChecker<IT1, Size>::check();
    IndexSizeChecker<IT2, Size>::check();
    d.v() = _mm_setr_ps((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
            (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]);
}
template<> template<typename S1, typename IT1, typename IT2>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<float8>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
{
    IndexSizeChecker<IT1, Size>::check();
    IndexSizeChecker<IT2, Size>::check();
    d.v()[0] = _mm_setr_ps((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
            (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]);
    d.v()[1] = _mm_setr_ps((array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
            (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
}
template<> template<typename S1, typename IT1, typename IT2>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<int>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
{
    IndexSizeChecker<IT1, Size>::check();
    IndexSizeChecker<IT2, Size>::check();
    d.v() = _mm_setr_epi32((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
            (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]);
}
template<> template<typename S1, typename IT1, typename IT2>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned int>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
{
    IndexSizeChecker<IT1, Size>::check();
    IndexSizeChecker<IT2, Size>::check();
    d.v() = _mm_setr_epi32((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
            (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]);
}
template<> template<typename S1, typename IT1, typename IT2>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<short>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
{
    IndexSizeChecker<IT1, Size>::check();
    IndexSizeChecker<IT2, Size>::check();
    d.v() = _mm_setr_epi16((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
            (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]],
            (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
            (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
}
template<> template<typename S1, typename IT1, typename IT2>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<unsigned short>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes)
{
    IndexSizeChecker<IT1, Size>::check();
    IndexSizeChecker<IT2, Size>::check();
    d.v() = _mm_setr_epi16((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]],
            (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]],
            (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]],
            (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]);
}
template<typename T> template<typename S1, typename IT1, typename IT2>
Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask)
{
    IndexSizeChecker<IT1, Size>::check();
    IndexSizeChecker<IT2, Size>::check();
#define ith_value(_i_) (array[outerIndexes[_i_]].*(ptrMember1))[innerIndexes[_i_]]
    VC_MASKED_GATHER
#undef ith_value
}
// scatters {{{1
#undef VC_MASKED_GATHER
#ifdef VC_USE_BSF_SCATTERS
#define VC_MASKED_SCATTER \
    int bits = mask.toInt(); \
    while (bits) { \
        const int i = _bit_scan_forward(bits); \
        bits ^= (1 << i); /* btr? */ \
        ith_value(i) = d.m(i); \
    }
#elif defined(VC_USE_POPCNT_BSF_SCATTERS)
#define VC_MASKED_SCATTER \
    unsigned int bits = mask.toInt(); \
    unsigned int low, high = 0; \
    switch (mask.count()) { \
    case 8: \
        high = _bit_scan_reverse(bits); \
        ith_value(high) = d.m(high); \
        high = (1 << high); \
    case 7: \
        low = _bit_scan_forward(bits); \
        bits ^= high | (1 << low); \
        ith_value(low) = d.m(low); \
    case 6: \
        high = _bit_scan_reverse(bits); \
        ith_value(high) = d.m(high); \
        high = (1 << high); \
    case 5: \
        low = _bit_scan_forward(bits); \
        bits ^= high | (1 << low); \
        ith_value(low) = d.m(low); \
    case 4: \
        high = _bit_scan_reverse(bits); \
        ith_value(high) = d.m(high); \
        high = (1 << high); \
    case 3: \
        low = _bit_scan_forward(bits); \
        bits ^= high | (1 << low); \
        ith_value(low) = d.m(low); \
    case 2: \
        high = _bit_scan_reverse(bits); \
        ith_value(high) = d.m(high); \
    case 1: \
        low = _bit_scan_forward(bits); \
        ith_value(low) = d.m(low); \
    case 0: \
        break; \
    }
#else
#define VC_MASKED_SCATTER \
    if (mask.isEmpty()) { \
        return; \
    } \
    for_all_vector_entries(i, \
            if (mask[i]) ith_value(i) = d.m(i); \
            );
#endif

template<typename T> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) const
{
    for_all_vector_entries(i,
            mem[indexes[i]] = d.m(i);
            );
}
template<typename T> template<typename Index> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes, MaskArg mask) const
{
#define ith_value(_i_) mem[indexes[_i_]]
    VC_MASKED_SCATTER
#undef ith_value
}
template<typename T> template<typename S1, typename IT> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(S1 *array, EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) const
{
    for_all_vector_entries(i,
            array[indexes[i]].*(member1) = d.m(i);
            );
}
template<typename T> template<typename S1, typename IT> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(S1 *array, EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) const
{
#define ith_value(_i_) array[indexes[_i_]].*(member1)
    VC_MASKED_SCATTER
#undef ith_value
}
template<typename T> template<typename S1, typename S2, typename IT> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) const
{
    for_all_vector_entries(i,
            array[indexes[i]].*(member1).*(member2) = d.m(i);
            );
}
template<typename T> template<typename S1, typename S2, typename IT> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) const
{
#define ith_value(_i_) array[indexes[_i_]].*(member1).*(member2)
    VC_MASKED_SCATTER
#undef ith_value
}
template<typename T> template<typename S1, typename IT1, typename IT2> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(S1 *array, EntryType *S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) const
{
    for_all_vector_entries(i,
            (array[outerIndexes[i]].*(ptrMember1))[innerIndexes[i]] = d.m(i);
            );
}
template<typename T> template<typename S1, typename IT1, typename IT2> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector<T>::scatter(S1 *array, EntryType *S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask) const
{
#define ith_value(_i_) (array[outerIndexes[_i_]].*(ptrMember1))[innerIndexes[_i_]]
    VC_MASKED_SCATTER
#undef ith_value
}
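// In the two overloads above, entry i is addressed as
// (array[outerIndexes[i]].*ptrMember1)[innerIndexes[i]]: the outer index
// selects the struct in the array, the pointer-to-member selects an array
// member inside it, and the inner index selects the element of that array.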

///////////////////////////////////////////////////////////////////////////////////////////
// operator[] {{{1
template<typename T> Vc_INTRINSIC typename Vector<T>::EntryType Vc_PURE Vector<T>::operator[](size_t index) const
{
    return d.m(index);
}
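// The GCC-only specializations below use __builtin_constant_p: when the
// index is a compile-time constant the entry can be extracted directly from
// the register (movq / pextr*) instead of taking the generic path through
// the in-memory image of the vector.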
#ifdef VC_GCC
template<> Vc_INTRINSIC double Vc_PURE Vector<double>::operator[](size_t index) const
{
    if (__builtin_constant_p(index)) {
        return extract_double_imm(d.v(), index);
    }
    return d.m(index);
}
template<> Vc_INTRINSIC float Vc_PURE Vector<float>::operator[](size_t index) const
{
    return extract_float(d.v(), index);
}
template<> Vc_INTRINSIC float Vc_PURE Vector<float8>::operator[](size_t index) const
{
    if (__builtin_constant_p(index)) {
        if (index < 4) {
            return extract_float_imm(d.v()[0], index);
        }
        return extract_float_imm(d.v()[1], index - 4);
    }
    return d.m(index);
}
template<> Vc_INTRINSIC int Vc_PURE Vector<int>::operator[](size_t index) const
{
    if (__builtin_constant_p(index)) {
#if VC_GCC >= 0x40601 || !defined(VC_USE_VEX_CODING) // GCC < 4.6.1 incorrectly uses vmovq instead of movq for the following
#ifdef __x86_64__
        if (index == 0) return _mm_cvtsi128_si64(d.v()) & 0xFFFFFFFFull;
        if (index == 1) return _mm_cvtsi128_si64(d.v()) >> 32;
#else
        if (index == 0) return _mm_cvtsi128_si32(d.v());
#endif
#endif
#ifdef VC_IMPL_SSE4_1
        return _mm_extract_epi32(d.v(), index);
#else
        return _mm_cvtsi128_si32(_mm_srli_si128(d.v(), index * 4));
#endif
    }
    return d.m(index);
}
template<> Vc_INTRINSIC unsigned int Vc_PURE Vector<unsigned int>::operator[](size_t index) const
{
    if (__builtin_constant_p(index)) {
#if VC_GCC >= 0x40601 || !defined(VC_USE_VEX_CODING) // GCC < 4.6.1 incorrectly uses vmovq instead of movq for the following
#ifdef __x86_64__
        if (index == 0) return _mm_cvtsi128_si64(d.v()) & 0xFFFFFFFFull;
        if (index == 1) return _mm_cvtsi128_si64(d.v()) >> 32;
#else
        if (index == 0) return _mm_cvtsi128_si32(d.v());
#endif
#endif
#ifdef VC_IMPL_SSE4_1
        return _mm_extract_epi32(d.v(), index);
#else
        return _mm_cvtsi128_si32(_mm_srli_si128(d.v(), index * 4));
#endif
    }
    return d.m(index);
}
template<> Vc_INTRINSIC short Vc_PURE Vector<short>::operator[](size_t index) const
{
    if (__builtin_constant_p(index)) {
        return _mm_extract_epi16(d.v(), index);
    }
    return d.m(index);
}
template<> Vc_INTRINSIC unsigned short Vc_PURE Vector<unsigned short>::operator[](size_t index) const
{
    if (__builtin_constant_p(index)) {
        return _mm_extract_epi16(d.v(), index);
    }
    return d.m(index);
}
#endif // GCC
///////////////////////////////////////////////////////////////////////////////////////////
// horizontal ops {{{1
#ifndef VC_IMPL_SSE4_1
// Without SSE4.1, vector integer multiplication is slow, so we rather multiply the scalars.
template<> Vc_INTRINSIC Vc_PURE int Vector<int>::product() const
{
    return (d.m(0) * d.m(1)) * (d.m(2) * d.m(3));
}
template<> Vc_INTRINSIC Vc_PURE unsigned int Vector<unsigned int>::product() const
{
    return (d.m(0) * d.m(1)) * (d.m(2) * d.m(3));
}
#endif
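// The masked reductions below share one trick: start from the identity
// element of the operation (numeric max for min, numeric min for max, one
// for product, zero for sum), overwrite the selected lanes with *this, and
// run the unmasked reduction. Deselected lanes then cannot affect the result.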
template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T>::EntryType Vector<T>::min(MaskArg m) const
{
    Vector<T> tmp = std::numeric_limits<Vector<T> >::max();
    tmp(m) = *this;
    return tmp.min();
}
template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T>::EntryType Vector<T>::max(MaskArg m) const
{
    Vector<T> tmp = std::numeric_limits<Vector<T> >::min();
    tmp(m) = *this;
    return tmp.max();
}
template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T>::EntryType Vector<T>::product(MaskArg m) const
{
    Vector<T> tmp(VectorSpecialInitializerOne::One);
    tmp(m) = *this;
    return tmp.product();
}
template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T>::EntryType Vector<T>::sum(MaskArg m) const
{
    Vector<T> tmp(VectorSpecialInitializerZero::Zero);
    tmp(m) = *this;
    return tmp.sum();
}
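// Usage sketch (hypothetical values): for int_v v = (1, 2, 3, 4) and a mask
// selecting lanes 1 and 3, v.sum(mask) evaluates 0 + 2 + 0 + 4 == 6, because
// the deselected lanes were replaced by the additive identity first.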

///////////////////////////////////////////////////////////////////////////////////////////
// copySign {{{1
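// copySign combines the sign bit of 'reference' with the magnitude of *this:
// one AND extracts the sign bits, the other AND clears them from this
// vector's values, and the OR merges the two. E.g. applying it to (2, -2)
// with reference (-1, 1) yields (-2, 2).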
template<> Vc_INTRINSIC Vc_PURE Vector<float> Vector<float>::copySign(Vector<float>::AsArg reference) const
{
    return _mm_or_ps(
            _mm_and_ps(reference.d.v(), _mm_setsignmask_ps()),
            _mm_and_ps(d.v(), _mm_setabsmask_ps())
            );
}
template<> Vc_INTRINSIC Vc_PURE Vector<float8> Vector<float8>::copySign(Vector<float8>::AsArg reference) const
{
    return M256::create( _mm_or_ps(
                _mm_and_ps(reference.d.v()[0], _mm_setsignmask_ps()),
                _mm_and_ps(d.v()[0], _mm_setabsmask_ps())
                ), _mm_or_ps(
                _mm_and_ps(reference.d.v()[1], _mm_setsignmask_ps()),
                _mm_and_ps(d.v()[1], _mm_setabsmask_ps())
                )
            );
}
template<> Vc_INTRINSIC Vc_PURE Vector<double> Vector<double>::copySign(Vector<double>::AsArg reference) const
{
    return _mm_or_pd(
            _mm_and_pd(reference.d.v(), _mm_setsignmask_pd()),
            _mm_and_pd(d.v(), _mm_setabsmask_pd())
            );
}//}}}1
// exponent {{{1
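// exponent() extracts the unbiased binary exponent of each entry, i.e.
// floor(log2(x)) for positive x; the assertions below document that the
// entries must not be negative.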
template<> Vc_INTRINSIC Vc_PURE Vector<float> Vector<float>::exponent() const
{
    VC_ASSERT((*this >= 0.f).isFull());
    return Internal::exponent(d.v());
}
template<> Vc_INTRINSIC Vc_PURE Vector<float8> Vector<float8>::exponent() const
{
    VC_ASSERT((*this >= 0.f).isFull());
    return Internal::exponent(d.v());
}
template<> Vc_INTRINSIC Vc_PURE Vector<double> Vector<double>::exponent() const
{
    VC_ASSERT((*this >= 0.).isFull());
    return Internal::exponent(d.v());
}
// }}}1
// Random {{{1
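// Random() advances two linear congruential generators over the shared
// RandomState array; 0xdeece66d is the low 32 bits of the rand48 multiplier
// 0x5deece66d, with the matching increment 11. The high bits of one stream
// are mixed into the other before the result is stored back.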
static void _doRandomStep(Vector<unsigned int> &state0,
        Vector<unsigned int> &state1)
{
    state0.load(&Vc::RandomState[0]);
    state1.load(&Vc::RandomState[uint_v::Size]);
    (state1 * 0xdeece66du + 11).store(&Vc::RandomState[uint_v::Size]);
    uint_v(_mm_xor_si128((state0 * 0xdeece66du + 11).data(), _mm_srli_epi32(state1.data(), 16))).store(&Vc::RandomState[0]);
}

template<typename T> Vc_ALWAYS_INLINE Vector<T> Vector<T>::Random()
{
    Vector<unsigned int> state0, state1;
    _doRandomStep(state0, state1);
    return state0.reinterpretCast<Vector<T> >();
}

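// The floating-point specializations map random bits to [0, 1): after the
// shift the sign bit is zero, so OR-ing in the bit pattern of 1.0 pins the
// exponent (giving a value in [1, 2)) while the random bits fill the
// mantissa; the final subtraction of 1.0 moves the result to [0, 1).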
template<> Vc_ALWAYS_INLINE Vector<float> Vector<float>::Random()
{
    Vector<unsigned int> state0, state1;
    _doRandomStep(state0, state1);
    return _mm_sub_ps(_mm_or_ps(_mm_castsi128_ps(_mm_srli_epi32(state0.data(), 2)), HT::one()), HT::one());
}

template<> Vc_ALWAYS_INLINE Vector<float8> Vector<float8>::Random()
{
    Vector<unsigned int> state0, state1;
    _doRandomStep(state0, state1);
    state1 ^= state0 >> 16;
    return M256::create(
            _mm_sub_ps(_mm_or_ps(_mm_castsi128_ps(_mm_srli_epi32(state0.data(), 2)), VectorHelper<float>::one()), VectorHelper<float>::one()),
            _mm_sub_ps(_mm_or_ps(_mm_castsi128_ps(_mm_srli_epi32(state1.data(), 2)), VectorHelper<float>::one()), VectorHelper<float>::one())
            );
}

template<> Vc_ALWAYS_INLINE Vector<double> Vector<double>::Random()
{
    typedef unsigned long long uint64 Vc_MAY_ALIAS;
    uint64 state0 = *reinterpret_cast<const uint64 *>(&Vc::RandomState[8]);
    uint64 state1 = *reinterpret_cast<const uint64 *>(&Vc::RandomState[10]);
    const __m128i state = _mm_load_si128(reinterpret_cast<const __m128i *>(&Vc::RandomState[8]));
    *reinterpret_cast<uint64 *>(&Vc::RandomState[ 8]) = (state0 * 0x5deece66dull + 11);
    *reinterpret_cast<uint64 *>(&Vc::RandomState[10]) = (state1 * 0x5deece66dull + 11);
    return (Vector<double>(_mm_castsi128_pd(_mm_srli_epi64(state, 12))) | One()) - One();
}
// shifted / rotated {{{1
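// shifted(amount) moves the entries by 'amount' positions towards index 0
// for positive amounts (and away from it for negative ones), filling the
// vacated lanes with zeros; the byte-wise shift intrinsics below therefore
// scale the amount by sizeof(EntryType).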
template<typename T> Vc_INTRINSIC Vc_PURE Vector<T> Vector<T>::shifted(int amount) const
{
    enum {
        EntryTypeSizeof = sizeof(EntryType)
    };
    switch (amount) {
    case 0: return *this;
    case 1: return mm128_reinterpret_cast<VectorType>(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 1 * EntryTypeSizeof));
    case 2: return mm128_reinterpret_cast<VectorType>(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 2 * EntryTypeSizeof));
    case 3: return mm128_reinterpret_cast<VectorType>(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 3 * EntryTypeSizeof));
    case 4: return mm128_reinterpret_cast<VectorType>(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 4 * EntryTypeSizeof));
    case 5: return mm128_reinterpret_cast<VectorType>(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 5 * EntryTypeSizeof));
    case 6: return mm128_reinterpret_cast<VectorType>(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 6 * EntryTypeSizeof));
    case 7: return mm128_reinterpret_cast<VectorType>(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 7 * EntryTypeSizeof));
    case 8: return mm128_reinterpret_cast<VectorType>(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 8 * EntryTypeSizeof));
    case -1: return mm128_reinterpret_cast<VectorType>(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 1 * EntryTypeSizeof));
    case -2: return mm128_reinterpret_cast<VectorType>(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 2 * EntryTypeSizeof));
    case -3: return mm128_reinterpret_cast<VectorType>(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 3 * EntryTypeSizeof));
    case -4: return mm128_reinterpret_cast<VectorType>(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 4 * EntryTypeSizeof));
    case -5: return mm128_reinterpret_cast<VectorType>(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 5 * EntryTypeSizeof));
    case -6: return mm128_reinterpret_cast<VectorType>(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 6 * EntryTypeSizeof));
    case -7: return mm128_reinterpret_cast<VectorType>(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 7 * EntryTypeSizeof));
    case -8: return mm128_reinterpret_cast<VectorType>(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 8 * EntryTypeSizeof));
    }
    return Zero();
}
template<> Vc_INTRINSIC Vc_PURE sfloat_v sfloat_v::shifted(int amount) const
{
    enum {
        EntryTypeSizeof = sizeof(EntryType)
    };
    switch (amount) {
    case -7: return M256::create(_mm_setzero_ps(), _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 3 * EntryTypeSizeof)));
    case -6: return M256::create(_mm_setzero_ps(), _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 2 * EntryTypeSizeof)));
    case -5: return M256::create(_mm_setzero_ps(), _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 1 * EntryTypeSizeof)));
    case -4: return M256::create(_mm_setzero_ps(), d.v()[0]);
    case -3: return M256::create(_mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 3 * EntryTypeSizeof)), _mm_castsi128_ps(mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 1 * EntryTypeSizeof)));
    case -2: return M256::create(_mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 2 * EntryTypeSizeof)), _mm_castsi128_ps(mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 2 * EntryTypeSizeof)));
    case -1: return M256::create(_mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 1 * EntryTypeSizeof)), _mm_castsi128_ps(mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 3 * EntryTypeSizeof)));
    case 0: return *this;
    case 1: return M256::create(_mm_castsi128_ps(mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 1 * EntryTypeSizeof)), _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 1 * EntryTypeSizeof)));
    case 2: return M256::create(_mm_castsi128_ps(mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 2 * EntryTypeSizeof)), _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 2 * EntryTypeSizeof)));
    case 3: return M256::create(_mm_castsi128_ps(mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 3 * EntryTypeSizeof)), _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 3 * EntryTypeSizeof)));
    case 4: return M256::create(d.v()[1], _mm_setzero_ps());
    case 5: return M256::create(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 1 * EntryTypeSizeof)), _mm_setzero_ps());
    case 6: return M256::create(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 2 * EntryTypeSizeof)), _mm_setzero_ps());
    case 7: return M256::create(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 3 * EntryTypeSizeof)), _mm_setzero_ps());
    }
    return Zero();
}
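// rotated(amount) uses palignr, which concatenates the vector with itself
// and extracts 16 bytes at the given byte offset; this realizes a cyclic
// shift in which no entry is lost. The amount is reduced modulo Size first.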
template<typename T> Vc_INTRINSIC Vc_PURE Vector<T> Vector<T>::rotated(int amount) const
{
    enum {
        EntryTypeSizeof = sizeof(EntryType)
    };
    const __m128i v = mm128_reinterpret_cast<__m128i>(d.v());
    switch (static_cast<unsigned int>(amount) % Size) {
    case 0: return *this;
    case 1: return mm128_reinterpret_cast<VectorType>(mm_alignr_epi8(v, v, 1 * EntryTypeSizeof));
    case 2: return mm128_reinterpret_cast<VectorType>(mm_alignr_epi8(v, v, 2 * EntryTypeSizeof));
    case 3: return mm128_reinterpret_cast<VectorType>(mm_alignr_epi8(v, v, 3 * EntryTypeSizeof));
    // The warning "Immediate parameter to intrinsic call too large" is disabled in VcMacros.cmake:
    // ICC fails to see that the modulo operation above (with Size == sizeof(VectorType) / sizeof(EntryType))
    // makes the following four cases unreachable unless sizeof(EntryType) == 2.
    case 4: return mm128_reinterpret_cast<VectorType>(mm_alignr_epi8(v, v, 4 * EntryTypeSizeof));
    case 5: return mm128_reinterpret_cast<VectorType>(mm_alignr_epi8(v, v, 5 * EntryTypeSizeof));
    case 6: return mm128_reinterpret_cast<VectorType>(mm_alignr_epi8(v, v, 6 * EntryTypeSizeof));
    case 7: return mm128_reinterpret_cast<VectorType>(mm_alignr_epi8(v, v, 7 * EntryTypeSizeof));
    }
    return Zero();
}
template<> Vc_INTRINSIC Vc_PURE sfloat_v sfloat_v::rotated(int amount) const
{
    enum {
        EntryTypeSizeof = sizeof(EntryType)
    };
    const __m128i v0 = sse_cast<__m128i>(d.v()[0]);
    const __m128i v1 = sse_cast<__m128i>(d.v()[1]);
    switch (static_cast<unsigned int>(amount) % Size) {
    case 0: return *this;
    case 1: return M256::create(sse_cast<__m128>(mm_alignr_epi8(v1, v0, 1 * EntryTypeSizeof)), sse_cast<__m128>(mm_alignr_epi8(v0, v1, 1 * EntryTypeSizeof)));
    case 2: return M256::create(sse_cast<__m128>(mm_alignr_epi8(v1, v0, 2 * EntryTypeSizeof)), sse_cast<__m128>(mm_alignr_epi8(v0, v1, 2 * EntryTypeSizeof)));
    case 3: return M256::create(sse_cast<__m128>(mm_alignr_epi8(v1, v0, 3 * EntryTypeSizeof)), sse_cast<__m128>(mm_alignr_epi8(v0, v1, 3 * EntryTypeSizeof)));
    case 4: return M256::create(d.v()[1], d.v()[0]);
    case 5: return M256::create(sse_cast<__m128>(mm_alignr_epi8(v0, v1, 1 * EntryTypeSizeof)), sse_cast<__m128>(mm_alignr_epi8(v1, v0, 1 * EntryTypeSizeof)));
    case 6: return M256::create(sse_cast<__m128>(mm_alignr_epi8(v0, v1, 2 * EntryTypeSizeof)), sse_cast<__m128>(mm_alignr_epi8(v1, v0, 2 * EntryTypeSizeof)));
    case 7: return M256::create(sse_cast<__m128>(mm_alignr_epi8(v0, v1, 3 * EntryTypeSizeof)), sse_cast<__m128>(mm_alignr_epi8(v1, v0, 3 * EntryTypeSizeof)));
    }
    return Zero();
}
// }}}1
// sorted specializations {{{1
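// The specializations below are branch-free sorting networks built from
// vertical min/max and shuffles: adjacent pairs are sorted first, then
// merged into sorted quads, and finally into the fully sorted vector
// (an odd-even merge scheme).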
template<> inline Vc_PURE uint_v uint_v::sorted() const
{
    __m128i x = data();
    __m128i y = _mm_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1));
    __m128i l = mm_min_epu32(x, y);
    __m128i h = mm_max_epu32(x, y);
    x = _mm_unpacklo_epi32(l, h);
    y = _mm_unpackhi_epi32(h, l);

    // sort quads
    l = mm_min_epu32(x, y);
    h = mm_max_epu32(x, y);
    x = _mm_unpacklo_epi32(l, h);
    y = _mm_unpackhi_epi64(x, x);

    l = mm_min_epu32(x, y);
    h = mm_max_epu32(x, y);
    return _mm_unpacklo_epi32(l, h);
}
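// The eight-entry ushort network follows the same pattern with one more
// merge level; mm_blend_epi16 recombines the per-lane minima and maxima
// after each compare/exchange step.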
template<> inline Vc_PURE ushort_v ushort_v::sorted() const
{
    __m128i lo, hi, y, x = data();
    // sort pairs
    y = Mem::permute<X1, X0, X3, X2, X5, X4, X7, X6>(x);
    lo = mm_min_epu16(x, y);
    hi = mm_max_epu16(x, y);
    x = mm_blend_epi16(lo, hi, 0xaa);

    // merge left and right quads
    y = Mem::permute<X3, X2, X1, X0, X7, X6, X5, X4>(x);
    lo = mm_min_epu16(x, y);
    hi = mm_max_epu16(x, y);
    x = mm_blend_epi16(lo, hi, 0xcc);
    y = _mm_srli_si128(x, 2);
    lo = mm_min_epu16(x, y);
    hi = mm_max_epu16(x, y);
    x = mm_blend_epi16(lo, _mm_slli_si128(hi, 2), 0xaa);

    // merge quads into octs
    y = _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2));
    y = _mm_shufflelo_epi16(y, _MM_SHUFFLE(0, 1, 2, 3));
    lo = mm_min_epu16(x, y);
    hi = mm_max_epu16(x, y);

    x = _mm_unpacklo_epi16(lo, hi);
    y = _mm_srli_si128(x, 8);
    lo = mm_min_epu16(x, y);
    hi = mm_max_epu16(x, y);

    x = _mm_unpacklo_epi16(lo, hi);
    y = _mm_srli_si128(x, 8);
    lo = mm_min_epu16(x, y);
    hi = mm_max_epu16(x, y);

    return _mm_unpacklo_epi16(lo, hi);
}
// }}}1
} // namespace SSE
} // namespace Vc
} // namespace AliRoot

#include "undomacros.h"

// vim: foldmethod=marker