1 /* This file is part of the Vc library.
3 Copyright (C) 2009-2011 Matthias Kretz <kretz@kde.org>
5 Vc is free software: you can redistribute it and/or modify
6 it under the terms of the GNU Lesser General Public License as
7 published by the Free Software Foundation, either version 3 of
8 the License, or (at your option) any later version.
10 Vc is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with Vc. If not, see <http://www.gnu.org/licenses/>.
20 #ifndef SSE_VECTORHELPER_H
21 #define SSE_VECTORHELPER_H
// SortHelper: sorts the lanes of a SIMD vector; the definition is provided
// out-of-line (this file includes "vectorhelper.tcc" at the end).
// NOTE(review): this listing elides source lines (gaps in the embedded
// numbering), so braces and access specifiers are not visible in this view.
30 template<typename VectorType, unsigned int Size> struct SortHelper
32     static VectorType sort(VectorType) PURE;
// Partial specialization for the two-__m128 M256 ("float8") type; M256 is
// passed by const reference because it is larger than one SSE register.
34 template<unsigned int Size> struct SortHelper<M256, Size>
36     static M256 sort(const M256 &) PURE;
41 #undef PARENT_DATA_CONST
// OPn(name, code): declare a static, PURE member `name` taking n VectorType
// arguments (named a, b, c inside `code`) whose body is the given expression.
// Used by the register-type VectorHelper specializations directly below.
43 #define OP0(name, code) static inline VectorType name() PURE { return code; }
44 #define OP1(name, code) static inline VectorType name(const VectorType &a) PURE { return code; }
45 #define OP2(name, code) static inline VectorType name(const VectorType &a, const VectorType &b) PURE { return code; }
46 #define OP3(name, code) static inline VectorType name(const VectorType &a, const VectorType &b, const VectorType &c) PURE { return code; }
// VectorHelper<_M128>: load/store and bitwise primitives on a raw __m128
// (4 x float) register. The tag types (AlignedFlag, UnalignedFlag,
// StreamingAnd*Flag) select aligned / unaligned / non-temporal variants;
// the overloads taking an extra VectorType m are masked stores.
// NOTE(review): struct braces are elided in this listing (numbering gaps).
48 template<> struct VectorHelper<_M128>
50 typedef _M128 VectorType;
51 template<typename A> static VectorType load(const float *x, A) PURE;
52 static void store(float *mem, const VectorType x, AlignedFlag);
53 static void store(float *mem, const VectorType x, UnalignedFlag);
54 static void store(float *mem, const VectorType x, StreamingAndAlignedFlag);
55 static void store(float *mem, const VectorType x, StreamingAndUnalignedFlag);
56 static void store(float *mem, const VectorType x, const VectorType m, AlignedFlag);
57 static void store(float *mem, const VectorType x, const VectorType m, UnalignedFlag);
58 static void store(float *mem, const VectorType x, const VectorType m, StreamingAndAlignedFlag);
59 static void store(float *mem, const VectorType x, const VectorType m, StreamingAndUnalignedFlag);
// Bitwise ops in the float domain; blend maps to the SSE4.1 blendv --
// presumably a pre-4.1 fallback is provided elsewhere (not visible here).
61 OP0(allone, _mm_setallone_ps())
62 OP0(zero, _mm_setzero_ps())
63 OP2(or_, _mm_or_ps(a, b))
64 OP2(xor_, _mm_xor_ps(a, b))
65 OP2(and_, _mm_and_ps(a, b))
66 OP2(andnot_, _mm_andnot_ps(a, b))
67 OP3(blend, _mm_blendv_ps(a, b, c))
// VectorHelper<M256>: same interface as the _M128 specialization, but for the
// emulated 256-bit type built from two __m128 halves; every operation is
// applied to half [0] and half [1] independently. Arguments are passed by
// const reference since M256 exceeds a single register.
71 template<> struct VectorHelper<M256>
73 typedef M256 VectorType;
74 template<typename A> static VectorType load(const float *x, A) PURE;
75 static void store(float *mem, const VectorType &x, AlignedFlag);
76 static void store(float *mem, const VectorType &x, UnalignedFlag);
77 static void store(float *mem, const VectorType &x, StreamingAndAlignedFlag);
78 static void store(float *mem, const VectorType &x, StreamingAndUnalignedFlag);
79 static void store(float *mem, const VectorType &x, const VectorType &m, AlignedFlag);
80 static void store(float *mem, const VectorType &x, const VectorType &m, UnalignedFlag);
81 static void store(float *mem, const VectorType &x, const VectorType &m, StreamingAndAlignedFlag);
82 static void store(float *mem, const VectorType &x, const VectorType &m, StreamingAndUnalignedFlag);
// Per-half bitwise ops, recombined via VectorType::create(lo, hi).
84 OP0(allone, VectorType::create(_mm_setallone_ps(), _mm_setallone_ps()))
85 OP0(zero, VectorType::create(_mm_setzero_ps(), _mm_setzero_ps()))
86 OP2(or_, VectorType::create(_mm_or_ps(a[0], b[0]), _mm_or_ps(a[1], b[1])))
87 OP2(xor_, VectorType::create(_mm_xor_ps(a[0], b[0]), _mm_xor_ps(a[1], b[1])))
88 OP2(and_, VectorType::create(_mm_and_ps(a[0], b[0]), _mm_and_ps(a[1], b[1])))
89 OP2(andnot_, VectorType::create(_mm_andnot_ps(a[0], b[0]), _mm_andnot_ps(a[1], b[1])))
90 OP3(blend, VectorType::create(_mm_blendv_ps(a[0], b[0], c[0]), _mm_blendv_ps(a[1], b[1], c[1])))
// VectorHelper<_M128D>: the same load/store/bitwise interface for __m128d
// (2 x double), using the _pd (double-domain) intrinsics.
93 template<> struct VectorHelper<_M128D>
95 typedef _M128D VectorType;
96 template<typename A> static VectorType load(const double *x, A) PURE;
97 static void store(double *mem, const VectorType x, AlignedFlag);
98 static void store(double *mem, const VectorType x, UnalignedFlag);
99 static void store(double *mem, const VectorType x, StreamingAndAlignedFlag);
100 static void store(double *mem, const VectorType x, StreamingAndUnalignedFlag);
101 static void store(double *mem, const VectorType x, const VectorType m, AlignedFlag);
102 static void store(double *mem, const VectorType x, const VectorType m, UnalignedFlag);
103 static void store(double *mem, const VectorType x, const VectorType m, StreamingAndAlignedFlag);
104 static void store(double *mem, const VectorType x, const VectorType m, StreamingAndUnalignedFlag);
106 OP0(allone, _mm_setallone_pd())
107 OP0(zero, _mm_setzero_pd())
108 OP2(or_, _mm_or_pd(a, b))
109 OP2(xor_, _mm_xor_pd(a, b))
110 OP2(and_, _mm_and_pd(a, b))
111 OP2(andnot_, _mm_andnot_pd(a, b))
// _mm_blendv_pd is SSE4.1, like the float variant above.
112 OP3(blend, _mm_blendv_pd(a, b, c))
// VectorHelper<_M128I>: load/store/bitwise interface for the integer register
// __m128i. load/store are additionally templated on the scalar type T because
// one integer register serves several EntryTypes (int, short, ...).
115 template<> struct VectorHelper<_M128I>
117 typedef _M128I VectorType;
118 template<typename T> static VectorType load(const T *x, AlignedFlag) PURE;
119 template<typename T> static VectorType load(const T *x, UnalignedFlag) PURE;
120 template<typename T> static VectorType load(const T *x, StreamingAndAlignedFlag) PURE;
121 template<typename T> static VectorType load(const T *x, StreamingAndUnalignedFlag) PURE;
122 template<typename T> static void store(T *mem, const VectorType x, AlignedFlag);
123 template<typename T> static void store(T *mem, const VectorType x, UnalignedFlag);
124 template<typename T> static void store(T *mem, const VectorType x, StreamingAndAlignedFlag);
125 template<typename T> static void store(T *mem, const VectorType x, StreamingAndUnalignedFlag);
126 template<typename T> static void store(T *mem, const VectorType x, const VectorType m, AlignedFlag);
127 template<typename T> static void store(T *mem, const VectorType x, const VectorType m, UnalignedFlag);
128 template<typename T> static void store(T *mem, const VectorType x, const VectorType m, StreamingAndAlignedFlag);
129 template<typename T> static void store(T *mem, const VectorType x, const VectorType m, StreamingAndUnalignedFlag);
131 OP0(allone, _mm_setallone_si128())
132 OP0(zero, _mm_setzero_si128())
133 OP2(or_, _mm_or_si128(a, b))
134 OP2(xor_, _mm_xor_si128(a, b))
135 OP2(and_, _mm_and_si128(a, b))
136 OP2(andnot_, _mm_andnot_si128(a, b))
// Byte-granular blend; correct for any element width since masks are
// all-ones / all-zeros per element. _mm_blendv_epi8 is SSE4.1.
137 OP3(blend, _mm_blendv_epi8(a, b, c))
// Macro replacement text used by the per-EntryType specializations below.
// Each pastes the member name together with the current SUFFIX token (pd, ps,
// epi16, epi32, ...) via CAT to select the matching _mm_* intrinsic.
// NOTE(review): the #define heads for the first three fragments are elided
// from this listing (numbering gaps); only their bodies are visible.
145     static inline VectorType op(const VectorType &a) PURE { return CAT(_mm_##op##_, SUFFIX)(a); }
147     static inline VectorType op(const VectorType &a, const VectorType &b) PURE { return CAT(_mm_##op##_ , SUFFIX)(a, b); }
149     static inline VectorType op(const VectorType &a, const VectorType &b) PURE { return CAT(_mm_##op , SUFFIX)(a, b); }
// OPx: like the two-argument form but the intrinsic name (op2) differs from
// the member name (op), e.g. member mul -> _mm_mullo_*.
150 #define OPx(op, op2) \
151     static inline VectorType op(const VectorType &a, const VectorType &b) PURE { return CAT(_mm_##op2##_, SUFFIX)(a, b); }
// OPcmp fragment: declares cmp<op> forwarding to _mm_cmp<op>_<SUFFIX>.
153     static inline VectorType cmp##op(const VectorType &a, const VectorType &b) PURE { return CAT(_mm_cmp##op##_, SUFFIX)(a, b); }
// OP_CAST_: performs the bitwise op in the float domain (cast to ps and
// back), for register types that lack a native intrinsic of that op.
154 #define OP_CAST_(op) \
155     static inline VectorType op(const VectorType &a, const VectorType &b) PURE { return CAT(_mm_castps_, SUFFIX)( \
156             _mm_##op##ps(CAT(CAT(_mm_cast, SUFFIX), _ps)(a), \
157                 CAT(CAT(_mm_cast, SUFFIX), _ps)(b))); \
// MINMAX fragment (head elided): element-wise min and max via suffix paste.
160     static inline VectorType min(VectorType a, VectorType b) PURE { return CAT(_mm_min_, SUFFIX)(a, b); } \
161     static inline VectorType max(VectorType a, VectorType b) PURE { return CAT(_mm_max_, SUFFIX)(a, b); }
// VectorHelper<double>: arithmetic, comparison and horizontal-reduction
// helpers for 2 x double in an __m128d. SUFFIX is presumably defined as pd
// nearby (definition elided from this listing).
163 template<> struct VectorHelper<double> {
164 typedef _M128D VectorType;
165 typedef double EntryType;
168 OP_(or_) OP_(and_) OP_(xor_)
// Zero out elements where the (float-typed) mask is clear.
169 static inline VectorType notMaskedToZero(VectorType a, _M128 mask) PURE { return CAT(_mm_and_, SUFFIX)(_mm_castps_pd(mask), a); }
170 static inline VectorType set(const double a) PURE { return CAT(_mm_set1_, SUFFIX)(a); }
171 static inline VectorType set(const double a, const double b) PURE { return CAT(_mm_set_, SUFFIX)(a, b); }
172 static inline VectorType zero() PURE { return CAT(_mm_setzero_, SUFFIX)(); }
173 static inline VectorType one() PURE { return CAT(_mm_setone_, SUFFIX)(); }// set(1.); }
// Fused-in-source (not hardware-fused) multiply-add: v1 = v1 * v2 + v3.
175 static inline void multiplyAndAdd(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }
// Masked multiply: a*b where mask is set, a where it is clear.
// NOTE(review): the combining line (presumably an _mm_or_pd / return) is
// elided between lines 177 and 179 of the original.
176 static inline VectorType mul(VectorType a, VectorType b, _M128 _mask) PURE {
177 _M128D mask = _mm_castps_pd(_mask);
179 _mm_and_pd(mask, _mm_mul_pd(a, b)),
180 _mm_andnot_pd(mask, a)
184 OP(add) OP(sub) OP(mul)
// No hardware rsqrt/rcp for double: emulate via division.
190 static inline VectorType rsqrt(VectorType x) PURE {
191 return _mm_div_pd(one(), sqrt(x));
193 static inline VectorType reciprocal(VectorType x) PURE {
194 return _mm_div_pd(one(), x);
// NaN is the only value unordered with itself.
196 static inline VectorType isNaN(VectorType x) PURE {
197 return _mm_cmpunord_pd(x, x);
// 0*x is NaN for x = inf or NaN, so the ordered-compare is false exactly
// for non-finite inputs.
199 static inline VectorType isFinite(VectorType x) PURE {
200 return _mm_cmpord_pd(x, _mm_mul_pd(zero(), x));
// Clear the sign bit of both elements.
202 static inline VectorType abs(const VectorType a) PURE {
203 return CAT(_mm_and_, SUFFIX)(a, _mm_setabsmask_pd())
// Horizontal reductions: fold the high element onto the low one, then
// extract the scalar from lane 0.
207 static inline EntryType min(VectorType a) PURE {
208 a = _mm_min_sd(a, _mm_unpackhi_pd(a, a));
209 return _mm_cvtsd_f64(a);
211 static inline EntryType max(VectorType a) PURE {
212 a = _mm_max_sd(a, _mm_unpackhi_pd(a, a));
213 return _mm_cvtsd_f64(a);
215 static inline EntryType mul(VectorType a) PURE {
216 a = _mm_mul_sd(a, _mm_shuffle_pd(a, a, _MM_SHUFFLE2(0, 1)));
217 return _mm_cvtsd_f64(a);
219 static inline EntryType add(VectorType a) PURE {
220 a = _mm_add_sd(a, _mm_shuffle_pd(a, a, _MM_SHUFFLE2(0, 1)));
221 return _mm_cvtsd_f64(a);
// Round to nearest: SSE4.1 _mm_round_pd, else round-trip through int32
// (conditional-compilation lines elided from this listing).
224 static inline VectorType round(VectorType a) PURE {
226 return _mm_round_pd(a, _MM_FROUND_NINT);
228 //XXX: slow: _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
229 return _mm_cvtepi32_pd(_mm_cvtpd_epi32(a));
// VectorHelper<float>: arithmetic, comparison and horizontal-reduction
// helpers for 4 x float in an __m128 (SUFFIX presumably ps; its definition
// is elided from this listing).
234 template<> struct VectorHelper<float> {
235 typedef float EntryType;
236 typedef _M128 VectorType;
239 OP_(or_) OP_(and_) OP_(xor_)
// Mask is already in the float domain here -- no cast needed.
240 static inline VectorType notMaskedToZero(VectorType a, _M128 mask) PURE { return CAT(_mm_and_, SUFFIX)(mask, a); }
241 static inline VectorType set(const float a) PURE { return CAT(_mm_set1_, SUFFIX)(a); }
242 static inline VectorType set(const float a, const float b, const float c, const float d) PURE { return CAT(_mm_set_, SUFFIX)(a, b, c, d); }
243 static inline VectorType zero() PURE { return CAT(_mm_setzero_, SUFFIX)(); }
244 static inline VectorType one() PURE { return CAT(_mm_setone_, SUFFIX)(); }// set(1.f); }
// Narrow two double vectors into one float vector (2+2 -> 4 lanes).
245 static inline _M128 concat(_M128D a, _M128D b) PURE { return _mm_movelh_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b)); }
247 static inline void multiplyAndAdd(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }
// Masked multiply: a*b where mask set, a elsewhere.
// NOTE(review): the combining/return line is elided (numbering gap 248->250).
248 static inline VectorType mul(VectorType a, VectorType b, _M128 mask) PURE {
250 _mm_and_ps(mask, _mm_mul_ps(a, b)),
251 _mm_andnot_ps(mask, a)
255 OP(add) OP(sub) OP(mul)
// NaN is unordered with everything, including itself.
261 static inline VectorType isNaN(VectorType x) PURE {
262 return _mm_cmpunord_ps(x, x);
// 0*x is NaN iff x is inf or NaN, making the ordered-compare false there.
264 static inline VectorType isFinite(VectorType x) PURE {
265 return _mm_cmpord_ps(x, _mm_mul_ps(zero(), x));
// Hardware approximation (about 12 bits of precision for rcpps).
267 static inline VectorType reciprocal(VectorType x) PURE {
268 return _mm_rcp_ps(x);
// Clear the sign bit of all four elements.
270 static inline VectorType abs(const VectorType a) PURE {
271 return CAT(_mm_and_, SUFFIX)(a, _mm_setabsmask_ps());
// Horizontal reductions: log2(4) = 2 folding steps, scalar read from lane 0.
275 static inline EntryType min(VectorType a) PURE {
276 a = _mm_min_ps(a, _mm_movehl_ps(a, a)); // a = min(a0, a2), min(a1, a3), min(a2, a2), min(a3, a3)
277 a = _mm_min_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1))); // a = min(a0, a1), a1, a2, a3
278 return _mm_cvtss_f32(a);
280 static inline EntryType max(VectorType a) PURE {
281 a = _mm_max_ps(a, _mm_movehl_ps(a, a)); // a = max(a0, a2), max(a1, a3), max(a2, a2), max(a3, a3)
282 a = _mm_max_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1))); // a = max(a0, a1), a1, a2, a3
283 return _mm_cvtss_f32(a);
285 static inline EntryType mul(VectorType a) PURE {
286 a = _mm_mul_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 2, 3)));
287 a = _mm_mul_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 0, 1)));
288 return _mm_cvtss_f32(a);
290 static inline EntryType add(VectorType a) PURE {
291 a = _mm_add_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 2, 3)));
292 a = _mm_add_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 0, 1)));
293 return _mm_cvtss_f32(a);
// Round to nearest: SSE4.1 path and SSE2 int round-trip fallback (the
// #if/#else lines are elided from this listing).
296 static inline VectorType round(VectorType a) PURE {
298 return _mm_round_ps(a, _MM_FROUND_NINT);
300 //XXX slow: _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
301 return _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
// VectorHelper<float8>: 8 x float emulated as two __m128 halves (M256).
// Almost every operation simply delegates to VectorHelper<float> per half
// via the REUSE_FLOAT_IMPLn macros below.
306 template<> struct VectorHelper<float8> {
307 typedef float EntryType;
308 typedef M256 VectorType;
310 static inline VectorType set(const float a) PURE {
311 const _M128 x = _mm_set1_ps(a);
312 return VectorType::create(x, x);
// Note: only four distinct values -- the same quad is replicated into
// both halves.
314 static inline VectorType set(const float a, const float b, const float c, const float d) PURE {
315 const _M128 x = _mm_set_ps(a, b, c, d);
316 return VectorType::create(x, x);
318 static inline VectorType set(const float a, const float b, const float c, const float d,
319 const float e, const float f, const float g, const float h) PURE {
320 return VectorType::create(_mm_set_ps(a, b, c, d), _mm_set_ps(e, f, g, h));
322 static inline VectorType zero() PURE { return VectorType::create(_mm_setzero_ps(), _mm_setzero_ps()); }
323 static inline VectorType one() PURE { return set(1.f); }
// REUSE_FLOAT_IMPLn(fun): forward fun with n arguments to the
// VectorHelper<float> implementation, once per __m128 half.
325 #define REUSE_FLOAT_IMPL1(fun) \
326 static inline VectorType fun(const VectorType &x) PURE { \
327 return VectorType::create(VectorHelper<float>::fun(x[0]), VectorHelper<float>::fun(x[1])); \
329 #define REUSE_FLOAT_IMPL2(fun) \
330 static inline VectorType fun(const VectorType &x, const VectorType &y) PURE { \
331 return VectorType::create(VectorHelper<float>::fun(x[0], y[0]), VectorHelper<float>::fun(x[1], y[1])); \
333 #define REUSE_FLOAT_IMPL3(fun) \
334 static inline VectorType fun(const VectorType &x, const VectorType &y, const VectorType &z) PURE { \
335 return VectorType::create(VectorHelper<float>::fun(x[0], y[0], z[0]), VectorHelper<float>::fun(x[1], y[1], z[1])); \
337 REUSE_FLOAT_IMPL1(reciprocal)
338 REUSE_FLOAT_IMPL1(sqrt)
339 REUSE_FLOAT_IMPL1(rsqrt)
340 REUSE_FLOAT_IMPL1(isNaN)
341 REUSE_FLOAT_IMPL1(isFinite)
342 REUSE_FLOAT_IMPL1(abs)
343 REUSE_FLOAT_IMPL1(round)
345 REUSE_FLOAT_IMPL2(and_)
346 REUSE_FLOAT_IMPL2(or_)
347 REUSE_FLOAT_IMPL2(xor_)
348 REUSE_FLOAT_IMPL2(notMaskedToZero)
349 REUSE_FLOAT_IMPL2(add)
350 REUSE_FLOAT_IMPL2(sub)
351 REUSE_FLOAT_IMPL2(mul)
352 REUSE_FLOAT_IMPL2(cmple)
353 REUSE_FLOAT_IMPL2(cmpnle)
354 REUSE_FLOAT_IMPL2(cmplt)
355 REUSE_FLOAT_IMPL2(cmpnlt)
356 REUSE_FLOAT_IMPL2(cmpeq)
357 REUSE_FLOAT_IMPL2(cmpneq)
358 REUSE_FLOAT_IMPL2(min)
359 REUSE_FLOAT_IMPL2(max)
// Horizontal reductions: combine the two halves element-wise, then reduce
// the remaining __m128 with the scalar float reduction.
361 static inline EntryType min(const VectorType &a) PURE {
362 return VectorHelper<float>::min(VectorHelper<float>::min(a[0], a[1]));
364 static inline EntryType max(const VectorType &a) PURE {
365 return VectorHelper<float>::max(VectorHelper<float>::max(a[0], a[1]));
367 static inline EntryType mul(const VectorType &a) PURE {
368 return VectorHelper<float>::mul(VectorHelper<float>::mul(a[0], a[1]));
370 static inline EntryType add(const VectorType &a) PURE {
371 return VectorHelper<float>::add(VectorHelper<float>::add(a[0], a[1]));
// In-place a = a * b + c, per half.
374 static inline void multiplyAndAdd(VectorType &a, const VectorType &b, const VectorType &c) {
375 VectorHelper<float>::multiplyAndAdd(a[0], b[0], c[0]);
376 VectorHelper<float>::multiplyAndAdd(a[1], b[1], c[1]);
// Three-argument (masked) mul, forwarded per half as well.
378 REUSE_FLOAT_IMPL3(mul)
379 #undef REUSE_FLOAT_IMPL3
380 #undef REUSE_FLOAT_IMPL2
381 #undef REUSE_FLOAT_IMPL1
// VectorHelper<int>: 4 x signed 32-bit int in an __m128i (SUFFIX presumably
// epi32; its definition is elided from this listing).
384 template<> struct VectorHelper<int> {
385 typedef int EntryType;
386 typedef _M128I VectorType;
389 OP_(or_) OP_(and_) OP_(xor_)
390 static inline VectorType zero() PURE { return CAT(_mm_setzero_, SUFFIX)(); }
391 static inline VectorType notMaskedToZero(VectorType a, _M128 mask) PURE { return CAT(_mm_and_, SUFFIX)(_mm_castps_si128(mask), a); }
394 static inline VectorType one() PURE { return CAT(_mm_setone_, SUFFIX)(); }
396 static inline VectorType set(const int a) PURE { return CAT(_mm_set1_, SUFFIX)(a); }
397 static inline VectorType set(const int a, const int b, const int c, const int d) PURE { return CAT(_mm_set_, SUFFIX)(a, b, c, d); }
399 static inline void multiplyAndAdd(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }
// Whole-vector shifts by a runtime count; srai = arithmetic (sign-filling)
// right shift, matching signed semantics.
401 static inline VectorType shiftLeft(VectorType a, int shift) {
402 return CAT(_mm_slli_, SUFFIX)(a, shift);
404 static inline VectorType shiftRight(VectorType a, int shift) {
405 return CAT(_mm_srai_, SUFFIX)(a, shift);
// Horizontal reductions; after folding the upper half, only the low two
// 32-bit lanes matter, so the cheaper shufflelo_epi16 swap suffices.
410 static inline EntryType min(VectorType a) PURE {
411 a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
412 // using lo_epi16 for speed here
413 a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
414 return _mm_cvtsi128_si32(a);
416 static inline EntryType max(VectorType a) PURE {
417 a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
418 // using lo_epi16 for speed here
419 a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
420 return _mm_cvtsi128_si32(a);
422 static inline EntryType add(VectorType a) PURE {
423 a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
424 a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
425 return _mm_cvtsi128_si32(a);
// Element-wise 32-bit multiply. Two variants appear here -- the SSE4.1
// _mm_mullo_epi32 one and an SSE2 emulation via two _mm_mul_epu32 on the
// even/odd lanes; the surrounding #if/#else lines are elided.
428 static inline VectorType mul(VectorType a, VectorType b) PURE { return _mm_mullo_epi32(a, b); }
429 static inline EntryType mul(VectorType a) PURE {
430 a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
431 a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
432 return _mm_cvtsi128_si32(a);
435 static inline VectorType mul(const VectorType &a, const VectorType &b) PURE {
436 const VectorType &aShift = _mm_srli_si128(a, 4);
437 const VectorType &ab02 = _mm_mul_epu32(a, b); // [a0 * b0, a2 * b2]
438 const VectorType &bShift = _mm_srli_si128(b, 4);
439 const VectorType &ab13 = _mm_mul_epu32(aShift, bShift); // [a1 * b1, a3 * b3]
440 return _mm_unpacklo_epi32(_mm_shuffle_epi32(ab02, 8), _mm_shuffle_epi32(ab13, 8));
// Masked multiply via byte blend (SSE4.1).
443 static inline VectorType mul(const VectorType a, const VectorType b, _M128 _mask) PURE {
444 return _mm_blendv_epi8(a, mul(a, b), _mm_castps_si128(_mask));
// Derived comparisons: SSE2 only provides eq/lt/gt for integers, so the
// negated forms are built with andnot against an all-ones vector.
451 static inline VectorType cmpneq(const VectorType &a, const VectorType &b) PURE { _M128I x = cmpeq(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
452 static inline VectorType cmpnlt(const VectorType &a, const VectorType &b) PURE { _M128I x = cmplt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
453 static inline VectorType cmple (const VectorType &a, const VectorType &b) PURE { _M128I x = cmpgt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
454 static inline VectorType cmpnle(const VectorType &a, const VectorType &b) PURE { return cmpgt(a, b); }
// Integers are already integral: rounding is the identity.
456 static inline VectorType round(VectorType a) PURE { return a; }
// VectorHelper<unsigned int>: 4 x unsigned 32-bit in an __m128i. Bitwise ops
// go through the float domain (OP_CAST_); multiplication reuses the signed
// helper since the low 32 bits of the product are identical for signed and
// unsigned operands.
459 template<> struct VectorHelper<unsigned int> {
460 typedef unsigned int EntryType;
461 typedef _M128I VectorType;
463 OP_CAST_(or_) OP_CAST_(and_) OP_CAST_(xor_)
464 static inline VectorType zero() PURE { return CAT(_mm_setzero_, SUFFIX)(); }
465 static inline VectorType notMaskedToZero(VectorType a, _M128 mask) PURE { return CAT(_mm_and_, SUFFIX)(_mm_castps_si128(mask), a); }
469 static inline VectorType one() PURE { return CAT(_mm_setone_, SUFFIX)(); }
// Horizontal reductions -- same two-step fold as the signed variant.
472 static inline EntryType min(VectorType a) PURE {
473 a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
474 // using lo_epi16 for speed here
475 a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
476 return _mm_cvtsi128_si32(a);
478 static inline EntryType max(VectorType a) PURE {
479 a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
480 // using lo_epi16 for speed here
481 a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
482 return _mm_cvtsi128_si32(a);
484 static inline EntryType mul(VectorType a) PURE {
485 a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
486 // using lo_epi16 for speed here
487 a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
488 return _mm_cvtsi128_si32(a);
490 static inline EntryType add(VectorType a) PURE {
491 a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
492 // using lo_epi16 for speed here
493 a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
494 return _mm_cvtsi128_si32(a);
// Masked multiply (SSE4.1 byte blend), and delegation to the signed mul.
497 static inline VectorType mul(const VectorType a, const VectorType b, _M128 _mask) PURE {
498 return _mm_blendv_epi8(a, mul(a, b), _mm_castps_si128(_mask));
500 static inline VectorType mul(const VectorType &a, const VectorType &b) PURE {
501 return VectorHelper<int>::mul(a, b);
// Disabled compile-time multiply-by-constant via shifts (kept as reference).
503 //X template<unsigned int b> static inline VectorType mul(const VectorType a) PURE {
505 //X case 0: return zero();
506 //X case 1: return a;
507 //X case 2: return _mm_slli_epi32(a, 1);
508 //X case 4: return _mm_slli_epi32(a, 2);
509 //X case 8: return _mm_slli_epi32(a, 3);
510 //X case 16: return _mm_slli_epi32(a, 4);
511 //X case 32: return _mm_slli_epi32(a, 5);
512 //X case 64: return _mm_slli_epi32(a, 6);
513 //X case 128: return _mm_slli_epi32(a, 7);
514 //X case 256: return _mm_slli_epi32(a, 8);
515 //X case 512: return _mm_slli_epi32(a, 9);
516 //X case 1024: return _mm_slli_epi32(a, 10);
517 //X case 2048: return _mm_slli_epi32(a, 11);
519 //X return mul(a, set(b));
// srli = logical (zero-filling) right shift, matching unsigned semantics.
524 static inline VectorType shiftLeft(VectorType a, int shift) {
525 return CAT(_mm_slli_, SUFFIX)(a, shift);
527 static inline VectorType shiftRight(VectorType a, int shift) {
528 return CAT(_mm_srli_, SUFFIX)(a, shift);
530 static inline VectorType set(const unsigned int a) PURE { return CAT(_mm_set1_, SUFFIX)(a); }
531 static inline VectorType set(const unsigned int a, const unsigned int b, const unsigned int c, const unsigned int d) PURE { return CAT(_mm_set_, SUFFIX)(a, b, c, d); }
535 static inline VectorType cmpneq(const VectorType &a, const VectorType &b) PURE { return _mm_andnot_si128(cmpeq(a, b), _mm_setallone_si128()); }
// Correct unsigned compares (_mm_cmplt/cmpgt_epu32 are Vc helpers); the
// USE_INCORRECT_UNSIGNED_COMPARE alternative reuses signed compares and is
// elided from this listing.
537 #ifndef USE_INCORRECT_UNSIGNED_COMPARE
538 static inline VectorType cmplt(const VectorType &a, const VectorType &b) PURE {
539 return _mm_cmplt_epu32(a, b);
541 static inline VectorType cmpgt(const VectorType &a, const VectorType &b) PURE {
542 return _mm_cmpgt_epu32(a, b);
548 static inline VectorType cmpnlt(const VectorType &a, const VectorType &b) PURE { return _mm_andnot_si128(cmplt(a, b), _mm_setallone_si128()); }
549 static inline VectorType cmple (const VectorType &a, const VectorType &b) PURE { return _mm_andnot_si128(cmpgt(a, b), _mm_setallone_si128()); }
550 static inline VectorType cmpnle(const VectorType &a, const VectorType &b) PURE { return cmpgt(a, b); }
// Integral type: rounding is the identity.
553 static inline VectorType round(VectorType a) PURE { return a; }
// VectorHelper<signed short>: 8 x signed 16-bit in an __m128i (SUFFIX
// presumably epi16; definition elided from this listing).
556 template<> struct VectorHelper<signed short> {
557 typedef _M128I VectorType;
558 typedef signed short EntryType;
561 OP_(or_) OP_(and_) OP_(xor_)
562 static inline VectorType zero() PURE { return CAT(_mm_setzero_, SUFFIX)(); }
563 static inline VectorType notMaskedToZero(VectorType a, _M128 mask) PURE { return CAT(_mm_and_, SUFFIX)(_mm_castps_si128(mask), a); }
// concat: narrow two int32 vectors to one int16 vector with signed
// saturation; expand0/expand1 widen the low/high half back to int32 via
// duplicate-then-arithmetic-shift (preserves the sign).
564 static inline _M128I concat(_M128I a, _M128I b) PURE { return _mm_packs_epi32(a, b); }
565 static inline _M128I expand0(_M128I x) PURE { return _mm_srai_epi32(_mm_unpacklo_epi16(x, x), 16); }
566 static inline _M128I expand1(_M128I x) PURE { return _mm_srai_epi32(_mm_unpackhi_epi16(x, x), 16); }
570 static inline VectorType one() PURE { return CAT(_mm_setone_, SUFFIX)(); }
// Arithmetic right shift for the signed element type.
572 static inline VectorType shiftLeft(VectorType a, int shift) {
573 return CAT(_mm_slli_, SUFFIX)(a, shift);
575 static inline VectorType shiftRight(VectorType a, int shift) {
576 return CAT(_mm_srai_, SUFFIX)(a, shift);
578 static inline VectorType set(const EntryType a) PURE { return CAT(_mm_set1_, SUFFIX)(a); }
579 static inline VectorType set(const EntryType a, const EntryType b, const EntryType c, const EntryType d,
580 const EntryType e, const EntryType f, const EntryType g, const EntryType h) PURE {
581 return CAT(_mm_set_, SUFFIX)(a, b, c, d, e, f, g, h);
584 static inline void multiplyAndAdd(VectorType &v1, VectorType v2, VectorType v3) {
585 v1 = add(mul(v1, v2), v3); }
// Masked multiply: a*b where mask set, a elsewhere.
// NOTE(review): the combining/return line is elided (numbering gap 590->592).
589 static inline VectorType mul(VectorType a, VectorType b, _M128 _mask) PURE {
590 _M128I mask = _mm_castps_si128(_mask);
592 _mm_and_si128(mask, mul(a, b)),
593 _mm_andnot_si128(mask, a)
// Horizontal reductions over 8 lanes: three folding steps
// (high half, lane pair, adjacent lanes), then read the low 16 bits.
598 static inline EntryType min(VectorType a) PURE {
599 // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
600 a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
601 a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
602 a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
603 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
605 static inline EntryType max(VectorType a) PURE {
606 // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
607 a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
608 a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
609 a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
610 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
612 static inline EntryType mul(VectorType a) PURE {
613 a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
614 a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
615 a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
616 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
618 static inline EntryType add(VectorType a) PURE {
619 a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
620 a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
621 a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
622 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
// Derived comparisons built from eq/lt/gt via andnot with all-ones.
629 static inline VectorType cmpneq(const VectorType &a, const VectorType &b) PURE { _M128I x = cmpeq(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
630 static inline VectorType cmpnlt(const VectorType &a, const VectorType &b) PURE { _M128I x = cmplt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
631 static inline VectorType cmple (const VectorType &a, const VectorType &b) PURE { _M128I x = cmpgt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
632 static inline VectorType cmpnle(const VectorType &a, const VectorType &b) PURE { return cmpgt(a, b); }
// Integral type: rounding is the identity.
634 static inline VectorType round(VectorType a) PURE { return a; }
// VectorHelper<unsigned short>: 8 x unsigned 16-bit in an __m128i. Bitwise
// ops go through the float domain (OP_CAST_); packing/compares have SSE4.1
// and SSE2-fallback variants (the #if lines are elided from this listing).
637 template<> struct VectorHelper<unsigned short> {
638 typedef _M128I VectorType;
639 typedef unsigned short EntryType;
641 OP_CAST_(or_) OP_CAST_(and_) OP_CAST_(xor_)
642 static inline VectorType zero() PURE { return CAT(_mm_setzero_, SUFFIX)(); }
643 static inline VectorType notMaskedToZero(VectorType a, _M128 mask) PURE { return CAT(_mm_and_, SUFFIX)(_mm_castps_si128(mask), a); }
// SSE4.1: unsigned-saturating pack of two int32 vectors to uint16.
645 static inline _M128I concat(_M128I a, _M128I b) PURE { return _mm_packus_epi32(a, b); }
647 // XXX too bad, but this is broken without SSE 4.1
648 static inline _M128I concat(_M128I a, _M128I b) PURE { return _mm_packs_epi32(a, b); }
// expand0/expand1: widen the low/high half to uint32 via duplicate then
// logical shift (zero-extension for the unsigned element type).
650 static inline _M128I expand0(_M128I x) PURE { return _mm_srli_epi32(_mm_unpacklo_epi16(x, x), 16); }
651 static inline _M128I expand1(_M128I x) PURE { return _mm_srli_epi32(_mm_unpackhi_epi16(x, x), 16); }
655 static inline VectorType one() PURE { return CAT(_mm_setone_, SUFFIX)(); }
// Masked multiply: a*b where mask set, a elsewhere.
// NOTE(review): the combining/return line is elided (numbering gap 658->660).
657 static inline VectorType mul(VectorType a, VectorType b, _M128 _mask) PURE {
658 _M128I mask = _mm_castps_si128(_mask);
660 _mm_and_si128(mask, mul(a, b)),
661 _mm_andnot_si128(mask, a)
// Disabled compile-time multiply-by-constant via shifts (kept as reference).
664 //X template<unsigned int b> static inline VectorType mul(const VectorType a) PURE {
666 //X case 0: return zero();
667 //X case 1: return a;
668 //X case 2: return _mm_slli_epi16(a, 1);
669 //X case 4: return _mm_slli_epi16(a, 2);
670 //X case 8: return _mm_slli_epi16(a, 3);
671 //X case 16: return _mm_slli_epi16(a, 4);
672 //X case 32: return _mm_slli_epi16(a, 5);
673 //X case 64: return _mm_slli_epi16(a, 6);
674 //X case 128: return _mm_slli_epi16(a, 7);
675 //X case 256: return _mm_slli_epi16(a, 8);
676 //X case 512: return _mm_slli_epi16(a, 9);
677 //X case 1024: return _mm_slli_epi16(a, 10);
678 //X case 2048: return _mm_slli_epi16(a, 11);
680 //X return mul(a, set(b));
682 #if !defined(USE_INCORRECT_UNSIGNED_COMPARE) || VC_IMPL_SSE4_1
// Logical (zero-filling) right shift, matching unsigned semantics.
687 static inline VectorType shiftLeft(VectorType a, int shift) {
688 return CAT(_mm_slli_, SUFFIX)(a, shift);
690 static inline VectorType shiftRight(VectorType a, int shift) {
691 return CAT(_mm_srli_, SUFFIX)(a, shift);
693 OPx(mul, mullo) // should work correctly for all values
694 #if defined(USE_INCORRECT_UNSIGNED_COMPARE) && !defined(VC_IMPL_SSE4_1)
695 OP(min) OP(max) // XXX breaks for values with MSB set
// Horizontal reductions: same three-step fold as the signed-short variant.
697 static inline EntryType min(VectorType a) PURE {
698 // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
699 a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
700 a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
701 a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
702 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
704 static inline EntryType max(VectorType a) PURE {
705 // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
706 a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
707 a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
708 a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
709 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
711 static inline EntryType mul(VectorType a) PURE {
712 // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
713 a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
714 a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
715 a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
716 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
718 static inline EntryType add(VectorType a) PURE {
719 // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
720 a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
721 a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
722 a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
723 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
725 static inline VectorType set(const EntryType a) PURE { return CAT(_mm_set1_, SUFFIX)(a); }
726 static inline VectorType set(const EntryType a, const EntryType b, const EntryType c,
727 const EntryType d, const EntryType e, const EntryType f,
728 const EntryType g, const EntryType h) PURE {
729 return CAT(_mm_set_, SUFFIX)(a, b, c, d, e, f, g, h);
734 static inline VectorType cmpneq(const VectorType &a, const VectorType &b) PURE { return _mm_andnot_si128(cmpeq(a, b), _mm_setallone_si128()); }
// Correct unsigned 16-bit compares (_mm_cmplt/cmpgt_epu16 are Vc helpers);
// the signed-compare alternative branch is elided from this listing.
736 #ifndef USE_INCORRECT_UNSIGNED_COMPARE
737 static inline VectorType cmplt(const VectorType &a, const VectorType &b) PURE {
738 return _mm_cmplt_epu16(a, b);
740 static inline VectorType cmpgt(const VectorType &a, const VectorType &b) PURE {
741 return _mm_cmpgt_epu16(a, b);
747 static inline VectorType cmpnlt(const VectorType &a, const VectorType &b) PURE { return _mm_andnot_si128(cmplt(a, b), _mm_setallone_si128()); }
748 static inline VectorType cmple (const VectorType &a, const VectorType &b) PURE { return _mm_andnot_si128(cmpgt(a, b), _mm_setallone_si128()); }
749 static inline VectorType cmpnle(const VectorType &a, const VectorType &b) PURE { return cmpgt(a, b); }
// Integral type: rounding is the identity.
751 static inline VectorType round(VectorType a) PURE { return a; }
762 #include "vectorhelper.tcc"
764 #endif // SSE_VECTORHELPER_H