1 /* This file is part of the Vc library.
3 Copyright (C) 2009-2012 Matthias Kretz <kretz@kde.org>
5 Vc is free software: you can redistribute it and/or modify
6 it under the terms of the GNU Lesser General Public License as
7 published by the Free Software Foundation, either version 3 of
8 the License, or (at your option) any later version.
10 Vc is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with Vc. If not, see <http://www.gnu.org/licenses/>.
20 #ifndef AVX_VECTORHELPER_H
21 #define AVX_VECTORHELPER_H
25 #include "intrinsics.h"
// Code-generation macros used by the VectorHelper specializations below:
// OPn defines a static member function `name` taking n VectorType arguments
// (named a, b, c) whose body returns the expression `code`.
32 #define OP0(name, code) static inline VectorType name() { return code; }
33 #define OP1(name, code) static inline VectorType name(const VectorType &a) { return code; }
34 #define OP2(name, code) static inline VectorType name(const VectorType &a, const VectorType &b) { return code; }
35 #define OP3(name, code) static inline VectorType name(const VectorType &a, const VectorType &b, const VectorType &c) { return code; }
// VectorHelper specialization for _M256: 8 single-precision floats.
37 template<> struct VectorHelper<_M256>
39 typedef _M256 VectorType;
// Load/store are only declared here; the implementations live in
// vectorhelper.tcc. The tag parameter selects the aligned, unaligned and
// streaming (non-temporal) variants. The overloads taking an extra
// VectorType m are masked stores (m presumably is the per-lane write
// mask -- confirm in vectorhelper.tcc).
40 template<typename A> static VectorType load(const float *x, A) PURE;
41 static void store(float *mem, const VectorType x, AlignedFlag);
42 static void store(float *mem, const VectorType x, UnalignedFlag);
43 static void store(float *mem, const VectorType x, StreamingAndAlignedFlag);
44 static void store(float *mem, const VectorType x, StreamingAndUnalignedFlag);
45 static void store(float *mem, const VectorType x, const VectorType m, AlignedFlag);
46 static void store(float *mem, const VectorType x, const VectorType m, UnalignedFlag);
47 static void store(float *mem, const VectorType x, const VectorType m, StreamingAndAlignedFlag);
48 static void store(float *mem, const VectorType x, const VectorType m, StreamingAndUnalignedFlag);
// Per-128-bit-lane permutations. Each name spells the result from the
// most- to the least-significant element, where a..d stand for source
// elements 0..3 of the lane: cdab swaps neighboring elements, badc swaps
// the two element pairs, aaaa..dddd broadcast a single element.
50 static VectorType cdab(VectorType x) { return _mm256_permute_ps(x, _MM_SHUFFLE(2, 3, 0, 1)); }
51 static VectorType badc(VectorType x) { return _mm256_permute_ps(x, _MM_SHUFFLE(1, 0, 3, 2)); }
52 static VectorType aaaa(VectorType x) { return _mm256_permute_ps(x, _MM_SHUFFLE(0, 0, 0, 0)); }
53 static VectorType bbbb(VectorType x) { return _mm256_permute_ps(x, _MM_SHUFFLE(1, 1, 1, 1)); }
54 static VectorType cccc(VectorType x) { return _mm256_permute_ps(x, _MM_SHUFFLE(2, 2, 2, 2)); }
55 static VectorType dddd(VectorType x) { return _mm256_permute_ps(x, _MM_SHUFFLE(3, 3, 3, 3)); }
56 static VectorType dacb(VectorType x) { return _mm256_permute_ps(x, _MM_SHUFFLE(3, 0, 2, 1)); }
// Bitwise constants/operators and lane blend, generated by the OPn macros.
58 OP0(allone, _mm256_setallone_ps())
59 OP0(zero, _mm256_setzero_ps())
60 OP2(or_, _mm256_or_ps(a, b))
61 OP2(xor_, _mm256_xor_ps(a, b))
62 OP2(and_, _mm256_and_ps(a, b))
63 OP2(andnot_, _mm256_andnot_ps(a, b))
64 OP3(blend, _mm256_blendv_ps(a, b, c))
// VectorHelper specialization for _M256D: 4 double-precision floats.
67 template<> struct VectorHelper<_M256D>
69 typedef _M256D VectorType;
// Load/store declared here, implemented in vectorhelper.tcc; the tag
// overloads follow the same scheme as in the float specialization.
70 template<typename A> static VectorType load(const double *x, A) PURE;
71 static void store(double *mem, const VectorType x, AlignedFlag);
72 static void store(double *mem, const VectorType x, UnalignedFlag);
73 static void store(double *mem, const VectorType x, StreamingAndAlignedFlag);
74 static void store(double *mem, const VectorType x, StreamingAndUnalignedFlag);
75 static void store(double *mem, const VectorType x, const VectorType m, AlignedFlag);
76 static void store(double *mem, const VectorType x, const VectorType m, UnalignedFlag);
77 static void store(double *mem, const VectorType x, const VectorType m, StreamingAndAlignedFlag);
78 static void store(double *mem, const VectorType x, const VectorType m, StreamingAndUnalignedFlag);
// cdab: immediate 0b0101 swaps the two doubles inside each 128-bit lane.
80 static VectorType cdab(VectorType x) { return _mm256_permute_pd(x, 5); }
// badc: swaps the two 128-bit halves of the register.
81 static VectorType badc(VectorType x) { return _mm256_permute2f128_pd(x, x, 1); }
82 // aaaa bbbb cccc dddd specialized in vector.tcc
// dacb: assembled from SSE ops on the two halves. The original author
// already flagged the lo128/hi128 usage as suspicious (XXX below) --
// verify against the float dacb before relying on the element order.
83 static VectorType dacb(VectorType x) {
84 const __m128d cb = avx_cast<__m128d>(_mm_alignr_epi8(avx_cast<__m128i>(lo128(x)),
85 avx_cast<__m128i>(hi128(x)), sizeof(double))); // XXX: lo and hi swapped?
86 const __m128d da = _mm_blend_pd(lo128(x), hi128(x), 0 + 2); // XXX: lo and hi swapped?
87 return concat(cb, da);
// Bitwise constants/operators and lane blend, generated by the OPn macros.
90 OP0(allone, _mm256_setallone_pd())
91 OP0(zero, _mm256_setzero_pd())
92 OP2(or_, _mm256_or_pd(a, b))
93 OP2(xor_, _mm256_xor_pd(a, b))
94 OP2(and_, _mm256_and_pd(a, b))
95 OP2(andnot_, _mm256_andnot_pd(a, b))
96 OP3(blend, _mm256_blendv_pd(a, b, c))
// VectorHelper specialization for _M256I: 256-bit integer vector.
99 template<> struct VectorHelper<_M256I>
101 typedef _M256I VectorType;
// Load/store declared for arbitrary element types T; implemented in
// vectorhelper.tcc. Same tag scheme (aligned/unaligned/streaming, and
// masked variants taking m) as the float/double specializations.
102 template<typename T> static VectorType load(const T *x, AlignedFlag) PURE;
103 template<typename T> static VectorType load(const T *x, UnalignedFlag) PURE;
104 template<typename T> static VectorType load(const T *x, StreamingAndAlignedFlag) PURE;
105 template<typename T> static VectorType load(const T *x, StreamingAndUnalignedFlag) PURE;
106 template<typename T> static void store(T *mem, const VectorType x, AlignedFlag);
107 template<typename T> static void store(T *mem, const VectorType x, UnalignedFlag);
108 template<typename T> static void store(T *mem, const VectorType x, StreamingAndAlignedFlag);
109 template<typename T> static void store(T *mem, const VectorType x, StreamingAndUnalignedFlag);
110 template<typename T> static void store(T *mem, const VectorType x, const VectorType m, AlignedFlag);
111 template<typename T> static void store(T *mem, const VectorType x, const VectorType m, UnalignedFlag);
112 template<typename T> static void store(T *mem, const VectorType x, const VectorType m, StreamingAndAlignedFlag);
113 template<typename T> static void store(T *mem, const VectorType x, const VectorType m, StreamingAndUnalignedFlag);
// 32-bit element permutations performed in the float domain; avx_cast is
// a pure bitwise reinterpretation, so no values change. Naming as in the
// float specialization (a..d = elements 0..3 of each 128-bit lane).
115 static VectorType cdab(VectorType x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<__m256>(x), _MM_SHUFFLE(2, 3, 0, 1))); }
116 static VectorType badc(VectorType x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<__m256>(x), _MM_SHUFFLE(1, 0, 3, 2))); }
117 static VectorType aaaa(VectorType x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<__m256>(x), _MM_SHUFFLE(0, 0, 0, 0))); }
118 static VectorType bbbb(VectorType x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<__m256>(x), _MM_SHUFFLE(1, 1, 1, 1))); }
119 static VectorType cccc(VectorType x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<__m256>(x), _MM_SHUFFLE(2, 2, 2, 2))); }
120 static VectorType dddd(VectorType x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<__m256>(x), _MM_SHUFFLE(3, 3, 3, 3))); }
121 static VectorType dacb(VectorType x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<__m256>(x), _MM_SHUFFLE(3, 0, 2, 1))); }
// Bitwise constants/operators and a byte-granular blend.
123 OP0(allone, _mm256_setallone_si256())
124 OP0(zero, _mm256_setzero_si256())
125 OP2(or_, _mm256_or_si256(a, b))
126 OP2(xor_, _mm256_xor_si256(a, b))
127 OP2(and_, _mm256_and_si256(a, b))
128 OP2(andnot_, _mm256_andnot_si256(a, b))
129 OP3(blend, _mm256_blendv_epi8(a, b, c))
// VectorHelper specialization for __m128i: 128-bit integer vector, used
// for the 16-bit element types (8 elements).
132 template<> struct VectorHelper<__m128i>
134 typedef __m128i VectorType;
// Load/store declared for arbitrary element types T; implemented in
// vectorhelper.tcc. Same tag scheme as the other specializations.
135 template<typename T> static VectorType load(const T *x, AlignedFlag) PURE;
136 template<typename T> static VectorType load(const T *x, UnalignedFlag) PURE;
137 template<typename T> static VectorType load(const T *x, StreamingAndAlignedFlag) PURE;
138 template<typename T> static VectorType load(const T *x, StreamingAndUnalignedFlag) PURE;
139 template<typename T> static void store(T *mem, const VectorType x, AlignedFlag);
140 template<typename T> static void store(T *mem, const VectorType x, UnalignedFlag);
141 template<typename T> static void store(T *mem, const VectorType x, StreamingAndAlignedFlag);
142 template<typename T> static void store(T *mem, const VectorType x, StreamingAndUnalignedFlag);
143 template<typename T> static void store(T *mem, const VectorType x, const VectorType m, AlignedFlag);
144 template<typename T> static void store(T *mem, const VectorType x, const VectorType m, UnalignedFlag);
145 template<typename T> static void store(T *mem, const VectorType x, const VectorType m, StreamingAndAlignedFlag);
146 template<typename T> static void store(T *mem, const VectorType x, const VectorType m, StreamingAndUnalignedFlag);
// 16-bit element permutations: the same 4-element pattern is applied to
// the low and the high 64-bit half (shufflelo + shufflehi with the same
// immediate). a..d name elements 0..3 within each half.
148 static VectorType cdab(VectorType x) { return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1)); }
149 static VectorType badc(VectorType x) { return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, _MM_SHUFFLE(1, 0, 3, 2)), _MM_SHUFFLE(1, 0, 3, 2)); }
150 static VectorType aaaa(VectorType x) { return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0)); }
151 static VectorType bbbb(VectorType x) { return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, _MM_SHUFFLE(1, 1, 1, 1)), _MM_SHUFFLE(1, 1, 1, 1)); }
152 static VectorType cccc(VectorType x) { return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(2, 2, 2, 2)); }
153 static VectorType dddd(VectorType x) { return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)); }
154 static VectorType dacb(VectorType x) { return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, _MM_SHUFFLE(3, 0, 2, 1)), _MM_SHUFFLE(3, 0, 2, 1)); }
// Bitwise constants/operators and a byte-granular blend.
156 OP0(allone, _mm_setallone_si128())
157 OP0(zero, _mm_setzero_si128())
158 OP2(or_, _mm_or_si128(a, b))
159 OP2(xor_, _mm_xor_si128(a, b))
160 OP2(and_, _mm_and_si128(a, b))
161 OP2(andnot_, _mm_andnot_si128(a, b))
162 OP3(blend, _mm_blendv_epi8(a, b, c))
// Macro bodies expanded inside the per-EntryType helpers below. SUFFIX is
// (re)defined by each specialization (ps/pd/epi32/...), and CAT pastes
// tokens to form the concrete intrinsic name, e.g. _mm256_add_pd.
169 static inline VectorType INTRINSIC CONST op(const VectorType &a) { return CAT(_mm256_##op##_, SUFFIX)(a); }
171 static inline VectorType INTRINSIC CONST op(const VectorType &a, const VectorType &b) { return CAT(_mm256_##op##_ , SUFFIX)(a, b); }
173 static inline VectorType INTRINSIC CONST op(const VectorType &a, const VectorType &b) { return CAT(_mm256_##op , SUFFIX)(a, b); }
// OPx: member named `op` implemented by the intrinsic named after `op2`.
174 #define OPx(op, op2) \
175 static inline VectorType INTRINSIC CONST op(const VectorType &a, const VectorType &b) { return CAT(_mm256_##op2##_, SUFFIX)(a, b); }
// OPcmp body: generates cmp<op> wrappers around _mm256_cmp<op>_<SUFFIX>.
177 static inline VectorType INTRINSIC CONST cmp##op(const VectorType &a, const VectorType &b) { return CAT(_mm256_cmp##op##_, SUFFIX)(a, b); }
// OP_CAST_: performs the (bitwise) operation in the float domain by
// casting both operands to __m256 and the result back; the MINMAX lines
// that follow generate min/max members for the current SUFFIX.
178 #define OP_CAST_(op) \
179 static inline VectorType INTRINSIC CONST op(const VectorType &a, const VectorType &b) { return CAT(_mm256_castps_, SUFFIX)( \
180 _mm256_##op##ps(CAT(CAT(_mm256_cast, SUFFIX), _ps)(a), \
181 CAT(CAT(_mm256_cast, SUFFIX), _ps)(b))); \
184 static inline VectorType INTRINSIC CONST min(VectorType a, VectorType b) { return CAT(_mm256_min_, SUFFIX)(a, b); } \
185 static inline VectorType INTRINSIC CONST max(VectorType a, VectorType b) { return CAT(_mm256_max_, SUFFIX)(a, b); }
// Arithmetic helper for double; here SUFFIX is pd, so the CAT()-built
// names resolve to the _mm256_*_pd intrinsics.
187 template<> struct VectorHelper<double> {
188 typedef _M256D VectorType;
189 typedef double EntryType;
// Type able to hold the concatenation of two EntryTypes; double maps to
// itself (no wider floating-point type available).
190 typedef double ConcatType;
// Zeroes the lanes whose mask bits are clear, keeps the selected lanes.
193 static inline VectorType notMaskedToZero(VectorType a, _M256 mask) { return CAT(_mm256_and_, SUFFIX)(_mm256_castps_pd(mask), a); }
// Broadcast a into all four lanes.
194 static inline VectorType set(const double a) { return CAT(_mm256_set1_, SUFFIX)(a); }
// NOTE(review): _mm256_set_pd takes its arguments most-significant first;
// this wrapper forwards (a, b, c, d) unchanged -- confirm the intended
// element order against the callers in vector.tcc.
195 static inline VectorType set(const double a, const double b, const double c, const double d) {
196 return CAT(_mm256_set_, SUFFIX)(a, b, c, d);
198 static inline VectorType zero() { return CAT(_mm256_setzero_, SUFFIX)(); }
// Vector of all 1.0 values.
199 static inline VectorType one() { return CAT(_mm256_setone_, SUFFIX)(); }// set(1.); }
// v1 = v1 * v2 + v3, as two separate ops (no FMA instruction used).
201 static inline void multiplyAndAdd(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }
// Masked multiply: lanes selected by mask become a*b, others keep a.
202 static inline VectorType mul(VectorType a, VectorType b, _M256 _mask) {
203 _M256D mask = _mm256_castps_pd(_mask);
205 _mm256_and_pd(mask, _mm256_mul_pd(a, b)),
206 _mm256_andnot_pd(mask, a)
// Masked divide: lanes selected by mask become a/b, others keep a.
209 static inline VectorType div(VectorType a, VectorType b, _M256 _mask) {
210 _M256D mask = _mm256_castps_pd(_mask);
212 _mm256_and_pd(mask, _mm256_div_pd(a, b)),
213 _mm256_andnot_pd(mask, a)
217 OP(add) OP(sub) OP(mul)
// 1/sqrt(x) via a full-precision divide (no rsqrt approximation exists
// for packed doubles).
223 static inline VectorType rsqrt(VectorType x) {
224 return _mm256_div_pd(one(), sqrt(x));
// 1/x via a full-precision divide.
226 static inline VectorType reciprocal(VectorType x) {
227 return _mm256_div_pd(one(), x);
// NaN is the only value that compares unordered against itself.
229 static inline VectorType isNaN(VectorType x) {
230 return _mm256_cmpunord_pd(x, x);
// 0*x is NaN for x = +/-Inf or NaN, so the ordered compare yields true
// exactly for finite lanes.
232 static inline VectorType isFinite(VectorType x) {
233 return _mm256_cmpord_pd(x, _mm256_mul_pd(zero(), x));
// |a|: clears the sign bit with a constant mask.
235 static inline VectorType abs(const VectorType a) {
236 return CAT(_mm256_and_, SUFFIX)(a, _mm256_setabsmask_pd());
// Horizontal reductions: fold the upper 128-bit half onto the lower one,
// then reduce the remaining two lanes.
240 static inline EntryType min(VectorType a) {
241 __m128d b = _mm_min_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
242 b = _mm_min_sd(b, _mm_unpackhi_pd(b, b));
243 return _mm_cvtsd_f64(b);
245 static inline EntryType max(VectorType a) {
246 __m128d b = _mm_max_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
247 b = _mm_max_sd(b, _mm_unpackhi_pd(b, b));
248 return _mm_cvtsd_f64(b);
250 static inline EntryType mul(VectorType a) {
251 __m128d b = _mm_mul_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
252 b = _mm_mul_sd(b, _mm_shuffle_pd(b, b, _MM_SHUFFLE2(0, 1)));
253 return _mm_cvtsd_f64(b);
255 static inline EntryType add(VectorType a) {
256 __m128d b = _mm_add_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
257 b = _mm_hadd_pd(b, b); // or: b = _mm_add_sd(b, _mm256_shuffle_pd(b, b, _MM_SHUFFLE2(0, 1)));
258 return _mm_cvtsd_f64(b);
// Round each lane to the nearest integral value (_MM_FROUND_NINT).
261 static inline VectorType round(VectorType a) {
262 return _mm256_round_pd(a, _MM_FROUND_NINT);
// Arithmetic helper for float; here SUFFIX is ps, so the CAT()-built
// names resolve to the _mm256_*_ps intrinsics.
266 template<> struct VectorHelper<float> {
267 typedef float EntryType;
268 typedef _M256 VectorType;
// Type able to hold the concatenation of two EntryTypes.
269 typedef double ConcatType;
// Zeroes the lanes whose mask bits are clear, keeps the selected lanes.
272 static inline VectorType notMaskedToZero(VectorType a, _M256 mask) { return CAT(_mm256_and_, SUFFIX)(mask, a); }
// Broadcast a into all eight lanes.
273 static inline VectorType set(const float a) { return CAT(_mm256_set1_, SUFFIX)(a); }
274 static inline VectorType set(const float a, const float b, const float c, const float d,
275 const float e, const float f, const float g, const float h) {
276 return CAT(_mm256_set_, SUFFIX)(a, b, c, d, e, f, g, h); }
277 static inline VectorType zero() { return CAT(_mm256_setzero_, SUFFIX)(); }
// Vector of all 1.0f values.
278 static inline VectorType one() { return CAT(_mm256_setone_, SUFFIX)(); }// set(1.f); }
// Narrows two 4 x double vectors into one 8 x float vector (a -> low
// half, b -> high half).
279 static inline _M256 concat(_M256D a, _M256D b) { return _mm256_insertf128_ps(avx_cast<_M256>(_mm256_cvtpd_ps(a)), _mm256_cvtpd_ps(b), 1); }
// v1 = v1 * v2 + v3, as two separate ops (no FMA instruction used).
281 static inline void multiplyAndAdd(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }
// Masked multiply: lanes selected by mask become a*b, others keep a.
282 static inline VectorType mul(VectorType a, VectorType b, _M256 mask) {
284 _mm256_and_ps(mask, _mm256_mul_ps(a, b)),
285 _mm256_andnot_ps(mask, a)
// Masked divide: lanes selected by mask become a/b, others keep a.
288 static inline VectorType div(VectorType a, VectorType b, _M256 mask) {
290 _mm256_and_ps(mask, _mm256_div_ps(a, b)),
291 _mm256_andnot_ps(mask, a)
295 OP(add) OP(sub) OP(mul)
// NaN is the only value that compares unordered against itself.
301 static inline VectorType isNaN(VectorType x) {
302 return _mm256_cmpunord_ps(x, x);
// 0*x is NaN for x = +/-Inf or NaN, so the ordered compare yields true
// exactly for finite lanes.
304 static inline VectorType isFinite(VectorType x) {
305 return _mm256_cmpord_ps(x, _mm256_mul_ps(zero(), x));
// 1/x via the hardware approximation (reduced precision vs. a divide).
307 static inline VectorType reciprocal(VectorType x) {
308 return _mm256_rcp_ps(x);
// |a|: clears the sign bit with a constant mask.
310 static inline VectorType abs(const VectorType a) {
311 return CAT(_mm256_and_, SUFFIX)(a, _mm256_setabsmask_ps());
// Horizontal reductions: fold the upper 128-bit half onto the lower one,
// then reduce the remaining four lanes in two shuffle steps.
315 static inline EntryType min(VectorType a) {
316 __m128 b = _mm_min_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1));
317 b = _mm_min_ps(b, _mm_movehl_ps(b, b)); // b = min(a0, a2), min(a1, a3), min(a2, a2), min(a3, a3)
318 b = _mm_min_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1))); // b = min(a0, a1), a1, a2, a3
319 return _mm_cvtss_f32(b);
321 static inline EntryType max(VectorType a) {
322 __m128 b = _mm_max_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1));
323 b = _mm_max_ps(b, _mm_movehl_ps(b, b)); // b = max(a0, a2), max(a1, a3), max(a2, a2), max(a3, a3)
324 b = _mm_max_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1))); // b = max(a0, a1), a1, a2, a3
325 return _mm_cvtss_f32(b);
327 static inline EntryType mul(VectorType a) {
328 __m128 b = _mm_mul_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1));
329 b = _mm_mul_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3)));
330 b = _mm_mul_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1)));
331 return _mm_cvtss_f32(b);
333 static inline EntryType add(VectorType a) {
334 __m128 b = _mm_add_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1));
335 b = _mm_add_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3)));
336 b = _mm_add_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1)));
337 return _mm_cvtss_f32(b);
// Round each lane to the nearest integral value (_MM_FROUND_NINT).
340 static inline VectorType round(VectorType a) {
341 return _mm256_round_ps(a, _MM_FROUND_NINT);
// sfloat shares the float implementation in the AVX backend.
345 template<> struct VectorHelper<sfloat> : public VectorHelper<float> {};
// Arithmetic helper for int (8 x 32-bit signed); the CAT()-built names
// resolve to the _mm256_*_<SUFFIX> integer intrinsics.
347 template<> struct VectorHelper<int> {
348 typedef int EntryType;
349 typedef _M256I VectorType;
// Type able to hold the concatenation of two EntryTypes.
350 typedef long long ConcatType;
353 OP_(or_) OP_(and_) OP_(xor_)
354 static inline VectorType INTRINSIC CONST zero() { return CAT(_mm256_setzero_, SUFFIX)(); }
// Zeroes the lanes whose mask bits are clear, keeps the selected lanes.
355 static inline VectorType INTRINSIC CONST notMaskedToZero(VectorType a, _M256 mask) { return CAT(_mm256_and_, SUFFIX)(_mm256_castps_si256(mask), a); }
// Vector of all-1 values.
358 static inline VectorType INTRINSIC CONST one() { return CAT(_mm256_setone_, SUFFIX)(); }
360 static inline VectorType INTRINSIC CONST set(const int a) { return CAT(_mm256_set1_, SUFFIX)(a); }
361 static inline VectorType INTRINSIC CONST set(const int a, const int b, const int c, const int d,
362 const int e, const int f, const int g, const int h) {
363 return CAT(_mm256_set_, SUFFIX)(a, b, c, d, e, f, g, h); }
// v1 = v1 * v2 + v3, as two separate ops.
365 static inline void INTRINSIC CONST multiplyAndAdd(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }
367 static inline VectorType shiftLeft(VectorType a, int shift) {
368 return CAT(_mm256_slli_, SUFFIX)(a, shift);
// srai: arithmetic (sign-preserving) right shift, matching signed int.
370 static inline VectorType shiftRight(VectorType a, int shift) {
371 return CAT(_mm256_srai_, SUFFIX)(a, shift);
// Horizontal reductions: fold the upper 128-bit half onto the lower one,
// then reduce the remaining four 32-bit lanes.
376 static inline EntryType INTRINSIC CONST min(VectorType a) {
377 __m128i b = _mm_min_epi32(avx_cast<__m128i>(a), _mm256_extractf128_si256(a, 1));
378 b = _mm_min_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
379 b = _mm_min_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here
380 return _mm_cvtsi128_si32(b);
382 static inline EntryType INTRINSIC CONST max(VectorType a) {
383 __m128i b = _mm_max_epi32(avx_cast<__m128i>(a), _mm256_extractf128_si256(a, 1));
384 b = _mm_max_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
385 b = _mm_max_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here
386 return _mm_cvtsi128_si32(b);
388 static inline EntryType INTRINSIC CONST add(VectorType a) {
389 __m128i b = _mm_add_epi32(avx_cast<__m128i>(a), _mm256_extractf128_si256(a, 1));
390 b = _mm_add_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
391 b = _mm_add_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2)));
392 return _mm_cvtsi128_si32(b);
394 static inline EntryType INTRINSIC CONST mul(VectorType a) {
395 __m128i b = _mm_mullo_epi32(avx_cast<__m128i>(a), _mm256_extractf128_si256(a, 1));
396 b = _mm_mullo_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
397 b = _mm_mullo_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2)));
398 return _mm_cvtsi128_si32(b);
// Element-wise multiply keeping the low 32 bits of each product.
401 static inline VectorType INTRINSIC CONST mul(VectorType a, VectorType b) { return _mm256_mullo_epi32(a, b); }
// Derived comparisons, built by negating (andnot with all-ones) the
// primitive eq/lt/gt compares. cmpnle == cmpgt holds because integers
// are totally ordered.
407 static inline VectorType INTRINSIC CONST cmpneq(const VectorType &a, const VectorType &b) { _M256I x = cmpeq(a, b); return _mm256_andnot_si256(x, _mm256_setallone_si256()); }
408 static inline VectorType INTRINSIC CONST cmpnlt(const VectorType &a, const VectorType &b) { _M256I x = cmplt(a, b); return _mm256_andnot_si256(x, _mm256_setallone_si256()); }
409 static inline VectorType INTRINSIC CONST cmple (const VectorType &a, const VectorType &b) { _M256I x = cmpgt(a, b); return _mm256_andnot_si256(x, _mm256_setallone_si256()); }
410 static inline VectorType INTRINSIC CONST cmpnle(const VectorType &a, const VectorType &b) { return cmpgt(a, b); }
// Integers are already integral: rounding is the identity.
412 static inline VectorType INTRINSIC CONST round(VectorType a) { return a; }
// Arithmetic helper for unsigned int (8 x 32-bit unsigned).
415 template<> struct VectorHelper<unsigned int> {
416 typedef unsigned int EntryType;
417 typedef _M256I VectorType;
// Type able to hold the concatenation of two EntryTypes.
418 typedef unsigned long long ConcatType;
// Bitwise ops routed through the float domain via OP_CAST_.
420 OP_CAST_(or_) OP_CAST_(and_) OP_CAST_(xor_)
421 static inline VectorType INTRINSIC CONST zero() { return CAT(_mm256_setzero_, SUFFIX)(); }
// Zeroes the lanes whose mask bits are clear, keeps the selected lanes.
422 static inline VectorType INTRINSIC CONST notMaskedToZero(VectorType a, _M256 mask) { return CAT(_mm256_and_, SUFFIX)(_mm256_castps_si256(mask), a); }
// Vector of all-1 values.
426 static inline VectorType INTRINSIC CONST one() { return CAT(_mm256_setone_, SUFFIX)(); }
// Horizontal reductions (unsigned min/max via the epu32 intrinsics);
// fold the upper 128-bit half first, then the remaining four lanes.
429 static inline EntryType INTRINSIC CONST min(VectorType a) {
430 __m128i b = _mm_min_epu32(avx_cast<__m128i>(a), _mm256_extractf128_si256(a, 1));
431 b = _mm_min_epu32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
432 b = _mm_min_epu32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here
433 return _mm_cvtsi128_si32(b);
435 static inline EntryType INTRINSIC CONST max(VectorType a) {
436 __m128i b = _mm_max_epu32(avx_cast<__m128i>(a), _mm256_extractf128_si256(a, 1));
437 b = _mm_max_epu32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
438 b = _mm_max_epu32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here
439 return _mm_cvtsi128_si32(b);
441 static inline EntryType INTRINSIC CONST add(VectorType a) {
442 __m128i b = _mm_add_epi32(avx_cast<__m128i>(a), _mm256_extractf128_si256(a, 1));
443 b = _mm_add_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
444 b = _mm_add_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2)));
445 return _mm_cvtsi128_si32(b);
447 static inline EntryType INTRINSIC CONST mul(VectorType a) {
448 __m128i b = _mm_mullo_epi32(avx_cast<__m128i>(a), _mm256_extractf128_si256(a, 1));
449 b = _mm_mullo_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
450 b = _mm_mullo_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2)));
451 return _mm_cvtsi128_si32(b);
// Element-wise multiply keeping the low 32 bits of each product.
454 static inline VectorType INTRINSIC CONST mul(VectorType a, VectorType b) { return _mm256_mullo_epi32(a, b); }
458 static inline VectorType shiftLeft(VectorType a, int shift) {
459 return CAT(_mm256_slli_, SUFFIX)(a, shift);
// srli: logical (zero-filling) right shift, matching unsigned int.
461 static inline VectorType shiftRight(VectorType a, int shift) {
462 return CAT(_mm256_srli_, SUFFIX)(a, shift);
464 static inline VectorType INTRINSIC CONST set(const unsigned int a) { return CAT(_mm256_set1_, SUFFIX)(a); }
465 static inline VectorType INTRINSIC CONST set(const unsigned int a, const unsigned int b, const unsigned int c, const unsigned int d,
466 const unsigned int e, const unsigned int f, const unsigned int g, const unsigned int h) {
467 return CAT(_mm256_set_, SUFFIX)(a, b, c, d, e, f, g, h); }
471 static inline VectorType INTRINSIC CONST cmpneq(const VectorType &a, const VectorType &b) { return _mm256_andnot_si256(cmpeq(a, b), _mm256_setallone_si256()); }
// Correct unsigned compares (the Vc _mm256_cmp*_epu32 helpers); the
// signed fallback is selected by defining USE_INCORRECT_UNSIGNED_COMPARE.
473 #ifndef USE_INCORRECT_UNSIGNED_COMPARE
474 static inline VectorType INTRINSIC CONST cmplt(const VectorType &a, const VectorType &b) {
475 return _mm256_cmplt_epu32(a, b);
477 static inline VectorType INTRINSIC CONST cmpgt(const VectorType &a, const VectorType &b) {
478 return _mm256_cmpgt_epu32(a, b);
// Derived comparisons; cmpnle == cmpgt holds for totally ordered ints.
484 static inline VectorType INTRINSIC CONST cmpnlt(const VectorType &a, const VectorType &b) { return _mm256_andnot_si256(cmplt(a, b), _mm256_setallone_si256()); }
485 static inline VectorType INTRINSIC CONST cmple (const VectorType &a, const VectorType &b) { return _mm256_andnot_si256(cmpgt(a, b), _mm256_setallone_si256()); }
486 static inline VectorType INTRINSIC CONST cmpnle(const VectorType &a, const VectorType &b) { return cmpgt(a, b); }
// Integers are already integral: rounding is the identity.
489 static inline VectorType INTRINSIC CONST round(VectorType a) { return a; }
// Arithmetic helper for signed short (8 x 16-bit in a 128-bit register).
492 template<> struct VectorHelper<signed short> {
493 typedef VectorTypeHelper<signed short>::Type VectorType;
494 typedef signed short EntryType;
// Type able to hold the concatenation of two EntryTypes.
495 typedef int ConcatType;
497 static inline VectorType INTRINSIC CONST or_(VectorType a, VectorType b) { return _mm_or_si128(a, b); }
498 static inline VectorType INTRINSIC CONST and_(VectorType a, VectorType b) { return _mm_and_si128(a, b); }
499 static inline VectorType INTRINSIC CONST xor_(VectorType a, VectorType b) { return _mm_xor_si128(a, b); }
500 static inline VectorType INTRINSIC CONST zero() { return _mm_setzero_si128(); }
// Zeroes the lanes whose mask bits are clear, keeps the selected lanes.
501 static inline VectorType INTRINSIC CONST notMaskedToZero(VectorType a, __m128 mask) { return _mm_and_si128(_mm_castps_si128(mask), a); }
// Vector of all-1 values.
504 static inline VectorType INTRINSIC CONST one() { return CAT(_mm_setone_, SUFFIX)(); }
506 static inline VectorType shiftLeft(VectorType a, int shift) {
507 return CAT(_mm_slli_, SUFFIX)(a, shift);
// srai: arithmetic (sign-preserving) right shift, matching signed short.
509 static inline VectorType shiftRight(VectorType a, int shift) {
510 return CAT(_mm_srai_, SUFFIX)(a, shift);
512 static inline VectorType INTRINSIC CONST set(const EntryType a) { return CAT(_mm_set1_, SUFFIX)(a); }
513 static inline VectorType INTRINSIC CONST set(const EntryType a, const EntryType b, const EntryType c, const EntryType d,
514 const EntryType e, const EntryType f, const EntryType g, const EntryType h) {
515 return CAT(_mm_set_, SUFFIX)(a, b, c, d, e, f, g, h);
// v1 = v1 * v2 + v3, as two separate ops.
518 static inline void INTRINSIC CONST multiplyAndAdd(VectorType &v1, VectorType v2, VectorType v3) {
519 v1 = add(mul(v1, v2), v3); }
521 static inline VectorType INTRINSIC CONST abs(VectorType a) { return _mm_abs_epi16(a); }
// Element-wise multiply keeping the low 16 bits of each product.
522 static inline VectorType INTRINSIC CONST mul(VectorType a, VectorType b) { return _mm_mullo_epi16(a, b); }
523 static inline VectorType INTRINSIC CONST min(VectorType a, VectorType b) { return _mm_min_epi16(a, b); }
524 static inline VectorType INTRINSIC CONST max(VectorType a, VectorType b) { return _mm_max_epi16(a, b); }
// Horizontal reductions over the 8 lanes: halve the active width with
// a shuffle + op three times, then extract lane 0.
526 static inline EntryType INTRINSIC CONST min(VectorType a) {
527 // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
528 a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
529 a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
530 a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
531 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
533 static inline EntryType INTRINSIC CONST max(VectorType a) {
534 // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
535 a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
536 a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
537 a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
538 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
540 static inline EntryType INTRINSIC CONST mul(VectorType a) {
541 a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
542 a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
543 a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
544 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
546 static inline EntryType INTRINSIC CONST add(VectorType a) {
547 a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
548 a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
549 a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
550 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
553 static inline VectorType INTRINSIC CONST add(VectorType a, VectorType b) { return _mm_add_epi16(a, b); }
554 static inline VectorType INTRINSIC CONST sub(VectorType a, VectorType b) { return _mm_sub_epi16(a, b); }
555 static inline VectorType INTRINSIC CONST cmpeq(VectorType a, VectorType b) { return _mm_cmpeq_epi16(a, b); }
556 static inline VectorType INTRINSIC CONST cmplt(VectorType a, VectorType b) { return _mm_cmplt_epi16(a, b); }
557 static inline VectorType INTRINSIC CONST cmpgt(VectorType a, VectorType b) { return _mm_cmpgt_epi16(a, b); }
// Derived comparisons, built by negating the primitive compares;
// cmpnle == cmpgt holds because integers are totally ordered.
558 static inline VectorType cmpneq(const VectorType &a, const VectorType &b) { __m128i x = cmpeq(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
559 static inline VectorType cmpnlt(const VectorType &a, const VectorType &b) { __m128i x = cmplt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
560 static inline VectorType cmple (const VectorType &a, const VectorType &b) { __m128i x = cmpgt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
561 static inline VectorType cmpnle(const VectorType &a, const VectorType &b) { return cmpgt(a, b); }
// Integers are already integral: rounding is the identity.
563 static inline VectorType round(VectorType a) { return a; }
// Arithmetic helper for unsigned short (8 x 16-bit in a 128-bit register).
566 template<> struct VectorHelper<unsigned short> {
567 typedef VectorTypeHelper<unsigned short>::Type VectorType;
568 typedef unsigned short EntryType;
// Type able to hold the concatenation of two EntryTypes.
569 typedef unsigned int ConcatType;
571 static inline VectorType INTRINSIC CONST or_(VectorType a, VectorType b) { return _mm_or_si128(a, b); }
572 static inline VectorType INTRINSIC CONST and_(VectorType a, VectorType b) { return _mm_and_si128(a, b); }
573 static inline VectorType INTRINSIC CONST xor_(VectorType a, VectorType b) { return _mm_xor_si128(a, b); }
574 static inline VectorType INTRINSIC CONST zero() { return _mm_setzero_si128(); }
// Zeroes the lanes whose mask bits are clear, keeps the selected lanes.
575 static inline VectorType INTRINSIC CONST notMaskedToZero(VectorType a, __m128 mask) { return _mm_and_si128(_mm_castps_si128(mask), a); }
// Vector of all-1 values.
576 static inline VectorType INTRINSIC CONST one() { return _mm_setone_epu16(); }
// Element-wise multiply keeping the low 16 bits of each product; min/max
// use the unsigned epu16 intrinsics.
578 static inline VectorType INTRINSIC CONST mul(VectorType a, VectorType b) { return _mm_mullo_epi16(a, b); }
579 static inline VectorType INTRINSIC CONST min(VectorType a, VectorType b) { return _mm_min_epu16(a, b); }
580 static inline VectorType INTRINSIC CONST max(VectorType a, VectorType b) { return _mm_max_epu16(a, b); }
583 static inline VectorType shiftLeft(VectorType a, int shift) {
584 return CAT(_mm_slli_, SUFFIX)(a, shift);
// srli: logical (zero-filling) right shift, matching unsigned short.
586 static inline VectorType shiftRight(VectorType a, int shift) {
587 return CAT(_mm_srli_, SUFFIX)(a, shift);
// Horizontal reductions over the 8 lanes: halve the active width with
// a shuffle + op three times, then extract lane 0.
589 static inline EntryType INTRINSIC CONST min(VectorType a) {
590 // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
591 a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
592 a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
593 a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
594 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
596 static inline EntryType INTRINSIC CONST max(VectorType a) {
597 // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
598 a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
599 a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
600 a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
601 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
603 static inline EntryType INTRINSIC CONST mul(VectorType a) {
604 // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
605 a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
606 a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
607 a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
608 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
610 static inline EntryType INTRINSIC CONST add(VectorType a) {
611 // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
612 a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
613 a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
614 a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
615 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
617 static inline VectorType INTRINSIC CONST set(const EntryType a) { return CAT(_mm_set1_, SUFFIX)(a); }
618 static inline VectorType INTRINSIC CONST set(const EntryType a, const EntryType b, const EntryType c,
619 const EntryType d, const EntryType e, const EntryType f,
620 const EntryType g, const EntryType h) {
621 return CAT(_mm_set_, SUFFIX)(a, b, c, d, e, f, g, h);
624 static inline VectorType INTRINSIC CONST add(VectorType a, VectorType b) { return _mm_add_epi16(a, b); }
625 static inline VectorType INTRINSIC CONST sub(VectorType a, VectorType b) { return _mm_sub_epi16(a, b); }
626 static inline VectorType INTRINSIC CONST cmpeq(VectorType a, VectorType b) { return _mm_cmpeq_epi16(a, b); }
627 static inline VectorType cmpneq(const VectorType &a, const VectorType &b) { return _mm_andnot_si128(cmpeq(a, b), _mm_setallone_si128()); }
// Correct unsigned compares (Vc _mm_cmp*_epu16 helpers) by default.
629 #ifndef USE_INCORRECT_UNSIGNED_COMPARE
630 static inline VectorType INTRINSIC CONST cmplt(VectorType a, VectorType b) { return _mm_cmplt_epu16(a, b); }
631 static inline VectorType INTRINSIC CONST cmpgt(VectorType a, VectorType b) { return _mm_cmpgt_epu16(a, b); }
// Signed-compare fallback (wrong for values >= 0x8000), selected when
// USE_INCORRECT_UNSIGNED_COMPARE is defined.
633 static inline VectorType INTRINSIC CONST cmplt(VectorType a, VectorType b) { return _mm_cmplt_epi16(a, b); }
634 static inline VectorType INTRINSIC CONST cmpgt(VectorType a, VectorType b) { return _mm_cmpgt_epi16(a, b); }
// Derived comparisons; cmpnle == cmpgt holds for totally ordered ints.
636 static inline VectorType cmpnlt(const VectorType &a, const VectorType &b) { return _mm_andnot_si128(cmplt(a, b), _mm_setallone_si128()); }
637 static inline VectorType cmple (const VectorType &a, const VectorType &b) { return _mm_andnot_si128(cmpgt(a, b), _mm_setallone_si128()); }
638 static inline VectorType cmpnle(const VectorType &a, const VectorType &b) { return cmpgt(a, b); }
// Integers are already integral: rounding is the identity.
640 static inline VectorType round(VectorType a) { return a; }
// Type-mapping-only specialization for char; no operations implemented.
648 template<> struct VectorHelper<char>
650 typedef VectorTypeHelper<char>::Type VectorType;
651 typedef char EntryType;
652 typedef short ConcatType;
// Type-mapping-only specialization for unsigned char; no operations
// implemented.
655 template<> struct VectorHelper<unsigned char>
657 typedef VectorTypeHelper<unsigned char>::Type VectorType;
658 typedef unsigned char EntryType;
659 typedef unsigned short ConcatType;
665 #include "vectorhelper.tcc"
667 #endif // AVX_VECTORHELPER_H