/* This file is part of the Vc library.

    Copyright (C) 2009-2012 Matthias Kretz <kretz@kde.org>

    Vc is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as
    published by the Free Software Foundation, either version 3 of
    the License, or (at your option) any later version.

    Vc is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with Vc. If not, see <http://www.gnu.org/licenses/>.

*/

#ifndef AVX_VECTORHELPER_H
#define AVX_VECTORHELPER_H

#include <limits>
#include "types.h"
#include "intrinsics.h"
#include "casts.h"

namespace Vc
{
namespace AVX
{
#define OP0(name, code) static inline VectorType name() { return code; }
#define OP1(name, code) static inline VectorType name(const VectorType &a) { return code; }
#define OP2(name, code) static inline VectorType name(const VectorType &a, const VectorType &b) { return code; }
#define OP3(name, code) static inline VectorType name(const VectorType &a, const VectorType &b, const VectorType &c) { return code; }
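
// The OPn macros stamp out static member functions taking n arguments of the
// enclosing helper's VectorType, named a, b and c. Illustrative expansion
// (not spelled out in the original source): inside VectorHelper<_M256>,
//     OP2(or_, _mm256_or_ps(a, b))
// becomes
//     static inline VectorType or_(const VectorType &a, const VectorType &b) { return _mm256_or_ps(a, b); }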

    template<> struct VectorHelper<_M256>
    {
        typedef _M256 VectorType;
        template<typename A> static VectorType load(const float *x, A) PURE;
        static void store(float *mem, const VectorType x, AlignedFlag);
        static void store(float *mem, const VectorType x, UnalignedFlag);
        static void store(float *mem, const VectorType x, StreamingAndAlignedFlag);
        static void store(float *mem, const VectorType x, StreamingAndUnalignedFlag);
        static void store(float *mem, const VectorType x, const VectorType m, AlignedFlag);
        static void store(float *mem, const VectorType x, const VectorType m, UnalignedFlag);
        static void store(float *mem, const VectorType x, const VectorType m, StreamingAndAlignedFlag);
        static void store(float *mem, const VectorType x, const VectorType m, StreamingAndUnalignedFlag);

        static VectorType cdab(VectorType x) { return _mm256_permute_ps(x, _MM_SHUFFLE(2, 3, 0, 1)); }
        static VectorType badc(VectorType x) { return _mm256_permute_ps(x, _MM_SHUFFLE(1, 0, 3, 2)); }
        static VectorType aaaa(VectorType x) { return _mm256_permute_ps(x, _MM_SHUFFLE(0, 0, 0, 0)); }
        static VectorType bbbb(VectorType x) { return _mm256_permute_ps(x, _MM_SHUFFLE(1, 1, 1, 1)); }
        static VectorType cccc(VectorType x) { return _mm256_permute_ps(x, _MM_SHUFFLE(2, 2, 2, 2)); }
        static VectorType dddd(VectorType x) { return _mm256_permute_ps(x, _MM_SHUFFLE(3, 3, 3, 3)); }
        static VectorType dacb(VectorType x) { return _mm256_permute_ps(x, _MM_SHUFFLE(3, 0, 2, 1)); }
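
        // Note: _mm256_permute_ps applies the same 4-element shuffle to each
        // 128-bit lane independently (elements named a, b, c, d from index 0,
        // names listing the result from the highest element to the lowest).
        // cdab therefore swaps neighbouring elements, badc swaps the two
        // 64-bit pairs of each lane, and aaaa..dddd broadcast one element
        // per lane.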

        OP0(allone, _mm256_setallone_ps())
        OP0(zero, _mm256_setzero_ps())
        OP2(or_, _mm256_or_ps(a, b))
        OP2(xor_, _mm256_xor_ps(a, b))
        OP2(and_, _mm256_and_ps(a, b))
        OP2(andnot_, _mm256_andnot_ps(a, b))
        OP3(blend, _mm256_blendv_ps(a, b, c))
    };

    template<> struct VectorHelper<_M256D>
    {
        typedef _M256D VectorType;
        template<typename A> static VectorType load(const double *x, A) PURE;
        static void store(double *mem, const VectorType x, AlignedFlag);
        static void store(double *mem, const VectorType x, UnalignedFlag);
        static void store(double *mem, const VectorType x, StreamingAndAlignedFlag);
        static void store(double *mem, const VectorType x, StreamingAndUnalignedFlag);
        static void store(double *mem, const VectorType x, const VectorType m, AlignedFlag);
        static void store(double *mem, const VectorType x, const VectorType m, UnalignedFlag);
        static void store(double *mem, const VectorType x, const VectorType m, StreamingAndAlignedFlag);
        static void store(double *mem, const VectorType x, const VectorType m, StreamingAndUnalignedFlag);

        static VectorType cdab(VectorType x) { return _mm256_permute_pd(x, 5); }
        static VectorType badc(VectorType x) { return _mm256_permute2f128_pd(x, x, 1); }
        // aaaa bbbb cccc dddd specialized in vector.tcc
        static VectorType dacb(VectorType x) {
            const __m128d cb = avx_cast<__m128d>(_mm_alignr_epi8(avx_cast<__m128i>(lo128(x)),
                                                                 avx_cast<__m128i>(hi128(x)), sizeof(double))); // XXX: lo and hi swapped?
            const __m128d da = _mm_blend_pd(lo128(x), hi128(x), 0 + 2); // XXX: lo and hi swapped?
            return concat(cb, da);
        }
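
        // The dacb permutation above assembles its two result halves from the
        // input halves: _mm_alignr_epi8 shifts a pair of elements into place
        // and _mm_blend_pd picks element 0 from one half and element 1 from
        // the other, so two cheap 128-bit ops replace a general 4-element
        // permute. The XXX remarks are the original author's, questioning the
        // lo128/hi128 argument order.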

        OP0(allone, _mm256_setallone_pd())
        OP0(zero, _mm256_setzero_pd())
        OP2(or_, _mm256_or_pd(a, b))
        OP2(xor_, _mm256_xor_pd(a, b))
        OP2(and_, _mm256_and_pd(a, b))
        OP2(andnot_, _mm256_andnot_pd(a, b))
        OP3(blend, _mm256_blendv_pd(a, b, c))
    };

    template<> struct VectorHelper<_M256I>
    {
        typedef _M256I VectorType;
        template<typename T> static VectorType load(const T *x, AlignedFlag) PURE;
        template<typename T> static VectorType load(const T *x, UnalignedFlag) PURE;
        template<typename T> static VectorType load(const T *x, StreamingAndAlignedFlag) PURE;
        template<typename T> static VectorType load(const T *x, StreamingAndUnalignedFlag) PURE;
        template<typename T> static void store(T *mem, const VectorType x, AlignedFlag);
        template<typename T> static void store(T *mem, const VectorType x, UnalignedFlag);
        template<typename T> static void store(T *mem, const VectorType x, StreamingAndAlignedFlag);
        template<typename T> static void store(T *mem, const VectorType x, StreamingAndUnalignedFlag);
        template<typename T> static void store(T *mem, const VectorType x, const VectorType m, AlignedFlag);
        template<typename T> static void store(T *mem, const VectorType x, const VectorType m, UnalignedFlag);
        template<typename T> static void store(T *mem, const VectorType x, const VectorType m, StreamingAndAlignedFlag);
        template<typename T> static void store(T *mem, const VectorType x, const VectorType m, StreamingAndUnalignedFlag);

        static VectorType cdab(VectorType x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<__m256>(x), _MM_SHUFFLE(2, 3, 0, 1))); }
        static VectorType badc(VectorType x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<__m256>(x), _MM_SHUFFLE(1, 0, 3, 2))); }
        static VectorType aaaa(VectorType x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<__m256>(x), _MM_SHUFFLE(0, 0, 0, 0))); }
        static VectorType bbbb(VectorType x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<__m256>(x), _MM_SHUFFLE(1, 1, 1, 1))); }
        static VectorType cccc(VectorType x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<__m256>(x), _MM_SHUFFLE(2, 2, 2, 2))); }
        static VectorType dddd(VectorType x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<__m256>(x), _MM_SHUFFLE(3, 3, 3, 3))); }
        static VectorType dacb(VectorType x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<__m256>(x), _MM_SHUFFLE(3, 0, 2, 1))); }

        OP0(allone, _mm256_setallone_si256())
        OP0(zero, _mm256_setzero_si256())
        OP2(or_, _mm256_or_si256(a, b))
        OP2(xor_, _mm256_xor_si256(a, b))
        OP2(and_, _mm256_and_si256(a, b))
        OP2(andnot_, _mm256_andnot_si256(a, b))
        OP3(blend, _mm256_blendv_epi8(a, b, c))
    };

    template<> struct VectorHelper<__m128i>
    {
        typedef __m128i VectorType;
        template<typename T> static VectorType load(const T *x, AlignedFlag) PURE;
        template<typename T> static VectorType load(const T *x, UnalignedFlag) PURE;
        template<typename T> static VectorType load(const T *x, StreamingAndAlignedFlag) PURE;
        template<typename T> static VectorType load(const T *x, StreamingAndUnalignedFlag) PURE;
        template<typename T> static void store(T *mem, const VectorType x, AlignedFlag);
        template<typename T> static void store(T *mem, const VectorType x, UnalignedFlag);
        template<typename T> static void store(T *mem, const VectorType x, StreamingAndAlignedFlag);
        template<typename T> static void store(T *mem, const VectorType x, StreamingAndUnalignedFlag);
        template<typename T> static void store(T *mem, const VectorType x, const VectorType m, AlignedFlag);
        template<typename T> static void store(T *mem, const VectorType x, const VectorType m, UnalignedFlag);
        template<typename T> static void store(T *mem, const VectorType x, const VectorType m, StreamingAndAlignedFlag);
        template<typename T> static void store(T *mem, const VectorType x, const VectorType m, StreamingAndUnalignedFlag);

        static VectorType cdab(VectorType x) { return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1)); }
        static VectorType badc(VectorType x) { return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, _MM_SHUFFLE(1, 0, 3, 2)), _MM_SHUFFLE(1, 0, 3, 2)); }
        static VectorType aaaa(VectorType x) { return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0)); }
        static VectorType bbbb(VectorType x) { return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, _MM_SHUFFLE(1, 1, 1, 1)), _MM_SHUFFLE(1, 1, 1, 1)); }
        static VectorType cccc(VectorType x) { return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(2, 2, 2, 2)); }
        static VectorType dddd(VectorType x) { return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)); }
        static VectorType dacb(VectorType x) { return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, _MM_SHUFFLE(3, 0, 2, 1)), _MM_SHUFFLE(3, 0, 2, 1)); }
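
        // Note: SSE2 has no general 16-bit permute, so each pattern is
        // applied twice, once to the low and once to the high four 16-bit
        // elements, via _mm_shufflelo_epi16 and _mm_shufflehi_epi16.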

        OP0(allone, _mm_setallone_si128())
        OP0(zero, _mm_setzero_si128())
        OP2(or_, _mm_or_si128(a, b))
        OP2(xor_, _mm_xor_si128(a, b))
        OP2(and_, _mm_and_si128(a, b))
        OP2(andnot_, _mm_andnot_si128(a, b))
        OP3(blend, _mm_blendv_epi8(a, b, c))
    };
#undef OP1
#undef OP2
#undef OP3

#define OP1(op) \
        static inline VectorType INTRINSIC CONST op(const VectorType &a) { return CAT(_mm256_##op##_, SUFFIX)(a); }
#define OP(op) \
        static inline VectorType INTRINSIC CONST op(const VectorType &a, const VectorType &b) { return CAT(_mm256_##op##_, SUFFIX)(a, b); }
#define OP_(op) \
        static inline VectorType INTRINSIC CONST op(const VectorType &a, const VectorType &b) { return CAT(_mm256_##op, SUFFIX)(a, b); }
#define OPx(op, op2) \
        static inline VectorType INTRINSIC CONST op(const VectorType &a, const VectorType &b) { return CAT(_mm256_##op2##_, SUFFIX)(a, b); }
#define OPcmp(op) \
        static inline VectorType INTRINSIC CONST cmp##op(const VectorType &a, const VectorType &b) { return CAT(_mm256_cmp##op##_, SUFFIX)(a, b); }
#define OP_CAST_(op) \
        static inline VectorType INTRINSIC CONST op(const VectorType &a, const VectorType &b) { return CAT(_mm256_castps_, SUFFIX)( \
            _mm256_##op##ps(CAT(CAT(_mm256_cast, SUFFIX), _ps)(a), \
                            CAT(CAT(_mm256_cast, SUFFIX), _ps)(b))); \
        }
#define MINMAX \
        static inline VectorType INTRINSIC CONST min(VectorType a, VectorType b) { return CAT(_mm256_min_, SUFFIX)(a, b); } \
        static inline VectorType INTRINSIC CONST max(VectorType a, VectorType b) { return CAT(_mm256_max_, SUFFIX)(a, b); }
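
// These macros build per-type member functions from the SUFFIX macro that
// each specialization defines. Illustrative expansion (not spelled out in
// the original source): with SUFFIX defined as pd, OP(add) produces
//     static inline VectorType INTRINSIC CONST add(const VectorType &a, const VectorType &b) { return _mm256_add_pd(a, b); }
// and OPcmp(eq) produces cmpeq(), calling _mm256_cmpeq_pd(a, b), presumably
// one of the named compare wrappers that Vc's intrinsics.h layers on top of
// _mm256_cmp_pd.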

    template<> struct VectorHelper<double> {
        typedef _M256D VectorType;
        typedef double EntryType;
        typedef double ConcatType;
#define SUFFIX pd

        static inline VectorType notMaskedToZero(VectorType a, _M256 mask) { return CAT(_mm256_and_, SUFFIX)(_mm256_castps_pd(mask), a); }
        static inline VectorType set(const double a) { return CAT(_mm256_set1_, SUFFIX)(a); }
        static inline VectorType set(const double a, const double b, const double c, const double d) {
            return CAT(_mm256_set_, SUFFIX)(a, b, c, d);
        }
        static inline VectorType zero() { return CAT(_mm256_setzero_, SUFFIX)(); }
        static inline VectorType one() { return CAT(_mm256_setone_, SUFFIX)(); }// set(1.); }

        static inline void multiplyAndAdd(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }
        static inline VectorType mul(VectorType a, VectorType b, _M256 _mask) {
            _M256D mask = _mm256_castps_pd(_mask);
            return _mm256_or_pd(
                _mm256_and_pd(mask, _mm256_mul_pd(a, b)),
                _mm256_andnot_pd(mask, a)
                );
        }
        static inline VectorType div(VectorType a, VectorType b, _M256 _mask) {
            _M256D mask = _mm256_castps_pd(_mask);
            return _mm256_or_pd(
                _mm256_and_pd(mask, _mm256_div_pd(a, b)),
                _mm256_andnot_pd(mask, a)
                );
        }
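
        // The masked mul/div above use the classic and/or/andnot blend:
        // (mask & (a op b)) | (~mask & a) keeps the result where the mask is
        // all-ones and the original a where it is all-zeros. The full-width
        // operation still executes on every lane; the blend merely selects
        // which results are kept.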

        OP(add) OP(sub) OP(mul)
        OPcmp(eq) OPcmp(neq)
        OPcmp(lt) OPcmp(nlt)
        OPcmp(le) OPcmp(nle)

        OP1(sqrt)
        static inline VectorType rsqrt(VectorType x) {
            return _mm256_div_pd(one(), sqrt(x));
        }
        static inline VectorType reciprocal(VectorType x) {
            return _mm256_div_pd(one(), x);
        }
        static inline VectorType isNaN(VectorType x) {
            return _mm256_cmpunord_pd(x, x);
        }
        static inline VectorType isFinite(VectorType x) {
            return _mm256_cmpord_pd(x, _mm256_mul_pd(zero(), x));
        }
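
        // isNaN relies on NaN != NaN: an unordered self-compare is true only
        // for NaN lanes. isFinite multiplies by zero first: 0 * x is NaN for
        // x = +-inf or NaN, so the ordered compare yields all-ones exactly
        // for finite lanes.
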
        static inline VectorType abs(const VectorType a) {
            return CAT(_mm256_and_, SUFFIX)(a, _mm256_setabsmask_pd());
        }

        MINMAX
        static inline EntryType min(VectorType a) {
            __m128d b = _mm_min_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
            b = _mm_min_sd(b, _mm_unpackhi_pd(b, b));
            return _mm_cvtsd_f64(b);
        }
        static inline EntryType max(VectorType a) {
            __m128d b = _mm_max_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
            b = _mm_max_sd(b, _mm_unpackhi_pd(b, b));
            return _mm_cvtsd_f64(b);
        }
        static inline EntryType mul(VectorType a) {
            __m128d b = _mm_mul_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
            b = _mm_mul_sd(b, _mm_shuffle_pd(b, b, _MM_SHUFFLE2(0, 1)));
            return _mm_cvtsd_f64(b);
        }
        static inline EntryType add(VectorType a) {
            __m128d b = _mm_add_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
            b = _mm_hadd_pd(b, b); // or: b = _mm_add_sd(b, _mm_shuffle_pd(b, b, _MM_SHUFFLE2(0, 1)));
            return _mm_cvtsd_f64(b);
        }
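
        // All four horizontal reductions follow the same log2 pattern: fold
        // the upper 128-bit half onto the lower with one packed op, then
        // combine the two remaining doubles. Illustrative use (a sketch, not
        // part of this header):
        //     __m256d v = _mm256_setr_pd(1., 2., 3., 4.);
        //     double s = VectorHelper<double>::add(v); // s == 10.0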
#undef SUFFIX
        static inline VectorType round(VectorType a) {
            return _mm256_round_pd(a, _MM_FROUND_NINT);
        }
    };

    template<> struct VectorHelper<float> {
        typedef float EntryType;
        typedef _M256 VectorType;
        typedef double ConcatType;
#define SUFFIX ps

        static inline VectorType notMaskedToZero(VectorType a, _M256 mask) { return CAT(_mm256_and_, SUFFIX)(mask, a); }
        static inline VectorType set(const float a) { return CAT(_mm256_set1_, SUFFIX)(a); }
        static inline VectorType set(const float a, const float b, const float c, const float d,
                                     const float e, const float f, const float g, const float h) {
            return CAT(_mm256_set_, SUFFIX)(a, b, c, d, e, f, g, h); }
        static inline VectorType zero() { return CAT(_mm256_setzero_, SUFFIX)(); }
        static inline VectorType one() { return CAT(_mm256_setone_, SUFFIX)(); }// set(1.f); }
        static inline _M256 concat(_M256D a, _M256D b) { return _mm256_insertf128_ps(avx_cast<_M256>(_mm256_cvtpd_ps(a)), _mm256_cvtpd_ps(b), 1); }

        static inline void multiplyAndAdd(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }
        static inline VectorType mul(VectorType a, VectorType b, _M256 mask) {
            return _mm256_or_ps(
                _mm256_and_ps(mask, _mm256_mul_ps(a, b)),
                _mm256_andnot_ps(mask, a)
                );
        }
        static inline VectorType div(VectorType a, VectorType b, _M256 mask) {
            return _mm256_or_ps(
                _mm256_and_ps(mask, _mm256_div_ps(a, b)),
                _mm256_andnot_ps(mask, a)
                );
        }

        OP(add) OP(sub) OP(mul)
        OPcmp(eq) OPcmp(neq)
        OPcmp(lt) OPcmp(nlt)
        OPcmp(le) OPcmp(nle)

        OP1(sqrt) OP1(rsqrt)
        static inline VectorType isNaN(VectorType x) {
            return _mm256_cmpunord_ps(x, x);
        }
        static inline VectorType isFinite(VectorType x) {
            return _mm256_cmpord_ps(x, _mm256_mul_ps(zero(), x));
        }
        static inline VectorType reciprocal(VectorType x) {
            return _mm256_rcp_ps(x);
        }
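
        // Unlike VectorHelper<double>::reciprocal, this uses the hardware
        // approximation instruction: _mm256_rcp_ps guarantees only about 12
        // bits of precision (the rsqrt above is a similar approximation),
        // trading accuracy for a much shorter latency than a full division.
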
        static inline VectorType abs(const VectorType a) {
            return CAT(_mm256_and_, SUFFIX)(a, _mm256_setabsmask_ps());
        }

        MINMAX
        static inline EntryType min(VectorType a) {
            __m128 b = _mm_min_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1));
            b = _mm_min_ps(b, _mm_movehl_ps(b, b)); // b = min(a0, a2), min(a1, a3), min(a2, a2), min(a3, a3)
            b = _mm_min_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1))); // b = min(a0, a1), a1, a2, a3
            return _mm_cvtss_f32(b);
        }
        static inline EntryType max(VectorType a) {
            __m128 b = _mm_max_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1));
            b = _mm_max_ps(b, _mm_movehl_ps(b, b)); // b = max(a0, a2), max(a1, a3), max(a2, a2), max(a3, a3)
            b = _mm_max_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1))); // b = max(a0, a1), a1, a2, a3
            return _mm_cvtss_f32(b);
        }
        static inline EntryType mul(VectorType a) {
            __m128 b = _mm_mul_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1));
            b = _mm_mul_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3)));
            b = _mm_mul_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1)));
            return _mm_cvtss_f32(b);
        }
        static inline EntryType add(VectorType a) {
            __m128 b = _mm_add_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1));
            b = _mm_add_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3)));
            b = _mm_add_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1)));
            return _mm_cvtss_f32(b);
        }
#undef SUFFIX
        static inline VectorType round(VectorType a) {
            return _mm256_round_ps(a, _MM_FROUND_NINT);
        }
    };

    template<> struct VectorHelper<sfloat> : public VectorHelper<float> {};
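
    // sfloat is Vc's single-precision type whose vector width matches the
    // short vector types; with AVX a float vector is already 8 entries wide,
    // so it can simply reuse VectorHelper<float>.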

    template<> struct VectorHelper<int> {
        typedef int EntryType;
        typedef _M256I VectorType;
        typedef long long ConcatType;
#define SUFFIX si256

        OP_(or_) OP_(and_) OP_(xor_)
        static inline VectorType INTRINSIC CONST zero() { return CAT(_mm256_setzero_, SUFFIX)(); }
        static inline VectorType INTRINSIC CONST notMaskedToZero(VectorType a, _M256 mask) { return CAT(_mm256_and_, SUFFIX)(_mm256_castps_si256(mask), a); }
#undef SUFFIX
#define SUFFIX epi32
        static inline VectorType INTRINSIC CONST one() { return CAT(_mm256_setone_, SUFFIX)(); }

        static inline VectorType INTRINSIC CONST set(const int a) { return CAT(_mm256_set1_, SUFFIX)(a); }
        static inline VectorType INTRINSIC CONST set(const int a, const int b, const int c, const int d,
                                                     const int e, const int f, const int g, const int h) {
            return CAT(_mm256_set_, SUFFIX)(a, b, c, d, e, f, g, h); }

        static inline void INTRINSIC CONST multiplyAndAdd(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }

        static inline VectorType shiftLeft(VectorType a, int shift) {
            return CAT(_mm256_slli_, SUFFIX)(a, shift);
        }
        static inline VectorType shiftRight(VectorType a, int shift) {
            return CAT(_mm256_srai_, SUFFIX)(a, shift);
        }
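
        // shiftRight uses srai, the arithmetic (sign-extending) right shift,
        // as is appropriate for this signed specialization; the unsigned int
        // helper below uses srli instead. Since plain AVX has no 256-bit
        // integer instructions, _mm256_slli_epi32 and friends here are
        // presumably the two-halves emulations provided by Vc's intrinsics.h.
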
        OP1(abs)

        MINMAX
        static inline EntryType INTRINSIC CONST min(VectorType a) {
            __m128i b = _mm_min_epi32(avx_cast<__m128i>(a), _mm256_extractf128_si256(a, 1));
            b = _mm_min_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
            b = _mm_min_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here
            return _mm_cvtsi128_si32(b);
        }
        static inline EntryType INTRINSIC CONST max(VectorType a) {
            __m128i b = _mm_max_epi32(avx_cast<__m128i>(a), _mm256_extractf128_si256(a, 1));
            b = _mm_max_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
            b = _mm_max_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here
            return _mm_cvtsi128_si32(b);
        }
        static inline EntryType INTRINSIC CONST add(VectorType a) {
            __m128i b = _mm_add_epi32(avx_cast<__m128i>(a), _mm256_extractf128_si256(a, 1));
            b = _mm_add_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
            b = _mm_add_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2)));
            return _mm_cvtsi128_si32(b);
        }
        static inline EntryType INTRINSIC CONST mul(VectorType a) {
            __m128i b = _mm_mullo_epi32(avx_cast<__m128i>(a), _mm256_extractf128_si256(a, 1));
            b = _mm_mullo_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
            b = _mm_mullo_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2)));
            return _mm_cvtsi128_si32(b);
        }

        static inline VectorType INTRINSIC CONST mul(VectorType a, VectorType b) { return _mm256_mullo_epi32(a, b); }

        OP(add) OP(sub)
        OPcmp(eq)
        OPcmp(lt)
        OPcmp(gt)
        static inline VectorType INTRINSIC CONST cmpneq(const VectorType &a, const VectorType &b) { _M256I x = cmpeq(a, b); return _mm256_andnot_si256(x, _mm256_setallone_si256()); }
        static inline VectorType INTRINSIC CONST cmpnlt(const VectorType &a, const VectorType &b) { _M256I x = cmplt(a, b); return _mm256_andnot_si256(x, _mm256_setallone_si256()); }
        static inline VectorType INTRINSIC CONST cmple (const VectorType &a, const VectorType &b) { _M256I x = cmpgt(a, b); return _mm256_andnot_si256(x, _mm256_setallone_si256()); }
        static inline VectorType INTRINSIC CONST cmpnle(const VectorType &a, const VectorType &b) { return cmpgt(a, b); }
#undef SUFFIX
        static inline VectorType INTRINSIC CONST round(VectorType a) { return a; }
    };

    template<> struct VectorHelper<unsigned int> {
        typedef unsigned int EntryType;
        typedef _M256I VectorType;
        typedef unsigned long long ConcatType;
#define SUFFIX si256
        OP_CAST_(or_) OP_CAST_(and_) OP_CAST_(xor_)
        static inline VectorType INTRINSIC CONST zero() { return CAT(_mm256_setzero_, SUFFIX)(); }
        static inline VectorType INTRINSIC CONST notMaskedToZero(VectorType a, _M256 mask) { return CAT(_mm256_and_, SUFFIX)(_mm256_castps_si256(mask), a); }

#undef SUFFIX
#define SUFFIX epu32
        static inline VectorType INTRINSIC CONST one() { return CAT(_mm256_setone_, SUFFIX)(); }

        MINMAX
        static inline EntryType INTRINSIC CONST min(VectorType a) {
            __m128i b = _mm_min_epu32(avx_cast<__m128i>(a), _mm256_extractf128_si256(a, 1));
            b = _mm_min_epu32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
            b = _mm_min_epu32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here
            return _mm_cvtsi128_si32(b);
        }
        static inline EntryType INTRINSIC CONST max(VectorType a) {
            __m128i b = _mm_max_epu32(avx_cast<__m128i>(a), _mm256_extractf128_si256(a, 1));
            b = _mm_max_epu32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
            b = _mm_max_epu32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here
            return _mm_cvtsi128_si32(b);
        }
        static inline EntryType INTRINSIC CONST add(VectorType a) {
            __m128i b = _mm_add_epi32(avx_cast<__m128i>(a), _mm256_extractf128_si256(a, 1));
            b = _mm_add_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
            b = _mm_add_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2)));
            return _mm_cvtsi128_si32(b);
        }
        static inline EntryType INTRINSIC CONST mul(VectorType a) {
            __m128i b = _mm_mullo_epi32(avx_cast<__m128i>(a), _mm256_extractf128_si256(a, 1));
            b = _mm_mullo_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
            b = _mm_mullo_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2)));
            return _mm_cvtsi128_si32(b);
        }

        static inline VectorType INTRINSIC CONST mul(VectorType a, VectorType b) { return _mm256_mullo_epi32(a, b); }

#undef SUFFIX
#define SUFFIX epi32
        static inline VectorType shiftLeft(VectorType a, int shift) {
            return CAT(_mm256_slli_, SUFFIX)(a, shift);
        }
        static inline VectorType shiftRight(VectorType a, int shift) {
            return CAT(_mm256_srli_, SUFFIX)(a, shift);
        }
        static inline VectorType INTRINSIC CONST set(const unsigned int a) { return CAT(_mm256_set1_, SUFFIX)(a); }
        static inline VectorType INTRINSIC CONST set(const unsigned int a, const unsigned int b, const unsigned int c, const unsigned int d,
                                                     const unsigned int e, const unsigned int f, const unsigned int g, const unsigned int h) {
            return CAT(_mm256_set_, SUFFIX)(a, b, c, d, e, f, g, h); }

        OP(add) OP(sub)
        OPcmp(eq)
        static inline VectorType INTRINSIC CONST cmpneq(const VectorType &a, const VectorType &b) { return _mm256_andnot_si256(cmpeq(a, b), _mm256_setallone_si256()); }

#ifndef USE_INCORRECT_UNSIGNED_COMPARE
        static inline VectorType INTRINSIC CONST cmplt(const VectorType &a, const VectorType &b) {
            return _mm256_cmplt_epu32(a, b);
        }
        static inline VectorType INTRINSIC CONST cmpgt(const VectorType &a, const VectorType &b) {
            return _mm256_cmpgt_epu32(a, b);
        }
#else
        OPcmp(lt)
        OPcmp(gt)
#endif
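        // SSE/AVX only provide signed integer compares; _mm256_cmplt_epu32
        // and _mm256_cmpgt_epu32 are Vc-provided helpers (presumably biasing
        // both operands by 0x80000000 before a signed compare). Defining
        // USE_INCORRECT_UNSIGNED_COMPARE selects the raw signed compare
        // instead, which is faster but wrong for values with the top bit set.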
        static inline VectorType INTRINSIC CONST cmpnlt(const VectorType &a, const VectorType &b) { return _mm256_andnot_si256(cmplt(a, b), _mm256_setallone_si256()); }
        static inline VectorType INTRINSIC CONST cmple (const VectorType &a, const VectorType &b) { return _mm256_andnot_si256(cmpgt(a, b), _mm256_setallone_si256()); }
        static inline VectorType INTRINSIC CONST cmpnle(const VectorType &a, const VectorType &b) { return cmpgt(a, b); }

#undef SUFFIX
        static inline VectorType INTRINSIC CONST round(VectorType a) { return a; }
    };

    template<> struct VectorHelper<signed short> {
        typedef VectorTypeHelper<signed short>::Type VectorType;
        typedef signed short EntryType;
        typedef int ConcatType;

        static inline VectorType INTRINSIC CONST or_(VectorType a, VectorType b) { return _mm_or_si128(a, b); }
        static inline VectorType INTRINSIC CONST and_(VectorType a, VectorType b) { return _mm_and_si128(a, b); }
        static inline VectorType INTRINSIC CONST xor_(VectorType a, VectorType b) { return _mm_xor_si128(a, b); }
        static inline VectorType INTRINSIC CONST zero() { return _mm_setzero_si128(); }
        static inline VectorType INTRINSIC CONST notMaskedToZero(VectorType a, __m128 mask) { return _mm_and_si128(_mm_castps_si128(mask), a); }

#define SUFFIX epi16
        static inline VectorType INTRINSIC CONST one() { return CAT(_mm_setone_, SUFFIX)(); }

        static inline VectorType shiftLeft(VectorType a, int shift) {
            return CAT(_mm_slli_, SUFFIX)(a, shift);
        }
        static inline VectorType shiftRight(VectorType a, int shift) {
            return CAT(_mm_srai_, SUFFIX)(a, shift);
        }
        static inline VectorType INTRINSIC CONST set(const EntryType a) { return CAT(_mm_set1_, SUFFIX)(a); }
        static inline VectorType INTRINSIC CONST set(const EntryType a, const EntryType b, const EntryType c, const EntryType d,
                                                     const EntryType e, const EntryType f, const EntryType g, const EntryType h) {
            return CAT(_mm_set_, SUFFIX)(a, b, c, d, e, f, g, h);
        }

        static inline void INTRINSIC CONST multiplyAndAdd(VectorType &v1, VectorType v2, VectorType v3) {
            v1 = add(mul(v1, v2), v3); }

        static inline VectorType INTRINSIC CONST abs(VectorType a) { return _mm_abs_epi16(a); }
        static inline VectorType INTRINSIC CONST mul(VectorType a, VectorType b) { return _mm_mullo_epi16(a, b); }
        static inline VectorType INTRINSIC CONST min(VectorType a, VectorType b) { return _mm_min_epi16(a, b); }
        static inline VectorType INTRINSIC CONST max(VectorType a, VectorType b) { return _mm_max_epi16(a, b); }

        static inline EntryType INTRINSIC CONST min(VectorType a) {
            // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
            a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
            return _mm_cvtsi128_si32(a); // & 0xffff is implicit
        }
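
        // These 16-bit reductions (min above, max/mul/add below) fold 8
        // elements in three steps: swap the 64-bit halves, then the 32-bit
        // pairs of the low half, then the two remaining 16-bit elements.
        // After the first fold the result lives in the low elements, so the
        // cheaper _mm_shufflelo_epi16 suffices. _mm_cvtsi128_si32 returns 32
        // bits; the conversion to the 16-bit EntryType truncates, hence the
        // "& 0xffff is implicit" remarks.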
        static inline EntryType INTRINSIC CONST max(VectorType a) {
            // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
            a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
            return _mm_cvtsi128_si32(a); // & 0xffff is implicit
        }
        static inline EntryType INTRINSIC CONST mul(VectorType a) {
            a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
            return _mm_cvtsi128_si32(a); // & 0xffff is implicit
        }
        static inline EntryType INTRINSIC CONST add(VectorType a) {
            a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
            return _mm_cvtsi128_si32(a); // & 0xffff is implicit
        }

        static inline VectorType INTRINSIC CONST add(VectorType a, VectorType b) { return _mm_add_epi16(a, b); }
        static inline VectorType INTRINSIC CONST sub(VectorType a, VectorType b) { return _mm_sub_epi16(a, b); }
        static inline VectorType INTRINSIC CONST cmpeq(VectorType a, VectorType b) { return _mm_cmpeq_epi16(a, b); }
        static inline VectorType INTRINSIC CONST cmplt(VectorType a, VectorType b) { return _mm_cmplt_epi16(a, b); }
        static inline VectorType INTRINSIC CONST cmpgt(VectorType a, VectorType b) { return _mm_cmpgt_epi16(a, b); }
        static inline VectorType cmpneq(const VectorType &a, const VectorType &b) { __m128i x = cmpeq(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
        static inline VectorType cmpnlt(const VectorType &a, const VectorType &b) { __m128i x = cmplt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
        static inline VectorType cmple (const VectorType &a, const VectorType &b) { __m128i x = cmpgt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
        static inline VectorType cmpnle(const VectorType &a, const VectorType &b) { return cmpgt(a, b); }
#undef SUFFIX
        static inline VectorType round(VectorType a) { return a; }
    };

    template<> struct VectorHelper<unsigned short> {
        typedef VectorTypeHelper<unsigned short>::Type VectorType;
        typedef unsigned short EntryType;
        typedef unsigned int ConcatType;

        static inline VectorType INTRINSIC CONST or_(VectorType a, VectorType b) { return _mm_or_si128(a, b); }
        static inline VectorType INTRINSIC CONST and_(VectorType a, VectorType b) { return _mm_and_si128(a, b); }
        static inline VectorType INTRINSIC CONST xor_(VectorType a, VectorType b) { return _mm_xor_si128(a, b); }
        static inline VectorType INTRINSIC CONST zero() { return _mm_setzero_si128(); }
        static inline VectorType INTRINSIC CONST notMaskedToZero(VectorType a, __m128 mask) { return _mm_and_si128(_mm_castps_si128(mask), a); }
        static inline VectorType INTRINSIC CONST one() { return _mm_setone_epu16(); }

        static inline VectorType INTRINSIC CONST mul(VectorType a, VectorType b) { return _mm_mullo_epi16(a, b); }
        static inline VectorType INTRINSIC CONST min(VectorType a, VectorType b) { return _mm_min_epu16(a, b); }
        static inline VectorType INTRINSIC CONST max(VectorType a, VectorType b) { return _mm_max_epu16(a, b); }

#define SUFFIX epi16
        static inline VectorType shiftLeft(VectorType a, int shift) {
            return CAT(_mm_slli_, SUFFIX)(a, shift);
        }
        static inline VectorType shiftRight(VectorType a, int shift) {
            return CAT(_mm_srli_, SUFFIX)(a, shift);
        }
        static inline EntryType INTRINSIC CONST min(VectorType a) {
            // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
            a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
            return _mm_cvtsi128_si32(a); // & 0xffff is implicit
        }
        static inline EntryType INTRINSIC CONST max(VectorType a) {
            // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
            a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
            return _mm_cvtsi128_si32(a); // & 0xffff is implicit
        }
        static inline EntryType INTRINSIC CONST mul(VectorType a) {
            // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
            a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
            return _mm_cvtsi128_si32(a); // & 0xffff is implicit
        }
        static inline EntryType INTRINSIC CONST add(VectorType a) {
            // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
            a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
            return _mm_cvtsi128_si32(a); // & 0xffff is implicit
        }
        static inline VectorType INTRINSIC CONST set(const EntryType a) { return CAT(_mm_set1_, SUFFIX)(a); }
        static inline VectorType INTRINSIC CONST set(const EntryType a, const EntryType b, const EntryType c,
                                                     const EntryType d, const EntryType e, const EntryType f,
                                                     const EntryType g, const EntryType h) {
            return CAT(_mm_set_, SUFFIX)(a, b, c, d, e, f, g, h);
        }

        static inline VectorType INTRINSIC CONST add(VectorType a, VectorType b) { return _mm_add_epi16(a, b); }
        static inline VectorType INTRINSIC CONST sub(VectorType a, VectorType b) { return _mm_sub_epi16(a, b); }
        static inline VectorType INTRINSIC CONST cmpeq(VectorType a, VectorType b) { return _mm_cmpeq_epi16(a, b); }
        static inline VectorType cmpneq(const VectorType &a, const VectorType &b) { return _mm_andnot_si128(cmpeq(a, b), _mm_setallone_si128()); }

#ifndef USE_INCORRECT_UNSIGNED_COMPARE
        static inline VectorType INTRINSIC CONST cmplt(VectorType a, VectorType b) { return _mm_cmplt_epu16(a, b); }
        static inline VectorType INTRINSIC CONST cmpgt(VectorType a, VectorType b) { return _mm_cmpgt_epu16(a, b); }
#else
        static inline VectorType INTRINSIC CONST cmplt(VectorType a, VectorType b) { return _mm_cmplt_epi16(a, b); }
        static inline VectorType INTRINSIC CONST cmpgt(VectorType a, VectorType b) { return _mm_cmpgt_epi16(a, b); }
#endif
        static inline VectorType cmpnlt(const VectorType &a, const VectorType &b) { return _mm_andnot_si128(cmplt(a, b), _mm_setallone_si128()); }
        static inline VectorType cmple (const VectorType &a, const VectorType &b) { return _mm_andnot_si128(cmpgt(a, b), _mm_setallone_si128()); }
        static inline VectorType cmpnle(const VectorType &a, const VectorType &b) { return cmpgt(a, b); }
#undef SUFFIX
        static inline VectorType round(VectorType a) { return a; }
    };
#undef OP1
#undef OP
#undef OP_
#undef OPx
#undef OPcmp

template<> struct VectorHelper<char>
{
    typedef VectorTypeHelper<char>::Type VectorType;
    typedef char EntryType;
    typedef short ConcatType;
};

template<> struct VectorHelper<unsigned char>
{
    typedef VectorTypeHelper<unsigned char>::Type VectorType;
    typedef unsigned char EntryType;
    typedef unsigned short ConcatType;
};
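
// Only the typedefs are provided for char and unsigned char: no arithmetic,
// compare, or load/store members are defined for these entry types in this
// header, so they are presumably handled elsewhere (or not vectorized) in
// the AVX implementation.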

} // namespace AVX
} // namespace Vc

#include "vectorhelper.tcc"

#endif // AVX_VECTORHELPER_H