1 /* This file is part of the Vc library.
3 Copyright (C) 2009-2012 Matthias Kretz <kretz@kde.org>
5 Vc is free software: you can redistribute it and/or modify
6 it under the terms of the GNU Lesser General Public License as
7 published by the Free Software Foundation, either version 3 of
8 the License, or (at your option) any later version.
10 Vc is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with Vc. If not, see <http://www.gnu.org/licenses/>.
20 #ifndef AVX_VECTORHELPER_H
21 #define AVX_VECTORHELPER_H
25 #include "intrinsics.h"
// Internal helper: per-entry binary exponent of 8 packed floats, returned as float.
// Shifts each IEEE-754 single's bit pattern right by 23 to isolate the 8-bit
// exponent field, subtracts the bias (0x7f), and converts to float.
// NOTE(review): zeros, denormals, infinities and NaNs get no special handling
// here — presumably callers pass normalized finite values; confirm at call sites.
37 Vc_INTRINSIC Vc_CONST m256 exponent(param256 v)
39 m128i tmp0 = _mm_srli_epi32(avx_cast<m128i>(v), 23);
// The upper 128-bit half is handled separately because the 32-bit integer
// shift used here is a 128-bit SSE operation.
40 m128i tmp1 = _mm_srli_epi32(avx_cast<m128i>(hi128(v)), 23);
41 tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x7f));
42 tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x7f));
43 return _mm256_cvtepi32_ps(concat(tmp0, tmp1));
// Internal helper: per-entry binary exponent of 4 packed doubles, returned as double.
// A 64-bit right shift by 52 leaves each 11-bit exponent in the low 32 bits of
// its 64-bit lane; the other 32-bit lanes are zero after the shift.
45 Vc_INTRINSIC Vc_CONST m256d exponent(param256d v)
47 m128i tmp0 = _mm_srli_epi64(avx_cast<m128i>(v), 52);
48 m128i tmp1 = _mm_srli_epi64(avx_cast<m128i>(hi128(v)), 52);
// The epi32 subtract also touches the zeroed odd 32-bit lanes (making them
// -0x3ff), but those lanes are discarded by the shuffle below.
49 tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x3ff));
50 tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x3ff));
// Gather 32-bit lanes 0 and 2 from each half (the unbiased exponents) and
// widen the four resulting ints to double.
51 return _mm256_cvtepi32_pd(avx_cast<m128i>(Mem::shuffle<X0, X2, Y0, Y2>(avx_cast<m128>(tmp0), avx_cast<m128>(tmp1))));
53 } // namespace Internal
// Generator macros for the one-line static members of the raw-vector-type
// VectorHelper specializations below: OPn(name, code) emits a static function
// taking n vector arguments (a, b, c) whose body returns the given expression.
55 #define OP0(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name() { return code; }
56 #define OP1(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(VTArg a) { return code; }
57 #define OP2(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(VTArg a, VTArg b) { return code; }
58 #define OP3(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(VTArg a, VTArg b, VTArg c) { return code; }
// VectorHelper specialization for the raw AVX float register (8 x float).
// Supplies load/store entry points (bodies are defined out of line), per-lane
// swizzles, and the bitwise primitives used by the generic Vector<T> code.
60 template<> struct VectorHelper<m256>
62 typedef m256 VectorType;
// On ABIs where passing a 256-bit vector by value is miscompiled, arguments
// are taken by const reference instead of by const value.
63 #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
64 typedef const VectorType & VTArg;
66 typedef const VectorType VTArg;
// Aligned / unaligned / streaming loads and stores; the overloads taking an
// extra VTArg m are masked stores. Definitions live elsewhere (presumably
// vector.tcc — the m256d specialization below references that file).
68 template<typename A> static Vc_ALWAYS_INLINE_L Vc_PURE_L VectorType load(const float *x, A) Vc_ALWAYS_INLINE_R Vc_PURE_R;
69 static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, AlignedFlag) Vc_ALWAYS_INLINE_R;
70 static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, UnalignedFlag) Vc_ALWAYS_INLINE_R;
71 static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R;
72 static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R;
73 static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, VTArg m, AlignedFlag) Vc_ALWAYS_INLINE_R;
74 static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, VTArg m, UnalignedFlag) Vc_ALWAYS_INLINE_R;
75 static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, VTArg m, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R;
76 static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R;
// Swizzles: _mm256_permute_ps applies the same 4-entry pattern to both
// 128-bit halves, so the names describe positions within each half
// (e.g. cdab swaps neighbouring pairs, aaaa broadcasts entry 0 of each half).
78 static Vc_ALWAYS_INLINE Vc_CONST VectorType cdab(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(2, 3, 0, 1)); }
79 static Vc_ALWAYS_INLINE Vc_CONST VectorType badc(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(1, 0, 3, 2)); }
80 static Vc_ALWAYS_INLINE Vc_CONST VectorType aaaa(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(0, 0, 0, 0)); }
81 static Vc_ALWAYS_INLINE Vc_CONST VectorType bbbb(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(1, 1, 1, 1)); }
82 static Vc_ALWAYS_INLINE Vc_CONST VectorType cccc(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(2, 2, 2, 2)); }
83 static Vc_ALWAYS_INLINE Vc_CONST VectorType dddd(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(3, 3, 3, 3)); }
84 static Vc_ALWAYS_INLINE Vc_CONST VectorType dacb(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(3, 0, 2, 1)); }
// Constant generators and bitwise primitives, emitted via the OPn macros.
86 OP0(allone, _mm256_setallone_ps())
87 OP0(zero, _mm256_setzero_ps())
88 OP2(or_, _mm256_or_ps(a, b))
89 OP2(xor_, _mm256_xor_ps(a, b))
90 OP2(and_, _mm256_and_ps(a, b))
91 OP2(andnot_, _mm256_andnot_ps(a, b))
92 OP3(blend, _mm256_blendv_ps(a, b, c))
// VectorHelper specialization for the raw AVX double register (4 x double).
95 template<> struct VectorHelper<m256d>
97 typedef m256d VectorType;
// Work around ABIs where by-value __m256d parameters are miscompiled.
98 #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
99 typedef const VectorType & VTArg;
101 typedef const VectorType VTArg;
// Load/store entry points; masked-store overloads take an extra mask m.
103 template<typename A> static Vc_ALWAYS_INLINE_L Vc_PURE_L VectorType load(const double *x, A) Vc_ALWAYS_INLINE_R Vc_PURE_R;
104 static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, AlignedFlag) Vc_ALWAYS_INLINE_R;
105 static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, UnalignedFlag) Vc_ALWAYS_INLINE_R;
106 static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R;
107 static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R;
108 static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, VTArg m, AlignedFlag) Vc_ALWAYS_INLINE_R;
109 static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, VTArg m, UnalignedFlag) Vc_ALWAYS_INLINE_R;
110 static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, VTArg m, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R;
111 static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R;
// cdab: imm 0b0101 swaps the two doubles inside each 128-bit half.
113 static VectorType cdab(VTArg x) { return _mm256_permute_pd(x, 5); }
// badc: permute2f128 with imm 1 exchanges the low and high 128-bit halves.
114 static VectorType badc(VTArg x) { return _mm256_permute2f128_pd(x, x, 1); }
115 // aaaa bbbb cccc dddd specialized in vector.tcc
// dacb: builds (d, a, c, b) from (a, b, c, d) via an alignr byte-rotate for
// the (c, b) pair and a blend for the (d, a) pair. The pre-existing XXX
// comments below question the lo/hi operand order — that suspicion stands
// unresolved; verify against vector.tcc's expectations before relying on it.
116 static VectorType dacb(VTArg x) {
117 const m128d cb = avx_cast<m128d>(_mm_alignr_epi8(avx_cast<m128i>(lo128(x)),
118 avx_cast<m128i>(hi128(x)), sizeof(double))); // XXX: lo and hi swapped?
119 const m128d da = _mm_blend_pd(lo128(x), hi128(x), 0 + 2); // XXX: lo and hi swapped?
120 return concat(cb, da);
// Constant generators and bitwise primitives (OPn macros above).
123 OP0(allone, _mm256_setallone_pd())
124 OP0(zero, _mm256_setzero_pd())
125 OP2(or_, _mm256_or_pd(a, b))
126 OP2(xor_, _mm256_xor_pd(a, b))
127 OP2(and_, _mm256_and_pd(a, b))
128 OP2(andnot_, _mm256_andnot_pd(a, b))
129 OP3(blend, _mm256_blendv_pd(a, b, c))
// VectorHelper specialization for the raw 256-bit integer register.
132 template<> struct VectorHelper<m256i>
134 typedef m256i VectorType;
135 #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
136 typedef const VectorType & VTArg;
138 typedef const VectorType VTArg;
// Loads/stores are templated on the element type T; masked-store overloads
// take an extra mask m. Definitions are out of line.
140 template<typename T> static VectorType load(const T *x, AlignedFlag) Vc_PURE;
141 template<typename T> static VectorType load(const T *x, UnalignedFlag) Vc_PURE;
142 template<typename T> static VectorType load(const T *x, StreamingAndAlignedFlag) Vc_PURE;
143 template<typename T> static VectorType load(const T *x, StreamingAndUnalignedFlag) Vc_PURE;
144 template<typename T> static void store(T *mem, VTArg x, AlignedFlag);
145 template<typename T> static void store(T *mem, VTArg x, UnalignedFlag);
146 template<typename T> static void store(T *mem, VTArg x, StreamingAndAlignedFlag);
147 template<typename T> static void store(T *mem, VTArg x, StreamingAndUnalignedFlag);
148 template<typename T> static void store(T *mem, VTArg x, VTArg m, AlignedFlag);
149 template<typename T> static void store(T *mem, VTArg x, VTArg m, UnalignedFlag);
150 template<typename T> static void store(T *mem, VTArg x, VTArg m, StreamingAndAlignedFlag);
151 template<typename T> static void store(T *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag);
// Swizzles treat the register as 32-bit entries and reuse the float permute
// by bit-casting, since AVX1 offers no 256-bit integer shuffle.
153 static VectorType cdab(VTArg x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<m256>(x), _MM_SHUFFLE(2, 3, 0, 1))); }
154 static VectorType badc(VTArg x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<m256>(x), _MM_SHUFFLE(1, 0, 3, 2))); }
155 static VectorType aaaa(VTArg x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<m256>(x), _MM_SHUFFLE(0, 0, 0, 0))); }
156 static VectorType bbbb(VTArg x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<m256>(x), _MM_SHUFFLE(1, 1, 1, 1))); }
157 static VectorType cccc(VTArg x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<m256>(x), _MM_SHUFFLE(2, 2, 2, 2))); }
158 static VectorType dddd(VTArg x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<m256>(x), _MM_SHUFFLE(3, 3, 3, 3))); }
159 static VectorType dacb(VTArg x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<m256>(x), _MM_SHUFFLE(3, 0, 2, 1))); }
// NOTE(review): _mm256_*_si256 are AVX2 intrinsic names; on AVX1-only builds
// these are presumably emulations supplied by intrinsics.h — confirm there.
161 OP0(allone, _mm256_setallone_si256())
162 OP0(zero, _mm256_setzero_si256())
163 OP2(or_, _mm256_or_si256(a, b))
164 OP2(xor_, _mm256_xor_si256(a, b))
165 OP2(and_, _mm256_and_si256(a, b))
166 OP2(andnot_, _mm256_andnot_si256(a, b))
167 OP3(blend, _mm256_blendv_epi8(a, b, c))
// VectorHelper specialization for the raw 128-bit integer register, used for
// the 16-bit entry types (8 entries per vector).
170 template<> struct VectorHelper<m128i>
172 typedef m128i VectorType;
173 #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
174 typedef const VectorType & VTArg;
176 typedef const VectorType VTArg;
// Element-type-templated loads/stores; masked overloads take a mask m.
178 template<typename T> static VectorType load(const T *x, AlignedFlag) Vc_PURE;
179 template<typename T> static VectorType load(const T *x, UnalignedFlag) Vc_PURE;
180 template<typename T> static VectorType load(const T *x, StreamingAndAlignedFlag) Vc_PURE;
181 template<typename T> static VectorType load(const T *x, StreamingAndUnalignedFlag) Vc_PURE;
182 template<typename T> static void store(T *mem, VTArg x, AlignedFlag);
183 template<typename T> static void store(T *mem, VTArg x, UnalignedFlag);
184 template<typename T> static void store(T *mem, VTArg x, StreamingAndAlignedFlag);
185 template<typename T> static void store(T *mem, VTArg x, StreamingAndUnalignedFlag);
186 template<typename T> static void store(T *mem, VTArg x, VTArg m, AlignedFlag);
187 template<typename T> static void store(T *mem, VTArg x, VTArg m, UnalignedFlag);
188 template<typename T> static void store(T *mem, VTArg x, VTArg m, StreamingAndAlignedFlag);
189 template<typename T> static void store(T *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag);
// 16-bit swizzles: apply the same 4-entry pattern to the low four words
// (shufflelo) and then the high four words (shufflehi) of the register.
191 static VectorType cdab(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(2, 3, 0, 1)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(2, 3, 0, 1)); }
192 static VectorType badc(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(1, 0, 3, 2)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2)); }
193 static VectorType aaaa(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(0, 0, 0, 0)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(0, 0, 0, 0)); }
194 static VectorType bbbb(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(1, 1, 1, 1)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(1, 1, 1, 1)); }
195 static VectorType cccc(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(2, 2, 2, 2)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(2, 2, 2, 2)); }
196 static VectorType dddd(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(3, 3, 3, 3)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(3, 3, 3, 3)); }
197 static VectorType dacb(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(3, 0, 2, 1)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(3, 0, 2, 1)); }
// Constant generators and bitwise primitives (OPn macros above).
199 OP0(allone, _mm_setallone_si128())
200 OP0(zero, _mm_setzero_si128())
201 OP2(or_, _mm_or_si128(a, b))
202 OP2(xor_, _mm_xor_si128(a, b))
203 OP2(and_, _mm_and_si128(a, b))
204 OP2(andnot_, _mm_andnot_si128(a, b))
205 OP3(blend, _mm_blendv_epi8(a, b, c))
// Intrinsic-name-pasting helpers for the per-EntryType specializations below.
// CAT(_mm256_<op>_, SUFFIX) builds the intrinsic from the operation name and
// the SUFFIX token (ps / pd / epi32 / ...) each specialization defines.
// OPcmp builds _mm256_cmp<op>_<SUFFIX> comparisons; OP_CAST_ routes integer
// bitwise ops through the float domain (cast to ps, operate, cast back); the
// trailing pair forwards min/max to the suffix-matched intrinsics. Note:
// several of the governing #define headers are continuation-joined with the
// bodies below, so no comments are interleaved past this point.
212 static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a) { return CAT(_mm256_##op##_, SUFFIX)(a); }
214 static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return CAT(_mm256_##op##_ , SUFFIX)(a, b); }
216 static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return CAT(_mm256_##op , SUFFIX)(a, b); }
217 #define OPx(op, op2) \
218 static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return CAT(_mm256_##op2##_, SUFFIX)(a, b); }
220 static Vc_INTRINSIC VectorType Vc_CONST cmp##op(VTArg a, VTArg b) { return CAT(_mm256_cmp##op##_, SUFFIX)(a, b); }
221 #define OP_CAST_(op) \
222 static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return CAT(_mm256_castps_, SUFFIX)( \
223 _mm256_##op##ps(CAT(CAT(_mm256_cast, SUFFIX), _ps)(a), \
224 CAT(CAT(_mm256_cast, SUFFIX), _ps)(b))); \
227 static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return CAT(_mm256_min_, SUFFIX)(a, b); } \
228 static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return CAT(_mm256_max_, SUFFIX)(a, b); }
// VectorHelper<double>: arithmetic, comparisons and horizontal reductions for
// 4 x double. The CAT(..., SUFFIX) calls expand to _mm256_*_pd intrinsics
// (SUFFIX is defined per specialization, outside this block).
230 template<> struct VectorHelper<double> {
231 typedef m256d VectorType;
232 #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
233 typedef const VectorType & VTArg;
235 typedef const VectorType VTArg;
237 typedef double EntryType;
// There is no wider hardware type to concatenate two doubles into, so
// ConcatType stays double.
238 typedef double ConcatType;
// Zero out entries of a where the mask is clear (mask arrives as a float
// vector and is bit-cast to pd).
241 static Vc_ALWAYS_INLINE VectorType notMaskedToZero(VTArg a, param256 mask) { return CAT(_mm256_and_, SUFFIX)(_mm256_castps_pd(mask), a); }
242 static Vc_ALWAYS_INLINE VectorType set(const double a) { return CAT(_mm256_set1_, SUFFIX)(a); }
243 static Vc_ALWAYS_INLINE VectorType set(const double a, const double b, const double c, const double d) {
244 return CAT(_mm256_set_, SUFFIX)(a, b, c, d);
246 static Vc_ALWAYS_INLINE VectorType zero() { return CAT(_mm256_setzero_, SUFFIX)(); }
247 static Vc_ALWAYS_INLINE VectorType one() { return CAT(_mm256_setone_, SUFFIX)(); }// set(1.); }
// fma: v1 = v1 * v2 + v3 with a single rounding. The _mm256_macc_pd path is
// the hardware FMA instruction. The fallback emulates the fused result via
// Dekker splitting: masking with highMaskDouble splits each operand into
// high/low halves so the partial products ll, lh, hh are computed exactly.
249 static inline void fma(VectorType &v1, VTArg v2, VTArg v3) {
251 v1 = _mm256_macc_pd(v1, v2, v3);
253 VectorType h1 = _mm256_and_pd(v1, _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
254 VectorType h2 = _mm256_and_pd(v2, _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
255 #if defined(VC_GCC) && VC_GCC < 0x40703
256 // GCC before 4.7.3 uses an incorrect optimization where it replaces the subtraction with an andnot
257 // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=54703
258 asm("":"+x"(h1), "+x"(h2));
260 const VectorType l1 = _mm256_sub_pd(v1, h1);
261 const VectorType l2 = _mm256_sub_pd(v2, h2);
262 const VectorType ll = mul(l1, l2);
263 const VectorType lh = add(mul(l1, h2), mul(h1, l2));
264 const VectorType hh = mul(h1, h2);
265 // ll < lh < hh for all entries is certain
266 const VectorType lh_lt_v3 = cmplt(abs(lh), abs(v3)); // |lh| < |v3|
267 const VectorType b = _mm256_blendv_pd(v3, lh, lh_lt_v3);
268 const VectorType c = _mm256_blendv_pd(lh, v3, lh_lt_v3);
// Accumulate the partial products in magnitude order (smallest first) to
// keep the final rounding faithful to a fused operation.
269 v1 = add(add(ll, b), add(c, hh));
273 OP(add) OP(sub) OP(mul)
// Full-precision 1/sqrt(x) via division; AVX has no double-precision
// reciprocal-sqrt approximation instruction.
279 static Vc_ALWAYS_INLINE Vc_CONST VectorType rsqrt(VTArg x) {
280 return _mm256_div_pd(one(), sqrt(x));
282 static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VTArg x) {
283 return _mm256_div_pd(one(), x);
// NaN is the only value unordered with itself.
285 static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VTArg x) {
286 return _mm256_cmpunord_pd(x, x);
// 0 * x is NaN for x = +/-inf (and stays NaN for NaN), so the ordered
// compare is false exactly for non-finite entries.
288 static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VTArg x) {
289 return _mm256_cmpord_pd(x, _mm256_mul_pd(zero(), x));
// Clear the sign bit of every entry.
291 static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(VTArg a) {
292 return CAT(_mm256_and_, SUFFIX)(a, _mm256_setabsmask_pd());
// Horizontal reductions: fold the two 128-bit halves together, then the two
// remaining entries, and extract the scalar.
296 static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VTArg a) {
297 m128d b = _mm_min_pd(avx_cast<m128d>(a), _mm256_extractf128_pd(a, 1));
298 b = _mm_min_sd(b, _mm_unpackhi_pd(b, b));
299 return _mm_cvtsd_f64(b);
301 static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VTArg a) {
302 m128d b = _mm_max_pd(avx_cast<m128d>(a), _mm256_extractf128_pd(a, 1));
303 b = _mm_max_sd(b, _mm_unpackhi_pd(b, b));
304 return _mm_cvtsd_f64(b);
306 static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VTArg a) {
307 m128d b = _mm_mul_pd(avx_cast<m128d>(a), _mm256_extractf128_pd(a, 1));
308 b = _mm_mul_sd(b, _mm_shuffle_pd(b, b, _MM_SHUFFLE2(0, 1)));
309 return _mm_cvtsd_f64(b);
311 static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VTArg a) {
312 m128d b = _mm_add_pd(avx_cast<m128d>(a), _mm256_extractf128_pd(a, 1));
313 b = _mm_hadd_pd(b, b); // or: b = _mm_add_sd(b, _mm256_shuffle_pd(b, b, _MM_SHUFFLE2(0, 1)));
314 return _mm_cvtsd_f64(b);
// Round each entry to the nearest integral value.
317 static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) {
318 return _mm256_round_pd(a, _MM_FROUND_NINT);
// VectorHelper<float>: arithmetic, comparisons and horizontal reductions for
// 8 x float. CAT(..., SUFFIX) expands to _mm256_*_ps intrinsics here.
322 template<> struct VectorHelper<float> {
323 typedef float EntryType;
324 typedef m256 VectorType;
325 #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
326 typedef const VectorType & VTArg;
328 typedef const VectorType VTArg;
// Two floats concatenate into a double-precision value.
330 typedef double ConcatType;
333 static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VTArg a, param256 mask) { return CAT(_mm256_and_, SUFFIX)(mask, a); }
334 static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a) { return CAT(_mm256_set1_, SUFFIX)(a); }
335 static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a, const float b, const float c, const float d,
336 const float e, const float f, const float g, const float h) {
337 return CAT(_mm256_set_, SUFFIX)(a, b, c, d, e, f, g, h); }
338 static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return CAT(_mm256_setzero_, SUFFIX)(); }
339 static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return CAT(_mm256_setone_, SUFFIX)(); }// set(1.f); }
// Narrow two 4 x double vectors to floats and pack them into one m256.
340 static Vc_ALWAYS_INLINE Vc_CONST m256 concat(param256d a, param256d b) { return _mm256_insertf128_ps(avx_cast<m256>(_mm256_cvtpd_ps(a)), _mm256_cvtpd_ps(b), 1); }
// fma: v1 = v1 * v2 + v3. The _mm256_macc_ps path is hardware FMA; the
// fallback widens each half to double, where the float product is exact
// (24+24 <= 53 mantissa bits), adds v3, and narrows back.
// NOTE(review): the final double->float narrowing can double-round relative
// to a true fused op in rare cases — presumably accepted; confirm.
342 static inline void fma(VectorType &v1, VTArg v2, VTArg v3) {
344 v1 = _mm256_macc_ps(v1, v2, v3);
346 m256d v1_0 = _mm256_cvtps_pd(lo128(v1));
347 m256d v1_1 = _mm256_cvtps_pd(hi128(v1));
348 m256d v2_0 = _mm256_cvtps_pd(lo128(v2));
349 m256d v2_1 = _mm256_cvtps_pd(hi128(v2));
350 m256d v3_0 = _mm256_cvtps_pd(lo128(v3));
351 m256d v3_1 = _mm256_cvtps_pd(hi128(v3));
353 _mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_0, v2_0), v3_0)),
354 _mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_1, v2_1), v3_1)));
358 OP(add) OP(sub) OP(mul)
// NaN is the only value unordered with itself.
364 static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VTArg x) {
365 return _mm256_cmpunord_ps(x, x);
// 0 * x is NaN for infinite or NaN x, so ordered-compare selects finite entries.
367 static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VTArg x) {
368 return _mm256_cmpord_ps(x, _mm256_mul_ps(zero(), x));
// Approximate reciprocal (rcpps gives roughly 12 bits of precision).
370 static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VTArg x) {
371 return _mm256_rcp_ps(x);
// Clear the sign bit of every entry.
373 static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(VTArg a) {
374 return CAT(_mm256_and_, SUFFIX)(a, _mm256_setabsmask_ps());
// Horizontal reductions: fold the two 128-bit halves, then pairwise within
// the remaining four entries (see the per-line comments).
378 static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VTArg a) {
379 m128 b = _mm_min_ps(avx_cast<m128>(a), _mm256_extractf128_ps(a, 1));
380 b = _mm_min_ps(b, _mm_movehl_ps(b, b)); // b = min(a0, a2), min(a1, a3), min(a2, a2), min(a3, a3)
381 b = _mm_min_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1))); // b = min(a0, a1), a1, a2, a3
382 return _mm_cvtss_f32(b);
384 static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VTArg a) {
385 m128 b = _mm_max_ps(avx_cast<m128>(a), _mm256_extractf128_ps(a, 1));
386 b = _mm_max_ps(b, _mm_movehl_ps(b, b)); // b = max(a0, a2), max(a1, a3), max(a2, a2), max(a3, a3)
387 b = _mm_max_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1))); // b = max(a0, a1), a1, a2, a3
388 return _mm_cvtss_f32(b);
390 static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VTArg a) {
391 m128 b = _mm_mul_ps(avx_cast<m128>(a), _mm256_extractf128_ps(a, 1));
392 b = _mm_mul_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3)));
393 b = _mm_mul_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1)));
394 return _mm_cvtss_f32(b);
396 static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VTArg a) {
397 m128 b = _mm_add_ps(avx_cast<m128>(a), _mm256_extractf128_ps(a, 1));
398 b = _mm_add_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3)));
399 b = _mm_add_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1)));
400 return _mm_cvtss_f32(b);
// Round each entry to the nearest integral value.
403 static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) {
404 return _mm256_round_ps(a, _MM_FROUND_NINT);
// Under AVX a single m256 already holds 8 floats, so the sfloat (8-wide
// float) helper reuses VectorHelper<float> unchanged.
408 template<> struct VectorHelper<sfloat> : public VectorHelper<float> {};
// VectorHelper<int>: 8 x 32-bit signed integers. SUFFIX expands the CAT
// macros to the epi32-flavoured intrinsics.
410 template<> struct VectorHelper<int> {
411 typedef int EntryType;
412 typedef m256i VectorType;
413 #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
414 typedef const VectorType & VTArg;
416 typedef const VectorType VTArg;
// Two ints concatenate into a long long.
418 typedef long long ConcatType;
421 OP_(or_) OP_(and_) OP_(xor_)
422 static Vc_INTRINSIC VectorType Vc_CONST zero() { return CAT(_mm256_setzero_, SUFFIX)(); }
// Zero out entries of a where the (float-typed, bit-cast) mask is clear.
423 static Vc_INTRINSIC VectorType Vc_CONST notMaskedToZero(VTArg a, param256 mask) { return CAT(_mm256_and_, SUFFIX)(_mm256_castps_si256(mask), a); }
426 static Vc_INTRINSIC VectorType Vc_CONST one() { return CAT(_mm256_setone_, SUFFIX)(); }
428 static Vc_INTRINSIC VectorType Vc_CONST set(const int a) { return CAT(_mm256_set1_, SUFFIX)(a); }
429 static Vc_INTRINSIC VectorType Vc_CONST set(const int a, const int b, const int c, const int d,
430 const int e, const int f, const int g, const int h) {
431 return CAT(_mm256_set_, SUFFIX)(a, b, c, d, e, f, g, h); }
// Integer fma needs no rounding correction: plain multiply-then-add.
433 static Vc_INTRINSIC void fma(VectorType &v1, VTArg v2, VTArg v3) { v1 = add(mul(v1, v2), v3); }
435 static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VTArg a, int shift) {
436 return CAT(_mm256_slli_, SUFFIX)(a, shift);
// Arithmetic (sign-extending) right shift — EntryType is signed.
438 static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VTArg a, int shift) {
439 return CAT(_mm256_srai_, SUFFIX)(a, shift);
// Horizontal reductions: fold the two 128-bit halves, then pairs of entries.
444 static Vc_INTRINSIC EntryType Vc_CONST min(VTArg a) {
445 m128i b = _mm_min_epi32(avx_cast<m128i>(a), _mm256_extractf128_si256(a, 1));
446 b = _mm_min_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
447 b = _mm_min_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here
448 return _mm_cvtsi128_si32(b);
450 static Vc_INTRINSIC EntryType Vc_CONST max(VTArg a) {
451 m128i b = _mm_max_epi32(avx_cast<m128i>(a), _mm256_extractf128_si256(a, 1));
452 b = _mm_max_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
453 b = _mm_max_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here
454 return _mm_cvtsi128_si32(b);
456 static Vc_INTRINSIC EntryType Vc_CONST add(VTArg a) {
457 m128i b = _mm_add_epi32(avx_cast<m128i>(a), _mm256_extractf128_si256(a, 1));
458 b = _mm_add_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
459 b = _mm_add_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2)));
460 return _mm_cvtsi128_si32(b);
462 static Vc_INTRINSIC EntryType Vc_CONST mul(VTArg a) {
463 m128i b = _mm_mullo_epi32(avx_cast<m128i>(a), _mm256_extractf128_si256(a, 1));
464 b = _mm_mullo_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
465 b = _mm_mullo_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2)));
466 return _mm_cvtsi128_si32(b);
// Low 32 bits of each 32x32 product.
469 static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm256_mullo_epi32(a, b); }
// Derived comparisons: "not X" is built as andnot(X, all-ones), i.e. a
// bitwise complement of the mask.
475 static Vc_INTRINSIC VectorType Vc_CONST cmpneq(VTArg a, VTArg b) { m256i x = cmpeq(a, b); return _mm256_andnot_si256(x, _mm256_setallone_si256()); }
476 static Vc_INTRINSIC VectorType Vc_CONST cmpnlt(VTArg a, VTArg b) { m256i x = cmplt(a, b); return _mm256_andnot_si256(x, _mm256_setallone_si256()); }
477 static Vc_INTRINSIC VectorType Vc_CONST cmple (VTArg a, VTArg b) { m256i x = cmpgt(a, b); return _mm256_andnot_si256(x, _mm256_setallone_si256()); }
478 static Vc_INTRINSIC VectorType Vc_CONST cmpnle(VTArg a, VTArg b) { return cmpgt(a, b); }
// Integers are already integral — round is the identity.
480 static Vc_INTRINSIC VectorType Vc_CONST round(VTArg a) { return a; }
// VectorHelper<unsigned int>: 8 x 32-bit unsigned integers. Differs from the
// signed specialization in the reductions (epu32 min/max), the logical (not
// arithmetic) right shift, and the unsigned comparison helpers.
483 template<> struct VectorHelper<unsigned int> {
484 typedef unsigned int EntryType;
485 typedef m256i VectorType;
486 #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
487 typedef const VectorType & VTArg;
489 typedef const VectorType VTArg;
491 typedef unsigned long long ConcatType;
// Bitwise ops routed through the float domain by OP_CAST_ (cast to ps,
// operate, cast back).
493 OP_CAST_(or_) OP_CAST_(and_) OP_CAST_(xor_)
494 static Vc_INTRINSIC VectorType Vc_CONST zero() { return CAT(_mm256_setzero_, SUFFIX)(); }
495 static Vc_INTRINSIC VectorType Vc_CONST notMaskedToZero(VTArg a, param256 mask) { return CAT(_mm256_and_, SUFFIX)(_mm256_castps_si256(mask), a); }
499 static Vc_INTRINSIC VectorType Vc_CONST one() { return CAT(_mm256_setone_, SUFFIX)(); }
// Horizontal reductions (unsigned min/max, then the shared add/mul folds).
502 static Vc_INTRINSIC EntryType Vc_CONST min(VTArg a) {
503 m128i b = _mm_min_epu32(avx_cast<m128i>(a), _mm256_extractf128_si256(a, 1));
504 b = _mm_min_epu32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
505 b = _mm_min_epu32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here
506 return _mm_cvtsi128_si32(b);
508 static Vc_INTRINSIC EntryType Vc_CONST max(VTArg a) {
509 m128i b = _mm_max_epu32(avx_cast<m128i>(a), _mm256_extractf128_si256(a, 1));
510 b = _mm_max_epu32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
511 b = _mm_max_epu32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here
512 return _mm_cvtsi128_si32(b);
514 static Vc_INTRINSIC EntryType Vc_CONST add(VTArg a) {
515 m128i b = _mm_add_epi32(avx_cast<m128i>(a), _mm256_extractf128_si256(a, 1));
516 b = _mm_add_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
517 b = _mm_add_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2)));
518 return _mm_cvtsi128_si32(b);
520 static Vc_INTRINSIC EntryType Vc_CONST mul(VTArg a) {
521 m128i b = _mm_mullo_epi32(avx_cast<m128i>(a), _mm256_extractf128_si256(a, 1));
522 b = _mm_mullo_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
523 b = _mm_mullo_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2)));
524 return _mm_cvtsi128_si32(b);
527 static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm256_mullo_epi32(a, b); }
528 static Vc_INTRINSIC void fma(VectorType &v1, VTArg v2, VTArg v3) { v1 = add(mul(v1, v2), v3); }
532 static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VTArg a, int shift) {
533 return CAT(_mm256_slli_, SUFFIX)(a, shift);
// Logical (zero-filling) right shift — EntryType is unsigned.
535 static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VTArg a, int shift) {
536 return CAT(_mm256_srli_, SUFFIX)(a, shift);
538 static Vc_INTRINSIC VectorType Vc_CONST set(const unsigned int a) { return CAT(_mm256_set1_, SUFFIX)(a); }
539 static Vc_INTRINSIC VectorType Vc_CONST set(const unsigned int a, const unsigned int b, const unsigned int c, const unsigned int d,
540 const unsigned int e, const unsigned int f, const unsigned int g, const unsigned int h) {
541 return CAT(_mm256_set_, SUFFIX)(a, b, c, d, e, f, g, h); }
545 static Vc_INTRINSIC VectorType Vc_CONST cmpneq(VTArg a, VTArg b) { return _mm256_andnot_si256(cmpeq(a, b), _mm256_setallone_si256()); }
// Correct unsigned ordering comparisons via the Vc epu32 helpers; the macro
// below presumably selects a faster-but-signed fallback elsewhere — confirm
// in intrinsics.h.
547 #ifndef USE_INCORRECT_UNSIGNED_COMPARE
548 static Vc_INTRINSIC VectorType Vc_CONST cmplt(VTArg a, VTArg b) {
549 return _mm256_cmplt_epu32(a, b);
551 static Vc_INTRINSIC VectorType Vc_CONST cmpgt(VTArg a, VTArg b) {
552 return _mm256_cmpgt_epu32(a, b);
// Derived comparisons: complement of the opposite predicate's mask.
558 static Vc_INTRINSIC VectorType Vc_CONST cmpnlt(VTArg a, VTArg b) { return _mm256_andnot_si256(cmplt(a, b), _mm256_setallone_si256()); }
559 static Vc_INTRINSIC VectorType Vc_CONST cmple (VTArg a, VTArg b) { return _mm256_andnot_si256(cmpgt(a, b), _mm256_setallone_si256()); }
560 static Vc_INTRINSIC VectorType Vc_CONST cmpnle(VTArg a, VTArg b) { return cmpgt(a, b); }
// Integers are already integral — round is the identity.
563 static Vc_INTRINSIC VectorType Vc_CONST round(VTArg a) { return a; }
// VectorHelper<signed short>: 8 x 16-bit signed integers in a 128-bit
// register (VectorTypeHelper maps signed short to m128i).
566 template<> struct VectorHelper<signed short> {
567 typedef VectorTypeHelper<signed short>::Type VectorType;
568 #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
569 typedef const VectorType & VTArg;
571 typedef const VectorType VTArg;
573 typedef signed short EntryType;
// Two shorts concatenate into an int.
574 typedef int ConcatType;
576 static Vc_INTRINSIC VectorType Vc_CONST or_(VTArg a, VTArg b) { return _mm_or_si128(a, b); }
577 static Vc_INTRINSIC VectorType Vc_CONST and_(VTArg a, VTArg b) { return _mm_and_si128(a, b); }
578 static Vc_INTRINSIC VectorType Vc_CONST xor_(VTArg a, VTArg b) { return _mm_xor_si128(a, b); }
579 static Vc_INTRINSIC VectorType Vc_CONST zero() { return _mm_setzero_si128(); }
580 static Vc_INTRINSIC VectorType Vc_CONST notMaskedToZero(VTArg a, param128 mask) { return _mm_and_si128(_mm_castps_si128(mask), a); }
583 static Vc_INTRINSIC VectorType Vc_CONST one() { return CAT(_mm_setone_, SUFFIX)(); }
585 static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VTArg a, int shift) {
586 return CAT(_mm_slli_, SUFFIX)(a, shift);
// Arithmetic (sign-extending) right shift — EntryType is signed.
588 static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VTArg a, int shift) {
589 return CAT(_mm_srai_, SUFFIX)(a, shift);
591 static Vc_INTRINSIC VectorType Vc_CONST set(const EntryType a) { return CAT(_mm_set1_, SUFFIX)(a); }
592 static Vc_INTRINSIC VectorType Vc_CONST set(const EntryType a, const EntryType b, const EntryType c, const EntryType d,
593 const EntryType e, const EntryType f, const EntryType g, const EntryType h) {
594 return CAT(_mm_set_, SUFFIX)(a, b, c, d, e, f, g, h);
// Integer fma: plain multiply-then-add, no rounding to fuse.
597 static Vc_INTRINSIC void fma(VectorType &v1, VTArg v2, VTArg v3) {
598 v1 = add(mul(v1, v2), v3);
601 static Vc_INTRINSIC VectorType Vc_CONST abs(VTArg a) { return _mm_abs_epi16(a); }
// mul keeps the low 16 bits of each 16x16 product.
602 static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm_mullo_epi16(a, b); }
603 static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return _mm_min_epi16(a, b); }
604 static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return _mm_max_epi16(a, b); }
// Horizontal reductions over 8 entries: fold 64-bit halves, then 32-bit
// pairs, then the final two 16-bit entries.
606 static Vc_INTRINSIC EntryType Vc_CONST min(VTArg _a) {
607 // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
608 VectorType a = min(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2)));
609 a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
610 a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
611 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
613 static Vc_INTRINSIC EntryType Vc_CONST max(VTArg _a) {
614 // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
615 VectorType a = max(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2)));
616 a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
617 a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
618 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
620 static Vc_INTRINSIC EntryType Vc_CONST mul(VTArg _a) {
621 VectorType a = mul(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2)));
622 a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
623 a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
624 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
626 static Vc_INTRINSIC EntryType Vc_CONST add(VTArg _a) {
627 VectorType a = add(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2)));
628 a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
629 a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
630 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
633 static Vc_INTRINSIC VectorType Vc_CONST add(VTArg a, VTArg b) { return _mm_add_epi16(a, b); }
634 static Vc_INTRINSIC VectorType Vc_CONST sub(VTArg a, VTArg b) { return _mm_sub_epi16(a, b); }
635 static Vc_INTRINSIC VectorType Vc_CONST cmpeq(VTArg a, VTArg b) { return _mm_cmpeq_epi16(a, b); }
636 static Vc_INTRINSIC VectorType Vc_CONST cmplt(VTArg a, VTArg b) { return _mm_cmplt_epi16(a, b); }
637 static Vc_INTRINSIC VectorType Vc_CONST cmpgt(VTArg a, VTArg b) { return _mm_cmpgt_epi16(a, b); }
// Derived comparisons: complement via andnot with all-ones.
638 static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpneq(VTArg a, VTArg b) { m128i x = cmpeq(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
639 static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnlt(VTArg a, VTArg b) { m128i x = cmplt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
640 static Vc_ALWAYS_INLINE Vc_CONST VectorType cmple (VTArg a, VTArg b) { m128i x = cmpgt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
641 static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnle(VTArg a, VTArg b) { return cmpgt(a, b); }
// Integers are already integral — round is the identity.
643 static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) { return a; }
// VectorHelper specialization for unsigned short: maps Vc's generic vector
// operations onto SSE2/SSE4.1 integer intrinsics for eight 16-bit unsigned lanes.
// NOTE(review): this excerpt appears to have lost lines during extraction
// (#else/#endif pairs, closing braces) — compare with the upstream Vc sources
// before changing any code here.
template<> struct VectorHelper<unsigned short> {
    typedef VectorTypeHelper<unsigned short>::Type VectorType;
#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
    // workaround: pass vectors by const reference where by-value miscompiles
    typedef const VectorType & VTArg;
    // NOTE(review): an #else line presumably separated these two typedefs — verify upstream
    typedef const VectorType VTArg;
    typedef unsigned short EntryType;
    // type wide enough to hold two concatenated entries
    typedef unsigned int ConcatType;
    // bitwise operations on the whole 128-bit register
    static Vc_INTRINSIC VectorType Vc_CONST or_(VTArg a, VTArg b) { return _mm_or_si128(a, b); }
    static Vc_INTRINSIC VectorType Vc_CONST and_(VTArg a, VTArg b) { return _mm_and_si128(a, b); }
    static Vc_INTRINSIC VectorType Vc_CONST xor_(VTArg a, VTArg b) { return _mm_xor_si128(a, b); }
    static Vc_INTRINSIC VectorType Vc_CONST zero() { return _mm_setzero_si128(); }
    // keep only the lanes selected by mask (mask arrives in a float register)
    static Vc_INTRINSIC VectorType Vc_CONST notMaskedToZero(VTArg a, param128 mask) { return _mm_and_si128(_mm_castps_si128(mask), a); }
    static Vc_INTRINSIC VectorType Vc_CONST one() { return _mm_setone_epu16(); }
    // lane-wise arithmetic; mullo keeps the low 16 bits of each product
    static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm_mullo_epi16(a, b); }
    static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return _mm_min_epu16(a, b); }
    static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return _mm_max_epu16(a, b); }
    // uniform shifts; CAT/SUFFIX paste together the immediate-shift intrinsic name
    static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VTArg a, int shift) {
        return CAT(_mm_slli_, SUFFIX)(a, shift);
    static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VTArg a, int shift) {
        return CAT(_mm_srli_, SUFFIX)(a, shift);
    // horizontal min over all 8 lanes: three shuffle+min steps halve the candidates
    static Vc_INTRINSIC EntryType Vc_CONST min(VTArg _a) {
        // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
        VectorType a = min(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
        return _mm_cvtsi128_si32(a); // & 0xffff is implicit
    // horizontal max over all 8 lanes
    static Vc_INTRINSIC EntryType Vc_CONST max(VTArg _a) {
        // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
        VectorType a = max(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
        return _mm_cvtsi128_si32(a); // & 0xffff is implicit
    // horizontal product of all 8 lanes (modulo 2^16, via mullo)
    static Vc_INTRINSIC EntryType Vc_CONST mul(VTArg _a) {
        // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
        VectorType a = mul(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
        return _mm_cvtsi128_si32(a); // & 0xffff is implicit
    // horizontal sum of all 8 lanes (modulo 2^16)
    static Vc_INTRINSIC EntryType Vc_CONST add(VTArg _a) {
        // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
        VectorType a = add(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
        return _mm_cvtsi128_si32(a); // & 0xffff is implicit
    // broadcast one value to all lanes / set all eight lanes individually
    static Vc_INTRINSIC VectorType Vc_CONST set(const EntryType a) { return CAT(_mm_set1_, SUFFIX)(a); }
    static Vc_INTRINSIC VectorType Vc_CONST set(const EntryType a, const EntryType b, const EntryType c,
                                                const EntryType d, const EntryType e, const EntryType f,
                                                const EntryType g, const EntryType h) {
        return CAT(_mm_set_, SUFFIX)(a, b, c, d, e, f, g, h);
    // multiply-add emulation: v1 = v1 * v2 + v3 (two instructions, not fused)
    static Vc_INTRINSIC void fma(VectorType &v1, VTArg v2, VTArg v3) { v1 = add(mul(v1, v2), v3); }
    static Vc_INTRINSIC VectorType Vc_CONST add(VTArg a, VTArg b) { return _mm_add_epi16(a, b); }
    static Vc_INTRINSIC VectorType Vc_CONST sub(VTArg a, VTArg b) { return _mm_sub_epi16(a, b); }
    static Vc_INTRINSIC VectorType Vc_CONST cmpeq(VTArg a, VTArg b) { return _mm_cmpeq_epi16(a, b); }
    // a != b as the bitwise complement of cmpeq
    static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpneq(VTArg a, VTArg b) { return _mm_andnot_si128(cmpeq(a, b), _mm_setallone_si128()); }
#ifndef USE_INCORRECT_UNSIGNED_COMPARE
    // proper unsigned 16-bit comparisons
    static Vc_INTRINSIC VectorType Vc_CONST cmplt(VTArg a, VTArg b) { return _mm_cmplt_epu16(a, b); }
    static Vc_INTRINSIC VectorType Vc_CONST cmpgt(VTArg a, VTArg b) { return _mm_cmpgt_epu16(a, b); }
    // fallback: signed compares — per the macro's name, knowingly incorrect for
    // unsigned values; NOTE(review): an #else presumably preceded these two lines
    static Vc_INTRINSIC VectorType Vc_CONST cmplt(VTArg a, VTArg b) { return _mm_cmplt_epi16(a, b); }
    static Vc_INTRINSIC VectorType Vc_CONST cmpgt(VTArg a, VTArg b) { return _mm_cmpgt_epi16(a, b); }
    // derived comparisons via complement; !(a <= b) == (a > b) for integers
    static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnlt(VTArg a, VTArg b) { return _mm_andnot_si128(cmplt(a, b), _mm_setallone_si128()); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType cmple (VTArg a, VTArg b) { return _mm_andnot_si128(cmpgt(a, b), _mm_setallone_si128()); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnle(VTArg a, VTArg b) { return cmpgt(a, b); }
    // integer lanes are already integral: round is the identity
    static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) { return a; }
// VectorHelper specialization for char: only the associated types are visible in
// this excerpt (opening brace, #else/#endif and closing appear elided — verify upstream).
template<> struct VectorHelper<char>
    typedef VectorTypeHelper<char>::Type VectorType;
#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
    // workaround: pass vectors by const reference where by-value miscompiles
    typedef const VectorType & VTArg;
    // NOTE(review): an #else line presumably separated these two typedefs
    typedef const VectorType VTArg;
    typedef char EntryType;
    // type wide enough to hold two concatenated entries
    typedef short ConcatType;
// VectorHelper specialization for unsigned char: only the associated types are
// visible in this excerpt (opening brace, #else/#endif and closing appear elided).
template<> struct VectorHelper<unsigned char>
    typedef VectorTypeHelper<unsigned char>::Type VectorType;
#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
    // workaround: pass vectors by const reference where by-value miscompiles
    typedef const VectorType & VTArg;
    // NOTE(review): an #else line presumably separated these two typedefs
    typedef const VectorType VTArg;
    typedef unsigned char EntryType;
    // type wide enough to hold two concatenated entries
    typedef unsigned short ConcatType;
760 } // namespace AliRoot
762 #include "vectorhelper.tcc"
763 #include "undomacros.h"
765 #endif // AVX_VECTORHELPER_H