/* This file is part of the Vc library.

    Copyright (C) 2009-2012 Matthias Kretz <kretz@kde.org>

    Vc is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as
    published by the Free Software Foundation, either version 3 of
    the License, or (at your option) any later version.

    Vc is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with Vc. If not, see <http://www.gnu.org/licenses/>.

*/

#ifndef AVX_VECTORHELPER_H
#define AVX_VECTORHELPER_H

#include <limits>
#include "types.h"
#include "intrinsics.h"
#include "casts.h"
#include "macros.h"

namespace AliRoot {
namespace Vc
{
namespace AVX
{

namespace Internal
{
Vc_INTRINSIC Vc_CONST m256 exponent(param256 v)
{
    m128i tmp0 = _mm_srli_epi32(avx_cast<m128i>(v), 23);
    m128i tmp1 = _mm_srli_epi32(avx_cast<m128i>(hi128(v)), 23);
    tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x7f));
    tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x7f));
    return _mm256_cvtepi32_ps(concat(tmp0, tmp1));
}
Vc_INTRINSIC Vc_CONST m256d exponent(param256d v)
{
    m128i tmp0 = _mm_srli_epi64(avx_cast<m128i>(v), 52);
    m128i tmp1 = _mm_srli_epi64(avx_cast<m128i>(hi128(v)), 52);
    tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x3ff));
    tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x3ff));
    return _mm256_cvtepi32_pd(avx_cast<m128i>(Mem::shuffle<X0, X2, Y0, Y2>(avx_cast<m128>(tmp0), avx_cast<m128>(tmp1))));
}
} // namespace Internal
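// Internal::exponent extracts the unbiased IEEE-754 exponent of every
// element; the shift does not mask off the sign bit, so the result is only
// meaningful for positive, finite, normalized inputs. A scalar sketch of the
// same idea (hypothetical helper, for illustration only, not part of Vc):
//
//     int exponent(float x) {
//         unsigned bits;
//         std::memcpy(&bits, &x, sizeof(bits)); // needs <cstring>
//         return int(bits >> 23) - 0x7f;        // drop 23 mantissa bits, remove bias
//     }
//
// The vector code above does exactly this per element: shift right by 23
// (float) or 52 (double), then subtract the bias 0x7f or 0x3ff.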

#define OP0(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name() { return code; }
#define OP1(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(VTArg a) { return code; }
#define OP2(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(VTArg a, VTArg b) { return code; }
#define OP3(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(VTArg a, VTArg b, VTArg c) { return code; }

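// For reference, the OPn macros stamp out n-argument member functions; e.g.
// in the VectorHelper<m256> specialization below,
//     OP2(or_, _mm256_or_ps(a, b))
// expands to
//     static Vc_ALWAYS_INLINE Vc_CONST VectorType or_(VTArg a, VTArg b) { return _mm256_or_ps(a, b); }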
    template<> struct VectorHelper<m256>
    {
        typedef m256 VectorType;
#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
        typedef const VectorType & VTArg;
#else
        typedef const VectorType VTArg;
#endif
        template<typename A> static Vc_ALWAYS_INLINE_L Vc_PURE_L VectorType load(const float *x, A) Vc_ALWAYS_INLINE_R Vc_PURE_R;
        static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, AlignedFlag) Vc_ALWAYS_INLINE_R;
        static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, UnalignedFlag) Vc_ALWAYS_INLINE_R;
        static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R;
        static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R;
        static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, VTArg m, AlignedFlag) Vc_ALWAYS_INLINE_R;
        static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, VTArg m, UnalignedFlag) Vc_ALWAYS_INLINE_R;
        static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, VTArg m, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R;
        static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R;

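        // The shuffle helpers are named after the result they produce, read in
        // _MM_SHUFFLE order (element 3 down to element 0): with per-lane input
        // (a, b, c, d), cdab swaps adjacent pairs, giving (b, a, d, c), and
        // badc swaps the two halves of each lane, giving (c, d, a, b).
        // _mm256_permute_ps applies the same pattern to both 128-bit lanes.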
        static Vc_ALWAYS_INLINE Vc_CONST VectorType cdab(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(2, 3, 0, 1)); }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType badc(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(1, 0, 3, 2)); }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType aaaa(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(0, 0, 0, 0)); }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType bbbb(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(1, 1, 1, 1)); }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType cccc(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(2, 2, 2, 2)); }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType dddd(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(3, 3, 3, 3)); }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType dacb(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(3, 0, 2, 1)); }

        OP0(allone, _mm256_setallone_ps())
        OP0(zero, _mm256_setzero_ps())
        OP2(or_, _mm256_or_ps(a, b))
        OP2(xor_, _mm256_xor_ps(a, b))
        OP2(and_, _mm256_and_ps(a, b))
        OP2(andnot_, _mm256_andnot_ps(a, b))
        OP3(blend, _mm256_blendv_ps(a, b, c))
    };

    template<> struct VectorHelper<m256d>
    {
        typedef m256d VectorType;
#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
        typedef const VectorType & VTArg;
#else
        typedef const VectorType VTArg;
#endif
        template<typename A> static Vc_ALWAYS_INLINE_L Vc_PURE_L VectorType load(const double *x, A) Vc_ALWAYS_INLINE_R Vc_PURE_R;
        static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, AlignedFlag) Vc_ALWAYS_INLINE_R;
        static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, UnalignedFlag) Vc_ALWAYS_INLINE_R;
        static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R;
        static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R;
        static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, VTArg m, AlignedFlag) Vc_ALWAYS_INLINE_R;
        static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, VTArg m, UnalignedFlag) Vc_ALWAYS_INLINE_R;
        static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, VTArg m, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R;
        static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R;

        static VectorType cdab(VTArg x) { return _mm256_permute_pd(x, 5); }
        static VectorType badc(VTArg x) { return _mm256_permute2f128_pd(x, x, 1); }
        // aaaa bbbb cccc dddd specialized in vector.tcc
        static VectorType dacb(VTArg x) {
            const m128d cb = avx_cast<m128d>(_mm_alignr_epi8(avx_cast<m128i>(lo128(x)),
                                                             avx_cast<m128i>(hi128(x)), sizeof(double))); // XXX: lo and hi swapped?
            const m128d da = _mm_blend_pd(lo128(x), hi128(x), 0 + 2); // XXX: lo and hi swapped?
            return concat(cb, da);
        }

        OP0(allone, _mm256_setallone_pd())
        OP0(zero, _mm256_setzero_pd())
        OP2(or_, _mm256_or_pd(a, b))
        OP2(xor_, _mm256_xor_pd(a, b))
        OP2(and_, _mm256_and_pd(a, b))
        OP2(andnot_, _mm256_andnot_pd(a, b))
        OP3(blend, _mm256_blendv_pd(a, b, c))
    };

    template<> struct VectorHelper<m256i>
    {
        typedef m256i VectorType;
#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
        typedef const VectorType & VTArg;
#else
        typedef const VectorType VTArg;
#endif
        template<typename T> static VectorType load(const T *x, AlignedFlag) Vc_PURE;
        template<typename T> static VectorType load(const T *x, UnalignedFlag) Vc_PURE;
        template<typename T> static VectorType load(const T *x, StreamingAndAlignedFlag) Vc_PURE;
        template<typename T> static VectorType load(const T *x, StreamingAndUnalignedFlag) Vc_PURE;
        template<typename T> static void store(T *mem, VTArg x, AlignedFlag);
        template<typename T> static void store(T *mem, VTArg x, UnalignedFlag);
        template<typename T> static void store(T *mem, VTArg x, StreamingAndAlignedFlag);
        template<typename T> static void store(T *mem, VTArg x, StreamingAndUnalignedFlag);
        template<typename T> static void store(T *mem, VTArg x, VTArg m, AlignedFlag);
        template<typename T> static void store(T *mem, VTArg x, VTArg m, UnalignedFlag);
        template<typename T> static void store(T *mem, VTArg x, VTArg m, StreamingAndAlignedFlag);
        template<typename T> static void store(T *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag);

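        // AVX (without AVX2) has no 256-bit integer shuffles, so the integer
        // variants reinterpret the register as floats, reuse the float
        // permute, and cast back.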
        static VectorType cdab(VTArg x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<m256>(x), _MM_SHUFFLE(2, 3, 0, 1))); }
        static VectorType badc(VTArg x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<m256>(x), _MM_SHUFFLE(1, 0, 3, 2))); }
        static VectorType aaaa(VTArg x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<m256>(x), _MM_SHUFFLE(0, 0, 0, 0))); }
        static VectorType bbbb(VTArg x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<m256>(x), _MM_SHUFFLE(1, 1, 1, 1))); }
        static VectorType cccc(VTArg x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<m256>(x), _MM_SHUFFLE(2, 2, 2, 2))); }
        static VectorType dddd(VTArg x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<m256>(x), _MM_SHUFFLE(3, 3, 3, 3))); }
        static VectorType dacb(VTArg x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<m256>(x), _MM_SHUFFLE(3, 0, 2, 1))); }

        OP0(allone, _mm256_setallone_si256())
        OP0(zero, _mm256_setzero_si256())
        OP2(or_, _mm256_or_si256(a, b))
        OP2(xor_, _mm256_xor_si256(a, b))
        OP2(and_, _mm256_and_si256(a, b))
        OP2(andnot_, _mm256_andnot_si256(a, b))
        OP3(blend, _mm256_blendv_epi8(a, b, c))
    };

    template<> struct VectorHelper<m128i>
    {
        typedef m128i VectorType;
#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
        typedef const VectorType & VTArg;
#else
        typedef const VectorType VTArg;
#endif
        template<typename T> static VectorType load(const T *x, AlignedFlag) Vc_PURE;
        template<typename T> static VectorType load(const T *x, UnalignedFlag) Vc_PURE;
        template<typename T> static VectorType load(const T *x, StreamingAndAlignedFlag) Vc_PURE;
        template<typename T> static VectorType load(const T *x, StreamingAndUnalignedFlag) Vc_PURE;
        template<typename T> static void store(T *mem, VTArg x, AlignedFlag);
        template<typename T> static void store(T *mem, VTArg x, UnalignedFlag);
        template<typename T> static void store(T *mem, VTArg x, StreamingAndAlignedFlag);
        template<typename T> static void store(T *mem, VTArg x, StreamingAndUnalignedFlag);
        template<typename T> static void store(T *mem, VTArg x, VTArg m, AlignedFlag);
        template<typename T> static void store(T *mem, VTArg x, VTArg m, UnalignedFlag);
        template<typename T> static void store(T *mem, VTArg x, VTArg m, StreamingAndAlignedFlag);
        template<typename T> static void store(T *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag);

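        // Eight 16-bit elements per vector: each pattern is applied twice,
        // once to the low four shorts (shufflelo) and once to the high four
        // (shufflehi).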
        static VectorType cdab(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(2, 3, 0, 1)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(2, 3, 0, 1)); }
        static VectorType badc(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(1, 0, 3, 2)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2)); }
        static VectorType aaaa(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(0, 0, 0, 0)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(0, 0, 0, 0)); }
        static VectorType bbbb(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(1, 1, 1, 1)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(1, 1, 1, 1)); }
        static VectorType cccc(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(2, 2, 2, 2)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(2, 2, 2, 2)); }
        static VectorType dddd(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(3, 3, 3, 3)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(3, 3, 3, 3)); }
        static VectorType dacb(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(3, 0, 2, 1)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(3, 0, 2, 1)); }

        OP0(allone, _mm_setallone_si128())
        OP0(zero, _mm_setzero_si128())
        OP2(or_, _mm_or_si128(a, b))
        OP2(xor_, _mm_xor_si128(a, b))
        OP2(and_, _mm_and_si128(a, b))
        OP2(andnot_, _mm_andnot_si128(a, b))
        OP3(blend, _mm_blendv_epi8(a, b, c))
    };
#undef OP1
#undef OP2
#undef OP3

#define OP1(op) \
        static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a) { return CAT(_mm256_##op##_, SUFFIX)(a); }
#define OP(op) \
        static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return CAT(_mm256_##op##_ , SUFFIX)(a, b); }
#define OP_(op) \
        static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return CAT(_mm256_##op , SUFFIX)(a, b); }
#define OPx(op, op2) \
        static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return CAT(_mm256_##op2##_, SUFFIX)(a, b); }
#define OPcmp(op) \
        static Vc_INTRINSIC VectorType Vc_CONST cmp##op(VTArg a, VTArg b) { return CAT(_mm256_cmp##op##_, SUFFIX)(a, b); }
#define OP_CAST_(op) \
        static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return CAT(_mm256_castps_, SUFFIX)( \
            _mm256_##op##ps(CAT(CAT(_mm256_cast, SUFFIX), _ps)(a), \
                            CAT(CAT(_mm256_cast, SUFFIX), _ps)(b))); \
        }
#define MINMAX \
        static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return CAT(_mm256_min_, SUFFIX)(a, b); } \
        static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return CAT(_mm256_max_, SUFFIX)(a, b); }

    template<> struct VectorHelper<double> {
        typedef m256d VectorType;
#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
        typedef const VectorType & VTArg;
#else
        typedef const VectorType VTArg;
#endif
        typedef double EntryType;
        typedef double ConcatType;
#define SUFFIX pd

        static Vc_ALWAYS_INLINE VectorType notMaskedToZero(VTArg a, param256 mask) { return CAT(_mm256_and_, SUFFIX)(_mm256_castps_pd(mask), a); }
        static Vc_ALWAYS_INLINE VectorType set(const double a) { return CAT(_mm256_set1_, SUFFIX)(a); }
        static Vc_ALWAYS_INLINE VectorType set(const double a, const double b, const double c, const double d) {
            return CAT(_mm256_set_, SUFFIX)(a, b, c, d);
        }
        static Vc_ALWAYS_INLINE VectorType zero() { return CAT(_mm256_setzero_, SUFFIX)(); }
        static Vc_ALWAYS_INLINE VectorType one() { return CAT(_mm256_setone_, SUFFIX)(); }// set(1.); }

        static inline void fma(VectorType &v1, VTArg v2, VTArg v3) {
#ifdef VC_IMPL_FMA4
            v1 = _mm256_macc_pd(v1, v2, v3);
#else
            VectorType h1 = _mm256_and_pd(v1, _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
            VectorType h2 = _mm256_and_pd(v2, _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
#if defined(VC_GCC) && VC_GCC < 0x40703
            // GCC before 4.7.3 uses an incorrect optimization where it replaces the subtraction with an andnot
            // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=54703
            asm("":"+x"(h1), "+x"(h2));
#endif
            const VectorType l1 = _mm256_sub_pd(v1, h1);
            const VectorType l2 = _mm256_sub_pd(v2, h2);
            const VectorType ll = mul(l1, l2);
            const VectorType lh = add(mul(l1, h2), mul(h1, l2));
            const VectorType hh = mul(h1, h2);
            // ll < lh < hh for all entries is certain
            const VectorType lh_lt_v3 = cmplt(abs(lh), abs(v3)); // |lh| < |v3|
            const VectorType b = _mm256_blendv_pd(v3, lh, lh_lt_v3);
            const VectorType c = _mm256_blendv_pd(lh, v3, lh_lt_v3);
            v1 = add(add(ll, b), add(c, hh));
#endif
        }
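        // The non-FMA4 branch above emulates a fused multiply-add via a
        // Dekker-style error-free split: masking with highMaskDouble keeps the
        // upper mantissa bits (h), subtraction yields the remainder (l), so
        // v1*v2 is represented exactly as hh + lh + ll. Summing the terms in
        // magnitude order (the blends sort lh against v3) avoids losing the
        // low-order bits to intermediate rounding.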

        OP(add) OP(sub) OP(mul)
        OPcmp(eq) OPcmp(neq)
        OPcmp(lt) OPcmp(nlt)
        OPcmp(le) OPcmp(nle)

        OP1(sqrt)
        static Vc_ALWAYS_INLINE Vc_CONST VectorType rsqrt(VTArg x) {
            return _mm256_div_pd(one(), sqrt(x));
        }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VTArg x) {
            return _mm256_div_pd(one(), x);
        }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VTArg x) {
            return _mm256_cmpunord_pd(x, x);
        }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VTArg x) {
            return _mm256_cmpord_pd(x, _mm256_mul_pd(zero(), x));
        }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(VTArg a) {
            return CAT(_mm256_and_, SUFFIX)(a, _mm256_setabsmask_pd());
        }

        MINMAX
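        // The horizontal reductions below share one pattern: fold the upper
        // 128-bit half onto the lower half with a single packed operation,
        // then keep halving within the xmm register until one scalar is left.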
        static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VTArg a) {
            m128d b = _mm_min_pd(avx_cast<m128d>(a), _mm256_extractf128_pd(a, 1));
            b = _mm_min_sd(b, _mm_unpackhi_pd(b, b));
            return _mm_cvtsd_f64(b);
        }
        static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VTArg a) {
            m128d b = _mm_max_pd(avx_cast<m128d>(a), _mm256_extractf128_pd(a, 1));
            b = _mm_max_sd(b, _mm_unpackhi_pd(b, b));
            return _mm_cvtsd_f64(b);
        }
        static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VTArg a) {
            m128d b = _mm_mul_pd(avx_cast<m128d>(a), _mm256_extractf128_pd(a, 1));
            b = _mm_mul_sd(b, _mm_shuffle_pd(b, b, _MM_SHUFFLE2(0, 1)));
            return _mm_cvtsd_f64(b);
        }
        static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VTArg a) {
            m128d b = _mm_add_pd(avx_cast<m128d>(a), _mm256_extractf128_pd(a, 1));
            b = _mm_hadd_pd(b, b); // or: b = _mm_add_sd(b, _mm256_shuffle_pd(b, b, _MM_SHUFFLE2(0, 1)));
            return _mm_cvtsd_f64(b);
        }
#undef SUFFIX
        static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) {
            return _mm256_round_pd(a, _MM_FROUND_NINT);
        }
    };

    template<> struct VectorHelper<float> {
        typedef float EntryType;
        typedef m256 VectorType;
#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
        typedef const VectorType & VTArg;
#else
        typedef const VectorType VTArg;
#endif
        typedef double ConcatType;
#define SUFFIX ps

        static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VTArg a, param256 mask) { return CAT(_mm256_and_, SUFFIX)(mask, a); }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a) { return CAT(_mm256_set1_, SUFFIX)(a); }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a, const float b, const float c, const float d,
                                                        const float e, const float f, const float g, const float h) {
            return CAT(_mm256_set_, SUFFIX)(a, b, c, d, e, f, g, h); }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return CAT(_mm256_setzero_, SUFFIX)(); }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return CAT(_mm256_setone_, SUFFIX)(); }// set(1.f); }
        static Vc_ALWAYS_INLINE Vc_CONST m256 concat(param256d a, param256d b) { return _mm256_insertf128_ps(avx_cast<m256>(_mm256_cvtpd_ps(a)), _mm256_cvtpd_ps(b), 1); }

        static inline void fma(VectorType &v1, VTArg v2, VTArg v3) {
#ifdef VC_IMPL_FMA4
            v1 = _mm256_macc_ps(v1, v2, v3);
#else
            m256d v1_0 = _mm256_cvtps_pd(lo128(v1));
            m256d v1_1 = _mm256_cvtps_pd(hi128(v1));
            m256d v2_0 = _mm256_cvtps_pd(lo128(v2));
            m256d v2_1 = _mm256_cvtps_pd(hi128(v2));
            m256d v3_0 = _mm256_cvtps_pd(lo128(v3));
            m256d v3_1 = _mm256_cvtps_pd(hi128(v3));
            v1 = AVX::concat(
                    _mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_0, v2_0), v3_0)),
                    _mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_1, v2_1), v3_1)));
#endif
        }
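        // Without FMA4 the float fma is emulated by widening every operand to
        // double (two m256d halves per m256): the product of two 24-bit
        // mantissas is exact in double precision, so only the add and the
        // final conversion back to float round, which closely approximates
        // (though does not always bit-exactly match) a fused operation.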

        OP(add) OP(sub) OP(mul)
        OPcmp(eq) OPcmp(neq)
        OPcmp(lt) OPcmp(nlt)
        OPcmp(le) OPcmp(nle)

        OP1(sqrt) OP1(rsqrt)
        static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VTArg x) {
            return _mm256_cmpunord_ps(x, x);
        }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VTArg x) {
            return _mm256_cmpord_ps(x, _mm256_mul_ps(zero(), x));
        }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VTArg x) {
            return _mm256_rcp_ps(x);
        }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(VTArg a) {
            return CAT(_mm256_and_, SUFFIX)(a, _mm256_setabsmask_ps());
        }

        MINMAX
        static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VTArg a) {
            m128 b = _mm_min_ps(avx_cast<m128>(a), _mm256_extractf128_ps(a, 1));
            b = _mm_min_ps(b, _mm_movehl_ps(b, b)); // b = min(a0, a2), min(a1, a3), min(a2, a2), min(a3, a3)
            b = _mm_min_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1))); // b = min(a0, a1), a1, a2, a3
            return _mm_cvtss_f32(b);
        }
        static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VTArg a) {
            m128 b = _mm_max_ps(avx_cast<m128>(a), _mm256_extractf128_ps(a, 1));
            b = _mm_max_ps(b, _mm_movehl_ps(b, b)); // b = max(a0, a2), max(a1, a3), max(a2, a2), max(a3, a3)
            b = _mm_max_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1))); // b = max(a0, a1), a1, a2, a3
            return _mm_cvtss_f32(b);
        }
        static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VTArg a) {
            m128 b = _mm_mul_ps(avx_cast<m128>(a), _mm256_extractf128_ps(a, 1));
            b = _mm_mul_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3)));
            b = _mm_mul_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1)));
            return _mm_cvtss_f32(b);
        }
        static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VTArg a) {
            m128 b = _mm_add_ps(avx_cast<m128>(a), _mm256_extractf128_ps(a, 1));
            b = _mm_add_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3)));
            b = _mm_add_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1)));
            return _mm_cvtss_f32(b);
        }
#undef SUFFIX
        static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) {
            return _mm256_round_ps(a, _MM_FROUND_NINT);
        }
    };

    template<> struct VectorHelper<sfloat> : public VectorHelper<float> {};

    template<> struct VectorHelper<int> {
        typedef int EntryType;
        typedef m256i VectorType;
#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
        typedef const VectorType & VTArg;
#else
        typedef const VectorType VTArg;
#endif
        typedef long long ConcatType;
#define SUFFIX si256

        OP_(or_) OP_(and_) OP_(xor_)
        static Vc_INTRINSIC VectorType Vc_CONST zero() { return CAT(_mm256_setzero_, SUFFIX)(); }
        static Vc_INTRINSIC VectorType Vc_CONST notMaskedToZero(VTArg a, param256 mask) { return CAT(_mm256_and_, SUFFIX)(_mm256_castps_si256(mask), a); }
#undef SUFFIX
#define SUFFIX epi32
        static Vc_INTRINSIC VectorType Vc_CONST one() { return CAT(_mm256_setone_, SUFFIX)(); }

        static Vc_INTRINSIC VectorType Vc_CONST set(const int a) { return CAT(_mm256_set1_, SUFFIX)(a); }
        static Vc_INTRINSIC VectorType Vc_CONST set(const int a, const int b, const int c, const int d,
                                                    const int e, const int f, const int g, const int h) {
            return CAT(_mm256_set_, SUFFIX)(a, b, c, d, e, f, g, h); }

        static Vc_INTRINSIC void fma(VectorType &v1, VTArg v2, VTArg v3) { v1 = add(mul(v1, v2), v3); }

        static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VTArg a, int shift) {
            return CAT(_mm256_slli_, SUFFIX)(a, shift);
        }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VTArg a, int shift) {
            return CAT(_mm256_srai_, SUFFIX)(a, shift);
        }
        OP1(abs)

        MINMAX
        static Vc_INTRINSIC EntryType Vc_CONST min(VTArg a) {
            m128i b = _mm_min_epi32(avx_cast<m128i>(a), _mm256_extractf128_si256(a, 1));
            b = _mm_min_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
            b = _mm_min_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here
            return _mm_cvtsi128_si32(b);
        }
        static Vc_INTRINSIC EntryType Vc_CONST max(VTArg a) {
            m128i b = _mm_max_epi32(avx_cast<m128i>(a), _mm256_extractf128_si256(a, 1));
            b = _mm_max_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
            b = _mm_max_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here
            return _mm_cvtsi128_si32(b);
        }
        static Vc_INTRINSIC EntryType Vc_CONST add(VTArg a) {
            m128i b = _mm_add_epi32(avx_cast<m128i>(a), _mm256_extractf128_si256(a, 1));
            b = _mm_add_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
            b = _mm_add_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2)));
            return _mm_cvtsi128_si32(b);
        }
        static Vc_INTRINSIC EntryType Vc_CONST mul(VTArg a) {
            m128i b = _mm_mullo_epi32(avx_cast<m128i>(a), _mm256_extractf128_si256(a, 1));
            b = _mm_mullo_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
            b = _mm_mullo_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2)));
            return _mm_cvtsi128_si32(b);
        }

        static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm256_mullo_epi32(a, b); }

        OP(add) OP(sub)
        OPcmp(eq)
        OPcmp(lt)
        OPcmp(gt)
        static Vc_INTRINSIC VectorType Vc_CONST cmpneq(VTArg a, VTArg b) { m256i x = cmpeq(a, b); return _mm256_andnot_si256(x, _mm256_setallone_si256()); }
        static Vc_INTRINSIC VectorType Vc_CONST cmpnlt(VTArg a, VTArg b) { m256i x = cmplt(a, b); return _mm256_andnot_si256(x, _mm256_setallone_si256()); }
        static Vc_INTRINSIC VectorType Vc_CONST cmple (VTArg a, VTArg b) { m256i x = cmpgt(a, b); return _mm256_andnot_si256(x, _mm256_setallone_si256()); }
        static Vc_INTRINSIC VectorType Vc_CONST cmpnle(VTArg a, VTArg b) { return cmpgt(a, b); }
#undef SUFFIX
        static Vc_INTRINSIC VectorType Vc_CONST round(VTArg a) { return a; }
    };

    template<> struct VectorHelper<unsigned int> {
        typedef unsigned int EntryType;
        typedef m256i VectorType;
#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
        typedef const VectorType & VTArg;
#else
        typedef const VectorType VTArg;
#endif
        typedef unsigned long long ConcatType;
#define SUFFIX si256
        OP_CAST_(or_) OP_CAST_(and_) OP_CAST_(xor_)
        static Vc_INTRINSIC VectorType Vc_CONST zero() { return CAT(_mm256_setzero_, SUFFIX)(); }
        static Vc_INTRINSIC VectorType Vc_CONST notMaskedToZero(VTArg a, param256 mask) { return CAT(_mm256_and_, SUFFIX)(_mm256_castps_si256(mask), a); }

#undef SUFFIX
#define SUFFIX epu32
        static Vc_INTRINSIC VectorType Vc_CONST one() { return CAT(_mm256_setone_, SUFFIX)(); }

        MINMAX
        static Vc_INTRINSIC EntryType Vc_CONST min(VTArg a) {
            m128i b = _mm_min_epu32(avx_cast<m128i>(a), _mm256_extractf128_si256(a, 1));
            b = _mm_min_epu32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
            b = _mm_min_epu32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here
            return _mm_cvtsi128_si32(b);
        }
        static Vc_INTRINSIC EntryType Vc_CONST max(VTArg a) {
            m128i b = _mm_max_epu32(avx_cast<m128i>(a), _mm256_extractf128_si256(a, 1));
            b = _mm_max_epu32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
            b = _mm_max_epu32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here
            return _mm_cvtsi128_si32(b);
        }
        static Vc_INTRINSIC EntryType Vc_CONST add(VTArg a) {
            m128i b = _mm_add_epi32(avx_cast<m128i>(a), _mm256_extractf128_si256(a, 1));
            b = _mm_add_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
            b = _mm_add_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2)));
            return _mm_cvtsi128_si32(b);
        }
        static Vc_INTRINSIC EntryType Vc_CONST mul(VTArg a) {
            m128i b = _mm_mullo_epi32(avx_cast<m128i>(a), _mm256_extractf128_si256(a, 1));
            b = _mm_mullo_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
            b = _mm_mullo_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2)));
            return _mm_cvtsi128_si32(b);
        }

        static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm256_mullo_epi32(a, b); }
        static Vc_INTRINSIC void fma(VectorType &v1, VTArg v2, VTArg v3) { v1 = add(mul(v1, v2), v3); }

#undef SUFFIX
#define SUFFIX epi32
        static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VTArg a, int shift) {
            return CAT(_mm256_slli_, SUFFIX)(a, shift);
        }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VTArg a, int shift) {
            return CAT(_mm256_srli_, SUFFIX)(a, shift);
        }
        static Vc_INTRINSIC VectorType Vc_CONST set(const unsigned int a) { return CAT(_mm256_set1_, SUFFIX)(a); }
        static Vc_INTRINSIC VectorType Vc_CONST set(const unsigned int a, const unsigned int b, const unsigned int c, const unsigned int d,
                                                    const unsigned int e, const unsigned int f, const unsigned int g, const unsigned int h) {
            return CAT(_mm256_set_, SUFFIX)(a, b, c, d, e, f, g, h); }

        OP(add) OP(sub)
        OPcmp(eq)
        static Vc_INTRINSIC VectorType Vc_CONST cmpneq(VTArg a, VTArg b) { return _mm256_andnot_si256(cmpeq(a, b), _mm256_setallone_si256()); }

#ifndef USE_INCORRECT_UNSIGNED_COMPARE
        static Vc_INTRINSIC VectorType Vc_CONST cmplt(VTArg a, VTArg b) {
            return _mm256_cmplt_epu32(a, b);
        }
        static Vc_INTRINSIC VectorType Vc_CONST cmpgt(VTArg a, VTArg b) {
            return _mm256_cmpgt_epu32(a, b);
        }
#else
        OPcmp(lt)
        OPcmp(gt)
#endif
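        // _mm256_cmplt_epu32/_mm256_cmpgt_epu32 are Vc helpers (from
        // intrinsics.h), not hardware intrinsics; AVX only offers signed
        // integer compares. The usual trick, and presumably what the helpers
        // do, is to flip the sign bit of both operands and compare signed.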
        static Vc_INTRINSIC VectorType Vc_CONST cmpnlt(VTArg a, VTArg b) { return _mm256_andnot_si256(cmplt(a, b), _mm256_setallone_si256()); }
        static Vc_INTRINSIC VectorType Vc_CONST cmple (VTArg a, VTArg b) { return _mm256_andnot_si256(cmpgt(a, b), _mm256_setallone_si256()); }
        static Vc_INTRINSIC VectorType Vc_CONST cmpnle(VTArg a, VTArg b) { return cmpgt(a, b); }

#undef SUFFIX
        static Vc_INTRINSIC VectorType Vc_CONST round(VTArg a) { return a; }
    };

    template<> struct VectorHelper<signed short> {
        typedef VectorTypeHelper<signed short>::Type VectorType;
#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
        typedef const VectorType & VTArg;
#else
        typedef const VectorType VTArg;
#endif
        typedef signed short EntryType;
        typedef int ConcatType;

        static Vc_INTRINSIC VectorType Vc_CONST or_(VTArg a, VTArg b) { return _mm_or_si128(a, b); }
        static Vc_INTRINSIC VectorType Vc_CONST and_(VTArg a, VTArg b) { return _mm_and_si128(a, b); }
        static Vc_INTRINSIC VectorType Vc_CONST xor_(VTArg a, VTArg b) { return _mm_xor_si128(a, b); }
        static Vc_INTRINSIC VectorType Vc_CONST zero() { return _mm_setzero_si128(); }
        static Vc_INTRINSIC VectorType Vc_CONST notMaskedToZero(VTArg a, param128 mask) { return _mm_and_si128(_mm_castps_si128(mask), a); }

#define SUFFIX epi16
        static Vc_INTRINSIC VectorType Vc_CONST one() { return CAT(_mm_setone_, SUFFIX)(); }

        static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VTArg a, int shift) {
            return CAT(_mm_slli_, SUFFIX)(a, shift);
        }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VTArg a, int shift) {
            return CAT(_mm_srai_, SUFFIX)(a, shift);
        }
        static Vc_INTRINSIC VectorType Vc_CONST set(const EntryType a) { return CAT(_mm_set1_, SUFFIX)(a); }
        static Vc_INTRINSIC VectorType Vc_CONST set(const EntryType a, const EntryType b, const EntryType c, const EntryType d,
                                                    const EntryType e, const EntryType f, const EntryType g, const EntryType h) {
            return CAT(_mm_set_, SUFFIX)(a, b, c, d, e, f, g, h);
        }

        static Vc_INTRINSIC void fma(VectorType &v1, VTArg v2, VTArg v3) {
            v1 = add(mul(v1, v2), v3);
        }

        static Vc_INTRINSIC VectorType Vc_CONST abs(VTArg a) { return _mm_abs_epi16(a); }
        static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm_mullo_epi16(a, b); }
        static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return _mm_min_epi16(a, b); }
        static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return _mm_max_epi16(a, b); }

        static Vc_INTRINSIC EntryType Vc_CONST min(VTArg _a) {
            // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
            VectorType a = min(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
            return _mm_cvtsi128_si32(a); // & 0xffff is implicit
        }
        static Vc_INTRINSIC EntryType Vc_CONST max(VTArg _a) {
            // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
            VectorType a = max(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
            return _mm_cvtsi128_si32(a); // & 0xffff is implicit
        }
        static Vc_INTRINSIC EntryType Vc_CONST mul(VTArg _a) {
            VectorType a = mul(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
            return _mm_cvtsi128_si32(a); // & 0xffff is implicit
        }
        static Vc_INTRINSIC EntryType Vc_CONST add(VTArg _a) {
            VectorType a = add(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
            return _mm_cvtsi128_si32(a); // & 0xffff is implicit
        }

        static Vc_INTRINSIC VectorType Vc_CONST add(VTArg a, VTArg b) { return _mm_add_epi16(a, b); }
        static Vc_INTRINSIC VectorType Vc_CONST sub(VTArg a, VTArg b) { return _mm_sub_epi16(a, b); }
        static Vc_INTRINSIC VectorType Vc_CONST cmpeq(VTArg a, VTArg b) { return _mm_cmpeq_epi16(a, b); }
        static Vc_INTRINSIC VectorType Vc_CONST cmplt(VTArg a, VTArg b) { return _mm_cmplt_epi16(a, b); }
        static Vc_INTRINSIC VectorType Vc_CONST cmpgt(VTArg a, VTArg b) { return _mm_cmpgt_epi16(a, b); }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpneq(VTArg a, VTArg b) { m128i x = cmpeq(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnlt(VTArg a, VTArg b) { m128i x = cmplt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType cmple (VTArg a, VTArg b) { m128i x = cmpgt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnle(VTArg a, VTArg b) { return cmpgt(a, b); }
#undef SUFFIX
        static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) { return a; }
    };

    template<> struct VectorHelper<unsigned short> {
        typedef VectorTypeHelper<unsigned short>::Type VectorType;
#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
        typedef const VectorType & VTArg;
#else
        typedef const VectorType VTArg;
#endif
        typedef unsigned short EntryType;
        typedef unsigned int ConcatType;

        static Vc_INTRINSIC VectorType Vc_CONST or_(VTArg a, VTArg b) { return _mm_or_si128(a, b); }
        static Vc_INTRINSIC VectorType Vc_CONST and_(VTArg a, VTArg b) { return _mm_and_si128(a, b); }
        static Vc_INTRINSIC VectorType Vc_CONST xor_(VTArg a, VTArg b) { return _mm_xor_si128(a, b); }
        static Vc_INTRINSIC VectorType Vc_CONST zero() { return _mm_setzero_si128(); }
        static Vc_INTRINSIC VectorType Vc_CONST notMaskedToZero(VTArg a, param128 mask) { return _mm_and_si128(_mm_castps_si128(mask), a); }
        static Vc_INTRINSIC VectorType Vc_CONST one() { return _mm_setone_epu16(); }

        static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm_mullo_epi16(a, b); }
        static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return _mm_min_epu16(a, b); }
        static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return _mm_max_epu16(a, b); }

#define SUFFIX epi16
        static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VTArg a, int shift) {
            return CAT(_mm_slli_, SUFFIX)(a, shift);
        }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VTArg a, int shift) {
            return CAT(_mm_srli_, SUFFIX)(a, shift);
        }
        static Vc_INTRINSIC EntryType Vc_CONST min(VTArg _a) {
            // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
            VectorType a = min(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
            return _mm_cvtsi128_si32(a); // & 0xffff is implicit
        }
        static Vc_INTRINSIC EntryType Vc_CONST max(VTArg _a) {
            // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
            VectorType a = max(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
            return _mm_cvtsi128_si32(a); // & 0xffff is implicit
        }
        static Vc_INTRINSIC EntryType Vc_CONST mul(VTArg _a) {
            // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
            VectorType a = mul(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
            return _mm_cvtsi128_si32(a); // & 0xffff is implicit
        }
        static Vc_INTRINSIC EntryType Vc_CONST add(VTArg _a) {
            // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
            VectorType a = add(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
            return _mm_cvtsi128_si32(a); // & 0xffff is implicit
        }
        static Vc_INTRINSIC VectorType Vc_CONST set(const EntryType a) { return CAT(_mm_set1_, SUFFIX)(a); }
        static Vc_INTRINSIC VectorType Vc_CONST set(const EntryType a, const EntryType b, const EntryType c,
                                                    const EntryType d, const EntryType e, const EntryType f,
                                                    const EntryType g, const EntryType h) {
            return CAT(_mm_set_, SUFFIX)(a, b, c, d, e, f, g, h);
        }
        static Vc_INTRINSIC void fma(VectorType &v1, VTArg v2, VTArg v3) { v1 = add(mul(v1, v2), v3); }

        static Vc_INTRINSIC VectorType Vc_CONST add(VTArg a, VTArg b) { return _mm_add_epi16(a, b); }
        static Vc_INTRINSIC VectorType Vc_CONST sub(VTArg a, VTArg b) { return _mm_sub_epi16(a, b); }
        static Vc_INTRINSIC VectorType Vc_CONST cmpeq(VTArg a, VTArg b) { return _mm_cmpeq_epi16(a, b); }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpneq(VTArg a, VTArg b) { return _mm_andnot_si128(cmpeq(a, b), _mm_setallone_si128()); }

#ifndef USE_INCORRECT_UNSIGNED_COMPARE
        static Vc_INTRINSIC VectorType Vc_CONST cmplt(VTArg a, VTArg b) { return _mm_cmplt_epu16(a, b); }
        static Vc_INTRINSIC VectorType Vc_CONST cmpgt(VTArg a, VTArg b) { return _mm_cmpgt_epu16(a, b); }
#else
        static Vc_INTRINSIC VectorType Vc_CONST cmplt(VTArg a, VTArg b) { return _mm_cmplt_epi16(a, b); }
        static Vc_INTRINSIC VectorType Vc_CONST cmpgt(VTArg a, VTArg b) { return _mm_cmpgt_epi16(a, b); }
#endif
        static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnlt(VTArg a, VTArg b) { return _mm_andnot_si128(cmplt(a, b), _mm_setallone_si128()); }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType cmple (VTArg a, VTArg b) { return _mm_andnot_si128(cmpgt(a, b), _mm_setallone_si128()); }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnle(VTArg a, VTArg b) { return cmpgt(a, b); }
#undef SUFFIX
        static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) { return a; }
    };
#undef OP1
#undef OP
#undef OP_
#undef OPx
#undef OPcmp

template<> struct VectorHelper<char>
{
    typedef VectorTypeHelper<char>::Type VectorType;
#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
    typedef const VectorType & VTArg;
#else
    typedef const VectorType VTArg;
#endif
    typedef char EntryType;
    typedef short ConcatType;
};

template<> struct VectorHelper<unsigned char>
{
    typedef VectorTypeHelper<unsigned char>::Type VectorType;
#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
    typedef const VectorType & VTArg;
#else
    typedef const VectorType VTArg;
#endif
    typedef unsigned char EntryType;
    typedef unsigned short ConcatType;
};
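// Usage sketch (hypothetical, for illustration only): Vector<T> dispatches
// its operators through these helpers, so an operator+ for float vectors
// boils down to something like
//     VectorHelper<float>::add(lhs.data(), rhs.data())
// and a horizontal sum to
//     VectorHelper<float>::add(v.data()) // returns a single float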

} // namespace AVX
} // namespace Vc
} // namespace AliRoot

#include "vectorhelper.tcc"
#include "undomacros.h"

#endif // AVX_VECTORHELPER_H