]>
Commit | Line | Data |
---|---|---|
f22341db | 1 | /* This file is part of the Vc library. |
2 | ||
3 | Copyright (C) 2009-2012 Matthias Kretz <kretz@kde.org> | |
4 | ||
5 | Vc is free software: you can redistribute it and/or modify | |
6 | it under the terms of the GNU Lesser General Public License as | |
7 | published by the Free Software Foundation, either version 3 of | |
8 | the License, or (at your option) any later version. | |
9 | ||
10 | Vc is distributed in the hope that it will be useful, but | |
11 | WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 | GNU Lesser General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU Lesser General Public | |
16 | License along with Vc. If not, see <http://www.gnu.org/licenses/>. | |
17 | ||
18 | */ | |
19 | ||
20 | #ifndef AVX_VECTORHELPER_H | |
21 | #define AVX_VECTORHELPER_H | |
22 | ||
23 | #include <limits> | |
24 | #include "types.h" | |
25 | #include "intrinsics.h" | |
26 | #include "casts.h" | |
c017a39f | 27 | #include "macros.h" |
f22341db | 28 | |
c017a39f | 29 | namespace AliRoot { |
f22341db | 30 | namespace Vc |
31 | { | |
32 | namespace AVX | |
33 | { | |
f22341db | 34 | |
c017a39f | 35 | namespace Internal |
36 | { | |
37 | Vc_INTRINSIC Vc_CONST m256 exponent(param256 v) | |
38 | { | |
39 | m128i tmp0 = _mm_srli_epi32(avx_cast<m128i>(v), 23); | |
40 | m128i tmp1 = _mm_srli_epi32(avx_cast<m128i>(hi128(v)), 23); | |
41 | tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x7f)); | |
42 | tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x7f)); | |
43 | return _mm256_cvtepi32_ps(concat(tmp0, tmp1)); | |
44 | } | |
45 | Vc_INTRINSIC Vc_CONST m256d exponent(param256d v) | |
46 | { | |
47 | m128i tmp0 = _mm_srli_epi64(avx_cast<m128i>(v), 52); | |
48 | m128i tmp1 = _mm_srli_epi64(avx_cast<m128i>(hi128(v)), 52); | |
49 | tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x3ff)); | |
50 | tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x3ff)); | |
51 | return _mm256_cvtepi32_pd(avx_cast<m128i>(Mem::shuffle<X0, X2, Y0, Y2>(avx_cast<m128>(tmp0), avx_cast<m128>(tmp1)))); | |
52 | } | |
53 | } // namespace Internal | |
54 | ||
// Helper macros used inside the register-type VectorHelper specializations
// below: they define nullary/unary/binary/ternary static member functions.
// VectorType and VTArg come from the enclosing specialization.
#define OP0(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name() { return code; }
#define OP1(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(VTArg a) { return code; }
#define OP2(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(VTArg a, VTArg b) { return code; }
#define OP3(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(VTArg a, VTArg b, VTArg c) { return code; }
59 | ||
// VectorHelper specialization for the raw AVX single-precision register type
// (m256). Declares load/store entry points (implemented out of line) and
// provides in-register permutations plus bitwise operations.
template<> struct VectorHelper<m256>
{
        typedef m256 VectorType;
#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
        typedef const VectorType & VTArg; // pass by const& where by-value vector args miscompile
#else
        typedef const VectorType VTArg;
#endif
        // Loads/stores: the tag type (Aligned/Unaligned/Streaming*) selects the
        // variant; the overloads taking an extra VTArg m are masked stores.
        template<typename A> static Vc_ALWAYS_INLINE_L Vc_PURE_L VectorType load(const float *x, A) Vc_ALWAYS_INLINE_R Vc_PURE_R;
        static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, AlignedFlag) Vc_ALWAYS_INLINE_R;
        static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, UnalignedFlag) Vc_ALWAYS_INLINE_R;
        static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R;
        static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R;
        static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, VTArg m, AlignedFlag) Vc_ALWAYS_INLINE_R;
        static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, VTArg m, UnalignedFlag) Vc_ALWAYS_INLINE_R;
        static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, VTArg m, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R;
        static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R;

        // Per-128-bit-lane element permutations (_mm256_permute_ps applies the
        // same 4-element pattern to both lanes); the name spells the result in
        // terms of the source elements a,b,c,d (indices 0..3) of each lane.
        static Vc_ALWAYS_INLINE Vc_CONST VectorType cdab(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(2, 3, 0, 1)); }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType badc(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(1, 0, 3, 2)); }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType aaaa(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(0, 0, 0, 0)); }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType bbbb(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(1, 1, 1, 1)); }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType cccc(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(2, 2, 2, 2)); }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType dddd(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(3, 3, 3, 3)); }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType dacb(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(3, 0, 2, 1)); }

        // Bitwise operations on the raw float registers (used for masking).
        OP0(allone, _mm256_setallone_ps())
        OP0(zero, _mm256_setzero_ps())
        OP2(or_, _mm256_or_ps(a, b))
        OP2(xor_, _mm256_xor_ps(a, b))
        OP2(and_, _mm256_and_ps(a, b))
        OP2(andnot_, _mm256_andnot_ps(a, b))
        OP3(blend, _mm256_blendv_ps(a, b, c))
};
94 | ||
c017a39f | 95 | template<> struct VectorHelper<m256d> |
f22341db | 96 | { |
c017a39f | 97 | typedef m256d VectorType; |
98 | #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN | |
99 | typedef const VectorType & VTArg; | |
100 | #else | |
101 | typedef const VectorType VTArg; | |
102 | #endif | |
103 | template<typename A> static Vc_ALWAYS_INLINE_L Vc_PURE_L VectorType load(const double *x, A) Vc_ALWAYS_INLINE_R Vc_PURE_R; | |
104 | static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, AlignedFlag) Vc_ALWAYS_INLINE_R; | |
105 | static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, UnalignedFlag) Vc_ALWAYS_INLINE_R; | |
106 | static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R; | |
107 | static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R; | |
108 | static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, VTArg m, AlignedFlag) Vc_ALWAYS_INLINE_R; | |
109 | static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, VTArg m, UnalignedFlag) Vc_ALWAYS_INLINE_R; | |
110 | static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, VTArg m, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R; | |
111 | static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R; | |
112 | ||
113 | static VectorType cdab(VTArg x) { return _mm256_permute_pd(x, 5); } | |
114 | static VectorType badc(VTArg x) { return _mm256_permute2f128_pd(x, x, 1); } | |
f22341db | 115 | // aaaa bbbb cccc dddd specialized in vector.tcc |
c017a39f | 116 | static VectorType dacb(VTArg x) { |
117 | const m128d cb = avx_cast<m128d>(_mm_alignr_epi8(avx_cast<m128i>(lo128(x)), | |
118 | avx_cast<m128i>(hi128(x)), sizeof(double))); // XXX: lo and hi swapped? | |
119 | const m128d da = _mm_blend_pd(lo128(x), hi128(x), 0 + 2); // XXX: lo and hi swapped? | |
f22341db | 120 | return concat(cb, da); |
121 | } | |
122 | ||
123 | OP0(allone, _mm256_setallone_pd()) | |
124 | OP0(zero, _mm256_setzero_pd()) | |
125 | OP2(or_, _mm256_or_pd(a, b)) | |
126 | OP2(xor_, _mm256_xor_pd(a, b)) | |
127 | OP2(and_, _mm256_and_pd(a, b)) | |
128 | OP2(andnot_, _mm256_andnot_pd(a, b)) | |
129 | OP3(blend, _mm256_blendv_pd(a, b, c)) | |
130 | }; | |
131 | ||
c017a39f | 132 | template<> struct VectorHelper<m256i> |
f22341db | 133 | { |
c017a39f | 134 | typedef m256i VectorType; |
135 | #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN | |
136 | typedef const VectorType & VTArg; | |
137 | #else | |
138 | typedef const VectorType VTArg; | |
139 | #endif | |
140 | template<typename T> static VectorType load(const T *x, AlignedFlag) Vc_PURE; | |
141 | template<typename T> static VectorType load(const T *x, UnalignedFlag) Vc_PURE; | |
142 | template<typename T> static VectorType load(const T *x, StreamingAndAlignedFlag) Vc_PURE; | |
143 | template<typename T> static VectorType load(const T *x, StreamingAndUnalignedFlag) Vc_PURE; | |
144 | template<typename T> static void store(T *mem, VTArg x, AlignedFlag); | |
145 | template<typename T> static void store(T *mem, VTArg x, UnalignedFlag); | |
146 | template<typename T> static void store(T *mem, VTArg x, StreamingAndAlignedFlag); | |
147 | template<typename T> static void store(T *mem, VTArg x, StreamingAndUnalignedFlag); | |
148 | template<typename T> static void store(T *mem, VTArg x, VTArg m, AlignedFlag); | |
149 | template<typename T> static void store(T *mem, VTArg x, VTArg m, UnalignedFlag); | |
150 | template<typename T> static void store(T *mem, VTArg x, VTArg m, StreamingAndAlignedFlag); | |
151 | template<typename T> static void store(T *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag); | |
152 | ||
153 | static VectorType cdab(VTArg x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<m256>(x), _MM_SHUFFLE(2, 3, 0, 1))); } | |
154 | static VectorType badc(VTArg x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<m256>(x), _MM_SHUFFLE(1, 0, 3, 2))); } | |
155 | static VectorType aaaa(VTArg x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<m256>(x), _MM_SHUFFLE(0, 0, 0, 0))); } | |
156 | static VectorType bbbb(VTArg x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<m256>(x), _MM_SHUFFLE(1, 1, 1, 1))); } | |
157 | static VectorType cccc(VTArg x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<m256>(x), _MM_SHUFFLE(2, 2, 2, 2))); } | |
158 | static VectorType dddd(VTArg x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<m256>(x), _MM_SHUFFLE(3, 3, 3, 3))); } | |
159 | static VectorType dacb(VTArg x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<m256>(x), _MM_SHUFFLE(3, 0, 2, 1))); } | |
f22341db | 160 | |
161 | OP0(allone, _mm256_setallone_si256()) | |
162 | OP0(zero, _mm256_setzero_si256()) | |
163 | OP2(or_, _mm256_or_si256(a, b)) | |
164 | OP2(xor_, _mm256_xor_si256(a, b)) | |
165 | OP2(and_, _mm256_and_si256(a, b)) | |
166 | OP2(andnot_, _mm256_andnot_si256(a, b)) | |
167 | OP3(blend, _mm256_blendv_epi8(a, b, c)) | |
168 | }; | |
169 | ||
c017a39f | 170 | template<> struct VectorHelper<m128i> |
f22341db | 171 | { |
c017a39f | 172 | typedef m128i VectorType; |
173 | #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN | |
174 | typedef const VectorType & VTArg; | |
175 | #else | |
176 | typedef const VectorType VTArg; | |
177 | #endif | |
178 | template<typename T> static VectorType load(const T *x, AlignedFlag) Vc_PURE; | |
179 | template<typename T> static VectorType load(const T *x, UnalignedFlag) Vc_PURE; | |
180 | template<typename T> static VectorType load(const T *x, StreamingAndAlignedFlag) Vc_PURE; | |
181 | template<typename T> static VectorType load(const T *x, StreamingAndUnalignedFlag) Vc_PURE; | |
182 | template<typename T> static void store(T *mem, VTArg x, AlignedFlag); | |
183 | template<typename T> static void store(T *mem, VTArg x, UnalignedFlag); | |
184 | template<typename T> static void store(T *mem, VTArg x, StreamingAndAlignedFlag); | |
185 | template<typename T> static void store(T *mem, VTArg x, StreamingAndUnalignedFlag); | |
186 | template<typename T> static void store(T *mem, VTArg x, VTArg m, AlignedFlag); | |
187 | template<typename T> static void store(T *mem, VTArg x, VTArg m, UnalignedFlag); | |
188 | template<typename T> static void store(T *mem, VTArg x, VTArg m, StreamingAndAlignedFlag); | |
189 | template<typename T> static void store(T *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag); | |
190 | ||
191 | static VectorType cdab(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(2, 3, 0, 1)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(2, 3, 0, 1)); } | |
192 | static VectorType badc(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(1, 0, 3, 2)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2)); } | |
193 | static VectorType aaaa(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(0, 0, 0, 0)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(0, 0, 0, 0)); } | |
194 | static VectorType bbbb(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(1, 1, 1, 1)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(1, 1, 1, 1)); } | |
195 | static VectorType cccc(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(2, 2, 2, 2)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(2, 2, 2, 2)); } | |
196 | static VectorType dddd(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(3, 3, 3, 3)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(3, 3, 3, 3)); } | |
197 | static VectorType dacb(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(3, 0, 2, 1)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(3, 0, 2, 1)); } | |
f22341db | 198 | |
199 | OP0(allone, _mm_setallone_si128()) | |
200 | OP0(zero, _mm_setzero_si128()) | |
201 | OP2(or_, _mm_or_si128(a, b)) | |
202 | OP2(xor_, _mm_xor_si128(a, b)) | |
203 | OP2(and_, _mm_and_si128(a, b)) | |
204 | OP2(andnot_, _mm_andnot_si128(a, b)) | |
205 | OP3(blend, _mm_blendv_epi8(a, b, c)) | |
206 | }; | |
207 | #undef OP1 | |
208 | #undef OP2 | |
209 | #undef OP3 | |
210 | ||
// Macro family for the per-scalar-type specializations below. Each pastes the
// current SUFFIX (pd/ps/epi32/epu32/si256) onto an _mm256_* intrinsic name;
// SUFFIX is #define'd and #undef'd inside each specialization.
#define OP1(op) \
        static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a) { return CAT(_mm256_##op##_, SUFFIX)(a); }
#define OP(op) \
        static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return CAT(_mm256_##op##_ , SUFFIX)(a, b); }
// OP_ pastes SUFFIX without a separating underscore (e.g. or_ + si256).
#define OP_(op) \
        static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return CAT(_mm256_##op , SUFFIX)(a, b); }
// OPx names the member op but calls the intrinsic op2.
#define OPx(op, op2) \
        static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return CAT(_mm256_##op2##_, SUFFIX)(a, b); }
#define OPcmp(op) \
        static Vc_INTRINSIC VectorType Vc_CONST cmp##op(VTArg a, VTArg b) { return CAT(_mm256_cmp##op##_, SUFFIX)(a, b); }
// OP_CAST_ performs the operation in the ps domain, casting the operands in
// and the result back out (used for bitwise ops on integer vectors).
#define OP_CAST_(op) \
        static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return CAT(_mm256_castps_, SUFFIX)( \
            _mm256_##op##ps(CAT(CAT(_mm256_cast, SUFFIX), _ps)(a), \
                CAT(CAT(_mm256_cast, SUFFIX), _ps)(b))); \
        }
#define MINMAX \
        static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return CAT(_mm256_min_, SUFFIX)(a, b); } \
        static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return CAT(_mm256_max_, SUFFIX)(a, b); }
f22341db | 229 | |
230 | template<> struct VectorHelper<double> { | |
c017a39f | 231 | typedef m256d VectorType; |
232 | #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN | |
233 | typedef const VectorType & VTArg; | |
234 | #else | |
235 | typedef const VectorType VTArg; | |
236 | #endif | |
f22341db | 237 | typedef double EntryType; |
238 | typedef double ConcatType; | |
239 | #define SUFFIX pd | |
240 | ||
c017a39f | 241 | static Vc_ALWAYS_INLINE VectorType notMaskedToZero(VTArg a, param256 mask) { return CAT(_mm256_and_, SUFFIX)(_mm256_castps_pd(mask), a); } |
242 | static Vc_ALWAYS_INLINE VectorType set(const double a) { return CAT(_mm256_set1_, SUFFIX)(a); } | |
243 | static Vc_ALWAYS_INLINE VectorType set(const double a, const double b, const double c, const double d) { | |
f22341db | 244 | return CAT(_mm256_set_, SUFFIX)(a, b, c, d); |
245 | } | |
c017a39f | 246 | static Vc_ALWAYS_INLINE VectorType zero() { return CAT(_mm256_setzero_, SUFFIX)(); } |
247 | static Vc_ALWAYS_INLINE VectorType one() { return CAT(_mm256_setone_, SUFFIX)(); }// set(1.); } | |
f22341db | 248 | |
c017a39f | 249 | static inline void fma(VectorType &v1, VTArg v2, VTArg v3) { |
250 | #ifdef VC_IMPL_FMA4 | |
251 | v1 = _mm256_macc_pd(v1, v2, v3); | |
252 | #else | |
253 | VectorType h1 = _mm256_and_pd(v1, _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::highMaskDouble))); | |
254 | VectorType h2 = _mm256_and_pd(v2, _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::highMaskDouble))); | |
255 | #if defined(VC_GCC) && VC_GCC < 0x40703 | |
256 | // GCC before 4.7.3 uses an incorrect optimization where it replaces the subtraction with an andnot | |
257 | // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=54703 | |
258 | asm("":"+x"(h1), "+x"(h2)); | |
259 | #endif | |
260 | const VectorType l1 = _mm256_sub_pd(v1, h1); | |
261 | const VectorType l2 = _mm256_sub_pd(v2, h2); | |
262 | const VectorType ll = mul(l1, l2); | |
263 | const VectorType lh = add(mul(l1, h2), mul(h1, l2)); | |
264 | const VectorType hh = mul(h1, h2); | |
265 | // ll < lh < hh for all entries is certain | |
266 | const VectorType lh_lt_v3 = cmplt(abs(lh), abs(v3)); // |lh| < |v3| | |
267 | const VectorType b = _mm256_blendv_pd(v3, lh, lh_lt_v3); | |
268 | const VectorType c = _mm256_blendv_pd(lh, v3, lh_lt_v3); | |
269 | v1 = add(add(ll, b), add(c, hh)); | |
270 | #endif | |
f22341db | 271 | } |
272 | ||
273 | OP(add) OP(sub) OP(mul) | |
274 | OPcmp(eq) OPcmp(neq) | |
275 | OPcmp(lt) OPcmp(nlt) | |
276 | OPcmp(le) OPcmp(nle) | |
277 | ||
278 | OP1(sqrt) | |
c017a39f | 279 | static Vc_ALWAYS_INLINE Vc_CONST VectorType rsqrt(VTArg x) { |
f22341db | 280 | return _mm256_div_pd(one(), sqrt(x)); |
281 | } | |
c017a39f | 282 | static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VTArg x) { |
f22341db | 283 | return _mm256_div_pd(one(), x); |
284 | } | |
c017a39f | 285 | static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VTArg x) { |
f22341db | 286 | return _mm256_cmpunord_pd(x, x); |
287 | } | |
c017a39f | 288 | static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VTArg x) { |
f22341db | 289 | return _mm256_cmpord_pd(x, _mm256_mul_pd(zero(), x)); |
290 | } | |
c017a39f | 291 | static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(VTArg a) { |
f22341db | 292 | return CAT(_mm256_and_, SUFFIX)(a, _mm256_setabsmask_pd()); |
293 | } | |
294 | ||
295 | MINMAX | |
c017a39f | 296 | static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VTArg a) { |
297 | m128d b = _mm_min_pd(avx_cast<m128d>(a), _mm256_extractf128_pd(a, 1)); | |
f22341db | 298 | b = _mm_min_sd(b, _mm_unpackhi_pd(b, b)); |
299 | return _mm_cvtsd_f64(b); | |
300 | } | |
c017a39f | 301 | static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VTArg a) { |
302 | m128d b = _mm_max_pd(avx_cast<m128d>(a), _mm256_extractf128_pd(a, 1)); | |
f22341db | 303 | b = _mm_max_sd(b, _mm_unpackhi_pd(b, b)); |
304 | return _mm_cvtsd_f64(b); | |
305 | } | |
c017a39f | 306 | static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VTArg a) { |
307 | m128d b = _mm_mul_pd(avx_cast<m128d>(a), _mm256_extractf128_pd(a, 1)); | |
f22341db | 308 | b = _mm_mul_sd(b, _mm_shuffle_pd(b, b, _MM_SHUFFLE2(0, 1))); |
309 | return _mm_cvtsd_f64(b); | |
310 | } | |
c017a39f | 311 | static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VTArg a) { |
312 | m128d b = _mm_add_pd(avx_cast<m128d>(a), _mm256_extractf128_pd(a, 1)); | |
f22341db | 313 | b = _mm_hadd_pd(b, b); // or: b = _mm_add_sd(b, _mm256_shuffle_pd(b, b, _MM_SHUFFLE2(0, 1))); |
314 | return _mm_cvtsd_f64(b); | |
315 | } | |
316 | #undef SUFFIX | |
c017a39f | 317 | static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) { |
f22341db | 318 | return _mm256_round_pd(a, _MM_FROUND_NINT); |
319 | } | |
320 | }; | |
321 | ||
// Arithmetic VectorHelper for float (8 x f32 in an m256).
template<> struct VectorHelper<float> {
        typedef float EntryType;
        typedef m256 VectorType;
#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
        typedef const VectorType & VTArg; // pass by const& where by-value vector args miscompile
#else
        typedef const VectorType VTArg;
#endif
        typedef double ConcatType;
#define SUFFIX ps

        // Zero out the lanes whose mask bits are clear.
        static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VTArg a, param256 mask) { return CAT(_mm256_and_, SUFFIX)(mask, a); }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a) { return CAT(_mm256_set1_, SUFFIX)(a); }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a, const float b, const float c, const float d,
                const float e, const float f, const float g, const float h) {
            return CAT(_mm256_set_, SUFFIX)(a, b, c, d, e, f, g, h); }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return CAT(_mm256_setzero_, SUFFIX)(); }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType one()  { return CAT(_mm256_setone_, SUFFIX)(); }// set(1.f); }
        // Narrow two double vectors into one float vector (a -> low lane,
        // b -> high lane).
        static Vc_ALWAYS_INLINE Vc_CONST m256 concat(param256d a, param256d b) { return _mm256_insertf128_ps(avx_cast<m256>(_mm256_cvtpd_ps(a)), _mm256_cvtpd_ps(b), 1); }

        // v1 = v1 * v2 + v3. Without FMA4 hardware the operation is carried
        // out in double precision (half a vector at a time) and narrowed back,
        // so the product does not suffer an intermediate float rounding.
        static inline void fma(VectorType &v1, VTArg v2, VTArg v3) {
#ifdef VC_IMPL_FMA4
            v1 = _mm256_macc_ps(v1, v2, v3);
#else
            m256d v1_0 = _mm256_cvtps_pd(lo128(v1));
            m256d v1_1 = _mm256_cvtps_pd(hi128(v1));
            m256d v2_0 = _mm256_cvtps_pd(lo128(v2));
            m256d v2_1 = _mm256_cvtps_pd(hi128(v2));
            m256d v3_0 = _mm256_cvtps_pd(lo128(v3));
            m256d v3_1 = _mm256_cvtps_pd(hi128(v3));
            v1 = AVX::concat(
                    _mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_0, v2_0), v3_0)),
                    _mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_1, v2_1), v3_1)));
#endif
        }

        OP(add) OP(sub) OP(mul)
        OPcmp(eq) OPcmp(neq)
        OPcmp(lt) OPcmp(nlt)
        OPcmp(le) OPcmp(nle)

        OP1(sqrt) OP1(rsqrt)
        // NaN compares unordered with itself.
        static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VTArg x) {
            return _mm256_cmpunord_ps(x, x);
        }
        // 0 * x is NaN for x = inf or NaN, so the ordered-compare is false
        // exactly for non-finite lanes.
        static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VTArg x) {
            return _mm256_cmpord_ps(x, _mm256_mul_ps(zero(), x));
        }
        // Hardware reciprocal approximation (low precision by design).
        static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VTArg x) {
            return _mm256_rcp_ps(x);
        }
        // Clear the sign bit of every lane.
        static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(VTArg a) {
            return CAT(_mm256_and_, SUFFIX)(a, _mm256_setabsmask_ps());
        }

        MINMAX
        // Horizontal reductions: fold the two 128-bit lanes, then reduce the
        // remaining four floats in two shuffle/op steps.
        static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VTArg a) {
            m128 b = _mm_min_ps(avx_cast<m128>(a), _mm256_extractf128_ps(a, 1));
            b = _mm_min_ps(b, _mm_movehl_ps(b, b));   // b = min(a0, a2), min(a1, a3), min(a2, a2), min(a3, a3)
            b = _mm_min_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1))); // b = min(a0, a1), a1, a2, a3
            return _mm_cvtss_f32(b);
        }
        static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VTArg a) {
            m128 b = _mm_max_ps(avx_cast<m128>(a), _mm256_extractf128_ps(a, 1));
            b = _mm_max_ps(b, _mm_movehl_ps(b, b));   // b = max(a0, a2), max(a1, a3), max(a2, a2), max(a3, a3)
            b = _mm_max_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1))); // b = max(a0, a1), a1, a2, a3
            return _mm_cvtss_f32(b);
        }
        static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VTArg a) {
            m128 b = _mm_mul_ps(avx_cast<m128>(a), _mm256_extractf128_ps(a, 1));
            b = _mm_mul_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3)));
            b = _mm_mul_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1)));
            return _mm_cvtss_f32(b);
        }
        static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VTArg a) {
            m128 b = _mm_add_ps(avx_cast<m128>(a), _mm256_extractf128_ps(a, 1));
            b = _mm_add_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3)));
            b = _mm_add_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1)));
            return _mm_cvtss_f32(b);
        }
#undef SUFFIX
        // Round to nearest integer, ties to even (_MM_FROUND_NINT).
        static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) {
            return _mm256_round_ps(a, _MM_FROUND_NINT);
        }
};
407 | ||
// sfloat shares the 8-wide single-precision implementation with float.
template<> struct VectorHelper<sfloat> : public VectorHelper<float> {};
409 | ||
// Arithmetic VectorHelper for signed int (8 x i32 in an m256i). Bitwise ops
// use the si256 suffix; element-wise ops use epi32.
template<> struct VectorHelper<int> {
        typedef int EntryType;
        typedef m256i VectorType;
#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
        typedef const VectorType & VTArg; // pass by const& where by-value vector args miscompile
#else
        typedef const VectorType VTArg;
#endif
        typedef long long ConcatType;
#define SUFFIX si256

        OP_(or_) OP_(and_) OP_(xor_)
        static Vc_INTRINSIC VectorType Vc_CONST zero() { return CAT(_mm256_setzero_, SUFFIX)(); }
        // Zero out the lanes whose mask bits are clear.
        static Vc_INTRINSIC VectorType Vc_CONST notMaskedToZero(VTArg a, param256 mask) { return CAT(_mm256_and_, SUFFIX)(_mm256_castps_si256(mask), a); }
#undef SUFFIX
#define SUFFIX epi32
        static Vc_INTRINSIC VectorType Vc_CONST one() { return CAT(_mm256_setone_, SUFFIX)(); }

        static Vc_INTRINSIC VectorType Vc_CONST set(const int a) { return CAT(_mm256_set1_, SUFFIX)(a); }
        static Vc_INTRINSIC VectorType Vc_CONST set(const int a, const int b, const int c, const int d,
                const int e, const int f, const int g, const int h) {
            return CAT(_mm256_set_, SUFFIX)(a, b, c, d, e, f, g, h); }

        // Integer multiply-add has no rounding issue; plain mul+add suffices.
        static Vc_INTRINSIC void fma(VectorType &v1, VTArg v2, VTArg v3) { v1 = add(mul(v1, v2), v3); }

        static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VTArg a, int shift) {
            return CAT(_mm256_slli_, SUFFIX)(a, shift);
        }
        // Arithmetic shift (srai) preserves the sign bit for signed ints.
        static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VTArg a, int shift) {
            return CAT(_mm256_srai_, SUFFIX)(a, shift);
        }
        OP1(abs)

        MINMAX
        // Horizontal reductions: fold the two 128-bit lanes, then the four
        // 32-bit elements of the remaining lane in two shuffle/op steps.
        static Vc_INTRINSIC EntryType Vc_CONST min(VTArg a) {
            m128i b = _mm_min_epi32(avx_cast<m128i>(a), _mm256_extractf128_si256(a, 1));
            b = _mm_min_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
            b = _mm_min_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here
            return _mm_cvtsi128_si32(b);
        }
        static Vc_INTRINSIC EntryType Vc_CONST max(VTArg a) {
            m128i b = _mm_max_epi32(avx_cast<m128i>(a), _mm256_extractf128_si256(a, 1));
            b = _mm_max_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
            b = _mm_max_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here
            return _mm_cvtsi128_si32(b);
        }
        static Vc_INTRINSIC EntryType Vc_CONST add(VTArg a) {
            m128i b = _mm_add_epi32(avx_cast<m128i>(a), _mm256_extractf128_si256(a, 1));
            b = _mm_add_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
            b = _mm_add_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2)));
            return _mm_cvtsi128_si32(b);
        }
        static Vc_INTRINSIC EntryType Vc_CONST mul(VTArg a) {
            m128i b = _mm_mullo_epi32(avx_cast<m128i>(a), _mm256_extractf128_si256(a, 1));
            b = _mm_mullo_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
            b = _mm_mullo_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2)));
            return _mm_cvtsi128_si32(b);
        }

        static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm256_mullo_epi32(a, b); }

        OP(add) OP(sub)
        OPcmp(eq)
        OPcmp(lt)
        OPcmp(gt)
        // AVX has only eq/lt/gt integer compares; the rest are derived by
        // complementing (andnot against all-ones). For the total order on
        // signed ints, "not <=" is exactly ">".
        static Vc_INTRINSIC VectorType Vc_CONST cmpneq(VTArg a, VTArg b) { m256i x = cmpeq(a, b); return _mm256_andnot_si256(x, _mm256_setallone_si256()); }
        static Vc_INTRINSIC VectorType Vc_CONST cmpnlt(VTArg a, VTArg b) { m256i x = cmplt(a, b); return _mm256_andnot_si256(x, _mm256_setallone_si256()); }
        static Vc_INTRINSIC VectorType Vc_CONST cmple (VTArg a, VTArg b) { m256i x = cmpgt(a, b); return _mm256_andnot_si256(x, _mm256_setallone_si256()); }
        static Vc_INTRINSIC VectorType Vc_CONST cmpnle(VTArg a, VTArg b) { return cmpgt(a, b); }
#undef SUFFIX
        // Integers are already integral; rounding is the identity.
        static Vc_INTRINSIC VectorType Vc_CONST round(VTArg a) { return a; }
};
482 | ||
483 | template<> struct VectorHelper<unsigned int> { | |
484 | typedef unsigned int EntryType; | |
c017a39f | 485 | typedef m256i VectorType; |
486 | #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN | |
487 | typedef const VectorType & VTArg; | |
488 | #else | |
489 | typedef const VectorType VTArg; | |
490 | #endif | |
f22341db | 491 | typedef unsigned long long ConcatType; |
492 | #define SUFFIX si256 | |
493 | OP_CAST_(or_) OP_CAST_(and_) OP_CAST_(xor_) | |
c017a39f | 494 | static Vc_INTRINSIC VectorType Vc_CONST zero() { return CAT(_mm256_setzero_, SUFFIX)(); } |
495 | static Vc_INTRINSIC VectorType Vc_CONST notMaskedToZero(VTArg a, param256 mask) { return CAT(_mm256_and_, SUFFIX)(_mm256_castps_si256(mask), a); } | |
f22341db | 496 | |
497 | #undef SUFFIX | |
498 | #define SUFFIX epu32 | |
c017a39f | 499 | static Vc_INTRINSIC VectorType Vc_CONST one() { return CAT(_mm256_setone_, SUFFIX)(); } |
f22341db | 500 | |
501 | MINMAX | |
c017a39f | 502 | static Vc_INTRINSIC EntryType Vc_CONST min(VTArg a) { |
503 | m128i b = _mm_min_epu32(avx_cast<m128i>(a), _mm256_extractf128_si256(a, 1)); | |
f22341db | 504 | b = _mm_min_epu32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2))); |
505 | b = _mm_min_epu32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here | |
506 | return _mm_cvtsi128_si32(b); | |
507 | } | |
c017a39f | 508 | static Vc_INTRINSIC EntryType Vc_CONST max(VTArg a) { |
509 | m128i b = _mm_max_epu32(avx_cast<m128i>(a), _mm256_extractf128_si256(a, 1)); | |
f22341db | 510 | b = _mm_max_epu32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2))); |
511 | b = _mm_max_epu32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here | |
512 | return _mm_cvtsi128_si32(b); | |
513 | } | |
c017a39f | 514 | static Vc_INTRINSIC EntryType Vc_CONST add(VTArg a) { |
515 | m128i b = _mm_add_epi32(avx_cast<m128i>(a), _mm256_extractf128_si256(a, 1)); | |
f22341db | 516 | b = _mm_add_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2))); |
517 | b = _mm_add_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); | |
518 | return _mm_cvtsi128_si32(b); | |
519 | } | |
c017a39f | 520 | static Vc_INTRINSIC EntryType Vc_CONST mul(VTArg a) { |
521 | m128i b = _mm_mullo_epi32(avx_cast<m128i>(a), _mm256_extractf128_si256(a, 1)); | |
f22341db | 522 | b = _mm_mullo_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2))); |
523 | b = _mm_mullo_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); | |
524 | return _mm_cvtsi128_si32(b); | |
525 | } | |
526 | ||
c017a39f | 527 | static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm256_mullo_epi32(a, b); } |
528 | static Vc_INTRINSIC void fma(VectorType &v1, VTArg v2, VTArg v3) { v1 = add(mul(v1, v2), v3); } | |
f22341db | 529 | |
530 | #undef SUFFIX | |
531 | #define SUFFIX epi32 | |
c017a39f | 532 | static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VTArg a, int shift) { |
f22341db | 533 | return CAT(_mm256_slli_, SUFFIX)(a, shift); |
534 | } | |
c017a39f | 535 | static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VTArg a, int shift) { |
f22341db | 536 | return CAT(_mm256_srli_, SUFFIX)(a, shift); |
537 | } | |
c017a39f | 538 | static Vc_INTRINSIC VectorType Vc_CONST set(const unsigned int a) { return CAT(_mm256_set1_, SUFFIX)(a); } |
539 | static Vc_INTRINSIC VectorType Vc_CONST set(const unsigned int a, const unsigned int b, const unsigned int c, const unsigned int d, | |
f22341db | 540 | const unsigned int e, const unsigned int f, const unsigned int g, const unsigned int h) { |
541 | return CAT(_mm256_set_, SUFFIX)(a, b, c, d, e, f, g, h); } | |
542 | ||
        // add/sub and the equality compare are generated by the OP/OPcmp macros.
        OP(add) OP(sub)
        OPcmp(eq)
        // a != b  ==  not(a == b), built with andnot against an all-ones mask
        static Vc_INTRINSIC VectorType Vc_CONST cmpneq(VTArg a, VTArg b) { return _mm256_andnot_si256(cmpeq(a, b), _mm256_setallone_si256()); }

#ifndef USE_INCORRECT_UNSIGNED_COMPARE
        // proper unsigned ordered comparisons
        static Vc_INTRINSIC VectorType Vc_CONST cmplt(VTArg a, VTArg b) {
            return _mm256_cmplt_epu32(a, b);
        }
        static Vc_INTRINSIC VectorType Vc_CONST cmpgt(VTArg a, VTArg b) {
            return _mm256_cmpgt_epu32(a, b);
        }
#else
        // macro-generated signed compares — incorrect for values with the MSB set
        OPcmp(lt)
        OPcmp(gt)
#endif
        // negated comparisons derived from lt/gt: !(a < b), !(a > b), !(a <= b) == a > b
        static Vc_INTRINSIC VectorType Vc_CONST cmpnlt(VTArg a, VTArg b) { return _mm256_andnot_si256(cmplt(a, b), _mm256_setallone_si256()); }
        static Vc_INTRINSIC VectorType Vc_CONST cmple (VTArg a, VTArg b) { return _mm256_andnot_si256(cmpgt(a, b), _mm256_setallone_si256()); }
        static Vc_INTRINSIC VectorType Vc_CONST cmpnle(VTArg a, VTArg b) { return cmpgt(a, b); }
f22341db | 561 | |
#undef SUFFIX
        // Integers are already whole numbers: rounding is the identity.
        static Vc_INTRINSIC VectorType Vc_CONST round(VTArg a) { return a; }
f22341db | 564 | }; |
565 | ||
// Operations for Vector<signed short>: eight 16-bit signed integers in one
// 128-bit SSE register.
template<> struct VectorHelper<signed short> {
        typedef VectorTypeHelper<signed short>::Type VectorType;
#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
        typedef const VectorType & VTArg;
#else
        typedef const VectorType VTArg;
#endif
        typedef signed short EntryType;
        // type wide enough to hold the concatenation of two EntryType values
        typedef int ConcatType;

        // bitwise operations and constants
        static Vc_INTRINSIC VectorType Vc_CONST or_(VTArg a, VTArg b) { return _mm_or_si128(a, b); }
        static Vc_INTRINSIC VectorType Vc_CONST and_(VTArg a, VTArg b) { return _mm_and_si128(a, b); }
        static Vc_INTRINSIC VectorType Vc_CONST xor_(VTArg a, VTArg b) { return _mm_xor_si128(a, b); }
        static Vc_INTRINSIC VectorType Vc_CONST zero() { return _mm_setzero_si128(); }
        // zero out the entries of a where the mask is not set
        static Vc_INTRINSIC VectorType Vc_CONST notMaskedToZero(VTArg a, param128 mask) { return _mm_and_si128(_mm_castps_si128(mask), a); }

#define SUFFIX epi16
        static Vc_INTRINSIC VectorType Vc_CONST one() { return CAT(_mm_setone_, SUFFIX)(); }

        static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VTArg a, int shift) {
            return CAT(_mm_slli_, SUFFIX)(a, shift);
        }
        // arithmetic right shift (srai) preserves the sign bit of the signed type
        static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VTArg a, int shift) {
            return CAT(_mm_srai_, SUFFIX)(a, shift);
        }
        // broadcast one value / set all eight elements individually
        static Vc_INTRINSIC VectorType Vc_CONST set(const EntryType a) { return CAT(_mm_set1_, SUFFIX)(a); }
        static Vc_INTRINSIC VectorType Vc_CONST set(const EntryType a, const EntryType b, const EntryType c, const EntryType d,
                const EntryType e, const EntryType f, const EntryType g, const EntryType h) {
            return CAT(_mm_set_, SUFFIX)(a, b, c, d, e, f, g, h);
        }

        // multiply-add: v1 = v1 * v2 + v3 (not actually fused for integers)
        static Vc_INTRINSIC void fma(VectorType &v1, VTArg v2, VTArg v3) {
            v1 = add(mul(v1, v2), v3);
        }

        static Vc_INTRINSIC VectorType Vc_CONST abs(VTArg a) { return _mm_abs_epi16(a); }
        // element-wise multiply keeping the low 16 bits of each product
        static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm_mullo_epi16(a, b); }
        static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return _mm_min_epi16(a, b); }
        static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return _mm_max_epi16(a, b); }

        // Horizontal reductions below fold 8 -> 4 -> 2 -> 1 elements via shuffles.
        static Vc_INTRINSIC EntryType Vc_CONST min(VTArg _a) {
            // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
            VectorType a = min(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
            return _mm_cvtsi128_si32(a); // & 0xffff is implicit
        }
        static Vc_INTRINSIC EntryType Vc_CONST max(VTArg _a) {
            // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
            VectorType a = max(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
            return _mm_cvtsi128_si32(a); // & 0xffff is implicit
        }
        static Vc_INTRINSIC EntryType Vc_CONST mul(VTArg _a) {
            VectorType a = mul(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
            return _mm_cvtsi128_si32(a); // & 0xffff is implicit
        }
        static Vc_INTRINSIC EntryType Vc_CONST add(VTArg _a) {
            VectorType a = add(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
            return _mm_cvtsi128_si32(a); // & 0xffff is implicit
        }

        static Vc_INTRINSIC VectorType Vc_CONST add(VTArg a, VTArg b) { return _mm_add_epi16(a, b); }
        static Vc_INTRINSIC VectorType Vc_CONST sub(VTArg a, VTArg b) { return _mm_sub_epi16(a, b); }
        // signed 16-bit comparisons; negated forms use andnot with an all-ones mask
        static Vc_INTRINSIC VectorType Vc_CONST cmpeq(VTArg a, VTArg b) { return _mm_cmpeq_epi16(a, b); }
        static Vc_INTRINSIC VectorType Vc_CONST cmplt(VTArg a, VTArg b) { return _mm_cmplt_epi16(a, b); }
        static Vc_INTRINSIC VectorType Vc_CONST cmpgt(VTArg a, VTArg b) { return _mm_cmpgt_epi16(a, b); }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpneq(VTArg a, VTArg b) { m128i x = cmpeq(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnlt(VTArg a, VTArg b) { m128i x = cmplt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType cmple (VTArg a, VTArg b) { m128i x = cmpgt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnle(VTArg a, VTArg b) { return cmpgt(a, b); }
#undef SUFFIX
        // integers are already whole numbers: rounding is the identity
        static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) { return a; }
};
645 | ||
// Operations for Vector<unsigned short>: eight 16-bit unsigned integers in
// one 128-bit SSE register.
template<> struct VectorHelper<unsigned short> {
        typedef VectorTypeHelper<unsigned short>::Type VectorType;
#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
        typedef const VectorType & VTArg;
#else
        typedef const VectorType VTArg;
#endif
        typedef unsigned short EntryType;
        // type wide enough to hold the concatenation of two EntryType values
        typedef unsigned int ConcatType;

        // bitwise operations and constants
        static Vc_INTRINSIC VectorType Vc_CONST or_(VTArg a, VTArg b) { return _mm_or_si128(a, b); }
        static Vc_INTRINSIC VectorType Vc_CONST and_(VTArg a, VTArg b) { return _mm_and_si128(a, b); }
        static Vc_INTRINSIC VectorType Vc_CONST xor_(VTArg a, VTArg b) { return _mm_xor_si128(a, b); }
        static Vc_INTRINSIC VectorType Vc_CONST zero() { return _mm_setzero_si128(); }
        // zero out the entries of a where the mask is not set
        static Vc_INTRINSIC VectorType Vc_CONST notMaskedToZero(VTArg a, param128 mask) { return _mm_and_si128(_mm_castps_si128(mask), a); }
        static Vc_INTRINSIC VectorType Vc_CONST one() { return _mm_setone_epu16(); }

        // low 16 bits of each product are the same for signed and unsigned multiply
        static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm_mullo_epi16(a, b); }
        // unsigned min/max (epu16)
        static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return _mm_min_epu16(a, b); }
        static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return _mm_max_epu16(a, b); }

#define SUFFIX epi16
        static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VTArg a, int shift) {
            return CAT(_mm_slli_, SUFFIX)(a, shift);
        }
        // logical right shift (srli) zero-fills, correct for the unsigned type
        static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VTArg a, int shift) {
            return CAT(_mm_srli_, SUFFIX)(a, shift);
        }
        // Horizontal reductions below fold 8 -> 4 -> 2 -> 1 elements via shuffles.
        static Vc_INTRINSIC EntryType Vc_CONST min(VTArg _a) {
            // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
            VectorType a = min(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
            return _mm_cvtsi128_si32(a); // & 0xffff is implicit
        }
        static Vc_INTRINSIC EntryType Vc_CONST max(VTArg _a) {
            // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
            VectorType a = max(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
            return _mm_cvtsi128_si32(a); // & 0xffff is implicit
        }
        static Vc_INTRINSIC EntryType Vc_CONST mul(VTArg _a) {
            // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
            VectorType a = mul(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
            return _mm_cvtsi128_si32(a); // & 0xffff is implicit
        }
        static Vc_INTRINSIC EntryType Vc_CONST add(VTArg _a) {
            // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
            VectorType a = add(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
            a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
            return _mm_cvtsi128_si32(a); // & 0xffff is implicit
        }
        // broadcast one value / set all eight elements individually
        static Vc_INTRINSIC VectorType Vc_CONST set(const EntryType a) { return CAT(_mm_set1_, SUFFIX)(a); }
        static Vc_INTRINSIC VectorType Vc_CONST set(const EntryType a, const EntryType b, const EntryType c,
                const EntryType d, const EntryType e, const EntryType f,
                const EntryType g, const EntryType h) {
            return CAT(_mm_set_, SUFFIX)(a, b, c, d, e, f, g, h);
        }
        // multiply-add: v1 = v1 * v2 + v3 (not actually fused for integers)
        static Vc_INTRINSIC void fma(VectorType &v1, VTArg v2, VTArg v3) { v1 = add(mul(v1, v2), v3); }

        static Vc_INTRINSIC VectorType Vc_CONST add(VTArg a, VTArg b) { return _mm_add_epi16(a, b); }
        static Vc_INTRINSIC VectorType Vc_CONST sub(VTArg a, VTArg b) { return _mm_sub_epi16(a, b); }
        static Vc_INTRINSIC VectorType Vc_CONST cmpeq(VTArg a, VTArg b) { return _mm_cmpeq_epi16(a, b); }
        // a != b  ==  not(a == b)
        static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpneq(VTArg a, VTArg b) { return _mm_andnot_si128(cmpeq(a, b), _mm_setallone_si128()); }

#ifndef USE_INCORRECT_UNSIGNED_COMPARE
        // proper unsigned ordered comparisons
        static Vc_INTRINSIC VectorType Vc_CONST cmplt(VTArg a, VTArg b) { return _mm_cmplt_epu16(a, b); }
        static Vc_INTRINSIC VectorType Vc_CONST cmpgt(VTArg a, VTArg b) { return _mm_cmpgt_epu16(a, b); }
#else
        // signed-compare fallback — incorrect for values with the MSB set
        static Vc_INTRINSIC VectorType Vc_CONST cmplt(VTArg a, VTArg b) { return _mm_cmplt_epi16(a, b); }
        static Vc_INTRINSIC VectorType Vc_CONST cmpgt(VTArg a, VTArg b) { return _mm_cmpgt_epi16(a, b); }
#endif
        // negated comparisons derived from lt/gt: !(a < b), !(a > b), !(a <= b) == a > b
        static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnlt(VTArg a, VTArg b) { return _mm_andnot_si128(cmplt(a, b), _mm_setallone_si128()); }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType cmple (VTArg a, VTArg b) { return _mm_andnot_si128(cmpgt(a, b), _mm_setallone_si128()); }
        static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnle(VTArg a, VTArg b) { return cmpgt(a, b); }
#undef SUFFIX
        // integers are already whole numbers: rounding is the identity
        static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) { return a; }
};
// Clean up the helper macros used to generate the operator members above.
#undef OP1
#undef OP
#undef OP_
#undef OPx
#undef OPcmp
733 | ||
734 | template<> struct VectorHelper<char> | |
735 | { | |
736 | typedef VectorTypeHelper<char>::Type VectorType; | |
c017a39f | 737 | #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN |
738 | typedef const VectorType & VTArg; | |
739 | #else | |
740 | typedef const VectorType VTArg; | |
741 | #endif | |
f22341db | 742 | typedef char EntryType; |
743 | typedef short ConcatType; | |
744 | }; | |
745 | ||
746 | template<> struct VectorHelper<unsigned char> | |
747 | { | |
748 | typedef VectorTypeHelper<unsigned char>::Type VectorType; | |
c017a39f | 749 | #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN |
750 | typedef const VectorType & VTArg; | |
751 | #else | |
752 | typedef const VectorType VTArg; | |
753 | #endif | |
f22341db | 754 | typedef unsigned char EntryType; |
755 | typedef unsigned short ConcatType; | |
756 | }; | |
757 | ||
758 | } // namespace AVX | |
759 | } // namespace Vc | |
c017a39f | 760 | } // namespace AliRoot |
f22341db | 761 | |
762 | #include "vectorhelper.tcc" | |
c017a39f | 763 | #include "undomacros.h" |
f22341db | 764 | |
765 | #endif // AVX_VECTORHELPER_H |