/* This file is part of the Vc library.

    Copyright (C) 2009-2012 Matthias Kretz <kretz@kde.org>

    Vc is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as
    published by the Free Software Foundation, either version 3 of
    the License, or (at your option) any later version.

    Vc is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with Vc. If not, see <http://www.gnu.org/licenses/>.

*/

#ifndef AVX_VECTORHELPER_H
#define AVX_VECTORHELPER_H

#include <limits>
#include "types.h"
#include "intrinsics.h"
#include "casts.h"

namespace Vc
{
namespace AVX
{
#define OP0(name, code) static inline VectorType name() { return code; }
#define OP1(name, code) static inline VectorType name(const VectorType &a) { return code; }
#define OP2(name, code) static inline VectorType name(const VectorType &a, const VectorType &b) { return code; }
#define OP3(name, code) static inline VectorType name(const VectorType &a, const VectorType &b, const VectorType &c) { return code; }

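// Note: the OP0..OP3 macros above generate static member functions taking zero to
// three VectorType arguments; for example OP2(or_, _mm256_or_ps(a, b)) expands to
//     static inline VectorType or_(const VectorType &a, const VectorType &b) { return _mm256_or_ps(a, b); }
// The shuffle helpers in the specializations below (cdab, badc, aaaa, ..., dacb) appear
// to spell the element order of the result from the highest index down to element 0,
// with 'a' naming input element 0; for the 8-wide float/int vectors the four-letter
// pattern is applied to each 128-bit lane separately.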
template<> struct VectorHelper<_M256>
{
    typedef _M256 VectorType;
    template<typename A> static VectorType load(const float *x, A) PURE;
    static void store(float *mem, const VectorType x, AlignedFlag);
    static void store(float *mem, const VectorType x, UnalignedFlag);
    static void store(float *mem, const VectorType x, StreamingAndAlignedFlag);
    static void store(float *mem, const VectorType x, StreamingAndUnalignedFlag);
    static void store(float *mem, const VectorType x, const VectorType m, AlignedFlag);
    static void store(float *mem, const VectorType x, const VectorType m, UnalignedFlag);
    static void store(float *mem, const VectorType x, const VectorType m, StreamingAndAlignedFlag);
    static void store(float *mem, const VectorType x, const VectorType m, StreamingAndUnalignedFlag);

    static VectorType cdab(VectorType x) { return _mm256_permute_ps(x, _MM_SHUFFLE(2, 3, 0, 1)); }
    static VectorType badc(VectorType x) { return _mm256_permute_ps(x, _MM_SHUFFLE(1, 0, 3, 2)); }
    static VectorType aaaa(VectorType x) { return _mm256_permute_ps(x, _MM_SHUFFLE(0, 0, 0, 0)); }
    static VectorType bbbb(VectorType x) { return _mm256_permute_ps(x, _MM_SHUFFLE(1, 1, 1, 1)); }
    static VectorType cccc(VectorType x) { return _mm256_permute_ps(x, _MM_SHUFFLE(2, 2, 2, 2)); }
    static VectorType dddd(VectorType x) { return _mm256_permute_ps(x, _MM_SHUFFLE(3, 3, 3, 3)); }
    static VectorType dacb(VectorType x) { return _mm256_permute_ps(x, _MM_SHUFFLE(3, 0, 2, 1)); }

    OP0(allone, _mm256_setallone_ps())
    OP0(zero, _mm256_setzero_ps())
    OP2(or_, _mm256_or_ps(a, b))
    OP2(xor_, _mm256_xor_ps(a, b))
    OP2(and_, _mm256_and_ps(a, b))
    OP2(andnot_, _mm256_andnot_ps(a, b))
    OP3(blend, _mm256_blendv_ps(a, b, c))
};
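// The load/store members above are only declared here; their definitions presumably
// live in the "vectorhelper.tcc" file included at the end of this header.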

template<> struct VectorHelper<_M256D>
{
    typedef _M256D VectorType;
    template<typename A> static VectorType load(const double *x, A) PURE;
    static void store(double *mem, const VectorType x, AlignedFlag);
    static void store(double *mem, const VectorType x, UnalignedFlag);
    static void store(double *mem, const VectorType x, StreamingAndAlignedFlag);
    static void store(double *mem, const VectorType x, StreamingAndUnalignedFlag);
    static void store(double *mem, const VectorType x, const VectorType m, AlignedFlag);
    static void store(double *mem, const VectorType x, const VectorType m, UnalignedFlag);
    static void store(double *mem, const VectorType x, const VectorType m, StreamingAndAlignedFlag);
    static void store(double *mem, const VectorType x, const VectorType m, StreamingAndUnalignedFlag);

    static VectorType cdab(VectorType x) { return _mm256_permute_pd(x, 5); }
    static VectorType badc(VectorType x) { return _mm256_permute2f128_pd(x, x, 1); }
    // aaaa bbbb cccc dddd specialized in vector.tcc
    static VectorType dacb(VectorType x) {
        const __m128d cb = avx_cast<__m128d>(_mm_alignr_epi8(avx_cast<__m128i>(lo128(x)),
                avx_cast<__m128i>(hi128(x)), sizeof(double))); // XXX: lo and hi swapped?
        const __m128d da = _mm_blend_pd(lo128(x), hi128(x), 0 + 2); // XXX: lo and hi swapped?
        return concat(cb, da);
    }

    OP0(allone, _mm256_setallone_pd())
    OP0(zero, _mm256_setzero_pd())
    OP2(or_, _mm256_or_pd(a, b))
    OP2(xor_, _mm256_xor_pd(a, b))
    OP2(and_, _mm256_and_pd(a, b))
    OP2(andnot_, _mm256_andnot_pd(a, b))
    OP3(blend, _mm256_blendv_pd(a, b, c))
};

template<> struct VectorHelper<_M256I>
{
    typedef _M256I VectorType;
    template<typename T> static VectorType load(const T *x, AlignedFlag) PURE;
    template<typename T> static VectorType load(const T *x, UnalignedFlag) PURE;
    template<typename T> static VectorType load(const T *x, StreamingAndAlignedFlag) PURE;
    template<typename T> static VectorType load(const T *x, StreamingAndUnalignedFlag) PURE;
    template<typename T> static void store(T *mem, const VectorType x, AlignedFlag);
    template<typename T> static void store(T *mem, const VectorType x, UnalignedFlag);
    template<typename T> static void store(T *mem, const VectorType x, StreamingAndAlignedFlag);
    template<typename T> static void store(T *mem, const VectorType x, StreamingAndUnalignedFlag);
    template<typename T> static void store(T *mem, const VectorType x, const VectorType m, AlignedFlag);
    template<typename T> static void store(T *mem, const VectorType x, const VectorType m, UnalignedFlag);
    template<typename T> static void store(T *mem, const VectorType x, const VectorType m, StreamingAndAlignedFlag);
    template<typename T> static void store(T *mem, const VectorType x, const VectorType m, StreamingAndUnalignedFlag);

    static VectorType cdab(VectorType x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<__m256>(x), _MM_SHUFFLE(2, 3, 0, 1))); }
    static VectorType badc(VectorType x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<__m256>(x), _MM_SHUFFLE(1, 0, 3, 2))); }
    static VectorType aaaa(VectorType x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<__m256>(x), _MM_SHUFFLE(0, 0, 0, 0))); }
    static VectorType bbbb(VectorType x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<__m256>(x), _MM_SHUFFLE(1, 1, 1, 1))); }
    static VectorType cccc(VectorType x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<__m256>(x), _MM_SHUFFLE(2, 2, 2, 2))); }
    static VectorType dddd(VectorType x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<__m256>(x), _MM_SHUFFLE(3, 3, 3, 3))); }
    static VectorType dacb(VectorType x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<__m256>(x), _MM_SHUFFLE(3, 0, 2, 1))); }

    OP0(allone, _mm256_setallone_si256())
    OP0(zero, _mm256_setzero_si256())
    OP2(or_, _mm256_or_si256(a, b))
    OP2(xor_, _mm256_xor_si256(a, b))
    OP2(and_, _mm256_and_si256(a, b))
    OP2(andnot_, _mm256_andnot_si256(a, b))
    OP3(blend, _mm256_blendv_epi8(a, b, c))
};
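// Note: AVX (before AVX2) provides no 256-bit integer shuffle instructions, which is
// presumably why the _M256I helpers above reinterpret the register as __m256 via
// avx_cast<> and reuse _mm256_permute_ps for their permutations.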

template<> struct VectorHelper<__m128i>
{
    typedef __m128i VectorType;
    template<typename T> static VectorType load(const T *x, AlignedFlag) PURE;
    template<typename T> static VectorType load(const T *x, UnalignedFlag) PURE;
    template<typename T> static VectorType load(const T *x, StreamingAndAlignedFlag) PURE;
    template<typename T> static VectorType load(const T *x, StreamingAndUnalignedFlag) PURE;
    template<typename T> static void store(T *mem, const VectorType x, AlignedFlag);
    template<typename T> static void store(T *mem, const VectorType x, UnalignedFlag);
    template<typename T> static void store(T *mem, const VectorType x, StreamingAndAlignedFlag);
    template<typename T> static void store(T *mem, const VectorType x, StreamingAndUnalignedFlag);
    template<typename T> static void store(T *mem, const VectorType x, const VectorType m, AlignedFlag);
    template<typename T> static void store(T *mem, const VectorType x, const VectorType m, UnalignedFlag);
    template<typename T> static void store(T *mem, const VectorType x, const VectorType m, StreamingAndAlignedFlag);
    template<typename T> static void store(T *mem, const VectorType x, const VectorType m, StreamingAndUnalignedFlag);

    static VectorType cdab(VectorType x) { return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1)); }
    static VectorType badc(VectorType x) { return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, _MM_SHUFFLE(1, 0, 3, 2)), _MM_SHUFFLE(1, 0, 3, 2)); }
    static VectorType aaaa(VectorType x) { return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0)); }
    static VectorType bbbb(VectorType x) { return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, _MM_SHUFFLE(1, 1, 1, 1)), _MM_SHUFFLE(1, 1, 1, 1)); }
    static VectorType cccc(VectorType x) { return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(2, 2, 2, 2)); }
    static VectorType dddd(VectorType x) { return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)); }
    static VectorType dacb(VectorType x) { return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, _MM_SHUFFLE(3, 0, 2, 1)), _MM_SHUFFLE(3, 0, 2, 1)); }

    OP0(allone, _mm_setallone_si128())
    OP0(zero, _mm_setzero_si128())
    OP2(or_, _mm_or_si128(a, b))
    OP2(xor_, _mm_xor_si128(a, b))
    OP2(and_, _mm_and_si128(a, b))
    OP2(andnot_, _mm_andnot_si128(a, b))
    OP3(blend, _mm_blendv_epi8(a, b, c))
};
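// Note: in the 128-bit integer helper above, _mm_shufflelo_epi16 followed by
// _mm_shufflehi_epi16 with the same immediate applies the four-element pattern
// independently to the low and the high four 16-bit words of the vector.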
#undef OP1
#undef OP2
#undef OP3

#define OP1(op) \
        static inline VectorType INTRINSIC CONST op(const VectorType &a) { return CAT(_mm256_##op##_, SUFFIX)(a); }
#define OP(op) \
        static inline VectorType INTRINSIC CONST op(const VectorType &a, const VectorType &b) { return CAT(_mm256_##op##_, SUFFIX)(a, b); }
#define OP_(op) \
        static inline VectorType INTRINSIC CONST op(const VectorType &a, const VectorType &b) { return CAT(_mm256_##op, SUFFIX)(a, b); }
#define OPx(op, op2) \
        static inline VectorType INTRINSIC CONST op(const VectorType &a, const VectorType &b) { return CAT(_mm256_##op2##_, SUFFIX)(a, b); }
#define OPcmp(op) \
        static inline VectorType INTRINSIC CONST cmp##op(const VectorType &a, const VectorType &b) { return CAT(_mm256_cmp##op##_, SUFFIX)(a, b); }
#define OP_CAST_(op) \
        static inline VectorType INTRINSIC CONST op(const VectorType &a, const VectorType &b) { return CAT(_mm256_castps_, SUFFIX)( \
            _mm256_##op##ps(CAT(CAT(_mm256_cast, SUFFIX), _ps)(a), \
                CAT(CAT(_mm256_cast, SUFFIX), _ps)(b))); \
        }
#define MINMAX \
        static inline VectorType INTRINSIC CONST min(VectorType a, VectorType b) { return CAT(_mm256_min_, SUFFIX)(a, b); } \
        static inline VectorType INTRINSIC CONST max(VectorType a, VectorType b) { return CAT(_mm256_max_, SUFFIX)(a, b); }

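// Quick reference, derived from the macros above: with SUFFIX defined as pd, OP(add)
// produces add() calling _mm256_add_pd, OPcmp(eq) produces cmpeq() calling
// _mm256_cmpeq_pd, and MINMAX provides min()/max() on top of _mm256_min_pd and
// _mm256_max_pd. Names such as _mm256_cmpeq_pd or _mm256_setone_pd are not plain AVX
// intrinsics and are presumably wrappers supplied by the "intrinsics.h" included above.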
template<> struct VectorHelper<double> {
    typedef _M256D VectorType;
    typedef double EntryType;
    typedef double ConcatType;
#define SUFFIX pd

    static inline VectorType notMaskedToZero(VectorType a, _M256 mask) { return CAT(_mm256_and_, SUFFIX)(_mm256_castps_pd(mask), a); }
    static inline VectorType set(const double a) { return CAT(_mm256_set1_, SUFFIX)(a); }
    static inline VectorType set(const double a, const double b, const double c, const double d) {
        return CAT(_mm256_set_, SUFFIX)(a, b, c, d);
    }
    static inline VectorType zero() { return CAT(_mm256_setzero_, SUFFIX)(); }
    static inline VectorType one() { return CAT(_mm256_setone_, SUFFIX)(); } // set(1.);

    static inline void multiplyAndAdd(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }
    static inline VectorType mul(VectorType a, VectorType b, _M256 _mask) {
        _M256D mask = _mm256_castps_pd(_mask);
        return _mm256_or_pd(
            _mm256_and_pd(mask, _mm256_mul_pd(a, b)),
            _mm256_andnot_pd(mask, a)
            );
    }
    static inline VectorType div(VectorType a, VectorType b, _M256 _mask) {
        _M256D mask = _mm256_castps_pd(_mask);
        return _mm256_or_pd(
            _mm256_and_pd(mask, _mm256_div_pd(a, b)),
            _mm256_andnot_pd(mask, a)
            );
    }

    OP(add) OP(sub) OP(mul)
    OPcmp(eq) OPcmp(neq)
    OPcmp(lt) OPcmp(nlt)
    OPcmp(le) OPcmp(nle)

    OP1(sqrt)
    static inline VectorType rsqrt(VectorType x) {
        return _mm256_div_pd(one(), sqrt(x));
    }
    static inline VectorType reciprocal(VectorType x) {
        return _mm256_div_pd(one(), x);
    }
    static inline VectorType isNaN(VectorType x) {
        return _mm256_cmpunord_pd(x, x);
    }
    static inline VectorType isFinite(VectorType x) {
        return _mm256_cmpord_pd(x, _mm256_mul_pd(zero(), x));
    }
    static inline VectorType abs(const VectorType a) {
        return CAT(_mm256_and_, SUFFIX)(a, _mm256_setabsmask_pd());
    }

    MINMAX
    static inline EntryType min(VectorType a) {
        __m128d b = _mm_min_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
        b = _mm_min_sd(b, _mm_unpackhi_pd(b, b));
        return _mm_cvtsd_f64(b);
    }
    static inline EntryType max(VectorType a) {
        __m128d b = _mm_max_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
        b = _mm_max_sd(b, _mm_unpackhi_pd(b, b));
        return _mm_cvtsd_f64(b);
    }
    static inline EntryType mul(VectorType a) {
        __m128d b = _mm_mul_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
        b = _mm_mul_sd(b, _mm_shuffle_pd(b, b, _MM_SHUFFLE2(0, 1)));
        return _mm_cvtsd_f64(b);
    }
    static inline EntryType add(VectorType a) {
        __m128d b = _mm_add_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
        b = _mm_hadd_pd(b, b); // or: b = _mm_add_sd(b, _mm_shuffle_pd(b, b, _MM_SHUFFLE2(0, 1)));
        return _mm_cvtsd_f64(b);
    }
#undef SUFFIX
    static inline VectorType round(VectorType a) {
        return _mm256_round_pd(a, _MM_FROUND_NINT);
    }
};
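// Note: the horizontal min/max/mul/add reductions in this file share one pattern:
// fold the upper 128-bit half onto the lower half with a single extractf128 plus a
// binary operation, then finish the reduction inside the remaining SSE register with
// shuffles.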

template<> struct VectorHelper<float> {
    typedef float EntryType;
    typedef _M256 VectorType;
    typedef double ConcatType;
#define SUFFIX ps

    static inline VectorType notMaskedToZero(VectorType a, _M256 mask) { return CAT(_mm256_and_, SUFFIX)(mask, a); }
    static inline VectorType set(const float a) { return CAT(_mm256_set1_, SUFFIX)(a); }
    static inline VectorType set(const float a, const float b, const float c, const float d,
            const float e, const float f, const float g, const float h) {
        return CAT(_mm256_set_, SUFFIX)(a, b, c, d, e, f, g, h); }
    static inline VectorType zero() { return CAT(_mm256_setzero_, SUFFIX)(); }
    static inline VectorType one() { return CAT(_mm256_setone_, SUFFIX)(); } // set(1.f);
    static inline _M256 concat(_M256D a, _M256D b) { return _mm256_insertf128_ps(avx_cast<_M256>(_mm256_cvtpd_ps(a)), _mm256_cvtpd_ps(b), 1); }

    static inline void multiplyAndAdd(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }
    static inline VectorType mul(VectorType a, VectorType b, _M256 mask) {
        return _mm256_or_ps(
            _mm256_and_ps(mask, _mm256_mul_ps(a, b)),
            _mm256_andnot_ps(mask, a)
            );
    }
    static inline VectorType div(VectorType a, VectorType b, _M256 mask) {
        return _mm256_or_ps(
            _mm256_and_ps(mask, _mm256_div_ps(a, b)),
            _mm256_andnot_ps(mask, a)
            );
    }

    OP(add) OP(sub) OP(mul)
    OPcmp(eq) OPcmp(neq)
    OPcmp(lt) OPcmp(nlt)
    OPcmp(le) OPcmp(nle)

    OP1(sqrt) OP1(rsqrt)
    static inline VectorType isNaN(VectorType x) {
        return _mm256_cmpunord_ps(x, x);
    }
    static inline VectorType isFinite(VectorType x) {
        return _mm256_cmpord_ps(x, _mm256_mul_ps(zero(), x));
    }
    static inline VectorType reciprocal(VectorType x) {
        return _mm256_rcp_ps(x);
    }
    static inline VectorType abs(const VectorType a) {
        return CAT(_mm256_and_, SUFFIX)(a, _mm256_setabsmask_ps());
    }

    MINMAX
    static inline EntryType min(VectorType a) {
        __m128 b = _mm_min_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1));
        b = _mm_min_ps(b, _mm_movehl_ps(b, b)); // b = min(a0, a2), min(a1, a3), min(a2, a2), min(a3, a3)
        b = _mm_min_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1))); // b = min(a0, a1), a1, a2, a3
        return _mm_cvtss_f32(b);
    }
    static inline EntryType max(VectorType a) {
        __m128 b = _mm_max_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1));
        b = _mm_max_ps(b, _mm_movehl_ps(b, b)); // b = max(a0, a2), max(a1, a3), max(a2, a2), max(a3, a3)
        b = _mm_max_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1))); // b = max(a0, a1), a1, a2, a3
        return _mm_cvtss_f32(b);
    }
    static inline EntryType mul(VectorType a) {
        __m128 b = _mm_mul_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1));
        b = _mm_mul_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3)));
        b = _mm_mul_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1)));
        return _mm_cvtss_f32(b);
    }
    static inline EntryType add(VectorType a) {
        __m128 b = _mm_add_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1));
        b = _mm_add_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3)));
        b = _mm_add_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1)));
        return _mm_cvtss_f32(b);
    }
#undef SUFFIX
    static inline VectorType round(VectorType a) {
        return _mm256_round_ps(a, _MM_FROUND_NINT);
    }
};
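// Note: for float, reciprocal() and rsqrt() use the hardware approximation
// instructions _mm256_rcp_ps and _mm256_rsqrt_ps, whereas the double specialization
// above has no such instructions available and falls back to full-precision division
// by x and by sqrt(x).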

template<> struct VectorHelper<sfloat> : public VectorHelper<float> {};

template<> struct VectorHelper<int> {
    typedef int EntryType;
    typedef _M256I VectorType;
    typedef long long ConcatType;
#define SUFFIX si256

    OP_(or_) OP_(and_) OP_(xor_)
    static inline VectorType INTRINSIC CONST zero() { return CAT(_mm256_setzero_, SUFFIX)(); }
    static inline VectorType INTRINSIC CONST notMaskedToZero(VectorType a, _M256 mask) { return CAT(_mm256_and_, SUFFIX)(_mm256_castps_si256(mask), a); }
#undef SUFFIX
#define SUFFIX epi32
    static inline VectorType INTRINSIC CONST one() { return CAT(_mm256_setone_, SUFFIX)(); }

    static inline VectorType INTRINSIC CONST set(const int a) { return CAT(_mm256_set1_, SUFFIX)(a); }
    static inline VectorType INTRINSIC CONST set(const int a, const int b, const int c, const int d,
            const int e, const int f, const int g, const int h) {
        return CAT(_mm256_set_, SUFFIX)(a, b, c, d, e, f, g, h); }

    static inline void INTRINSIC CONST multiplyAndAdd(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }

    static inline VectorType shiftLeft(VectorType a, int shift) {
        return CAT(_mm256_slli_, SUFFIX)(a, shift);
    }
    static inline VectorType shiftRight(VectorType a, int shift) {
        return CAT(_mm256_srai_, SUFFIX)(a, shift);
    }
    OP1(abs)

    MINMAX
    static inline EntryType INTRINSIC CONST min(VectorType a) {
        __m128i b = _mm_min_epi32(avx_cast<__m128i>(a), _mm256_extractf128_si256(a, 1));
        b = _mm_min_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
        b = _mm_min_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here
        return _mm_cvtsi128_si32(b);
    }
    static inline EntryType INTRINSIC CONST max(VectorType a) {
        __m128i b = _mm_max_epi32(avx_cast<__m128i>(a), _mm256_extractf128_si256(a, 1));
        b = _mm_max_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
        b = _mm_max_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here
        return _mm_cvtsi128_si32(b);
    }
    static inline EntryType INTRINSIC CONST add(VectorType a) {
        __m128i b = _mm_add_epi32(avx_cast<__m128i>(a), _mm256_extractf128_si256(a, 1));
        b = _mm_add_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
        b = _mm_add_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2)));
        return _mm_cvtsi128_si32(b);
    }
    static inline EntryType INTRINSIC CONST mul(VectorType a) {
        __m128i b = _mm_mullo_epi32(avx_cast<__m128i>(a), _mm256_extractf128_si256(a, 1));
        b = _mm_mullo_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
        b = _mm_mullo_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2)));
        return _mm_cvtsi128_si32(b);
    }

    static inline VectorType INTRINSIC CONST mul(VectorType a, VectorType b) { return _mm256_mullo_epi32(a, b); }

    OP(add) OP(sub)
    OPcmp(eq)
    OPcmp(lt)
    OPcmp(gt)
    static inline VectorType INTRINSIC CONST cmpneq(const VectorType &a, const VectorType &b) { _M256I x = cmpeq(a, b); return _mm256_andnot_si256(x, _mm256_setallone_si256()); }
    static inline VectorType INTRINSIC CONST cmpnlt(const VectorType &a, const VectorType &b) { _M256I x = cmplt(a, b); return _mm256_andnot_si256(x, _mm256_setallone_si256()); }
    static inline VectorType INTRINSIC CONST cmple (const VectorType &a, const VectorType &b) { _M256I x = cmpgt(a, b); return _mm256_andnot_si256(x, _mm256_setallone_si256()); }
    static inline VectorType INTRINSIC CONST cmpnle(const VectorType &a, const VectorType &b) { return cmpgt(a, b); }
#undef SUFFIX
    static inline VectorType INTRINSIC CONST round(VectorType a) { return a; }
};
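// Note: only eq, lt and gt exist as packed signed 32-bit comparisons, so the remaining
// predicates above are synthesized by inverting them (andnot against an all-ones mask);
// cmpnle(a, b) reduces to cmpgt(a, b).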

template<> struct VectorHelper<unsigned int> {
    typedef unsigned int EntryType;
    typedef _M256I VectorType;
    typedef unsigned long long ConcatType;
#define SUFFIX si256
    OP_CAST_(or_) OP_CAST_(and_) OP_CAST_(xor_)
    static inline VectorType INTRINSIC CONST zero() { return CAT(_mm256_setzero_, SUFFIX)(); }
    static inline VectorType INTRINSIC CONST notMaskedToZero(VectorType a, _M256 mask) { return CAT(_mm256_and_, SUFFIX)(_mm256_castps_si256(mask), a); }

#undef SUFFIX
#define SUFFIX epu32
    static inline VectorType INTRINSIC CONST one() { return CAT(_mm256_setone_, SUFFIX)(); }

    MINMAX
    static inline EntryType INTRINSIC CONST min(VectorType a) {
        __m128i b = _mm_min_epu32(avx_cast<__m128i>(a), _mm256_extractf128_si256(a, 1));
        b = _mm_min_epu32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
        b = _mm_min_epu32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here
        return _mm_cvtsi128_si32(b);
    }
    static inline EntryType INTRINSIC CONST max(VectorType a) {
        __m128i b = _mm_max_epu32(avx_cast<__m128i>(a), _mm256_extractf128_si256(a, 1));
        b = _mm_max_epu32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
        b = _mm_max_epu32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here
        return _mm_cvtsi128_si32(b);
    }
    static inline EntryType INTRINSIC CONST add(VectorType a) {
        __m128i b = _mm_add_epi32(avx_cast<__m128i>(a), _mm256_extractf128_si256(a, 1));
        b = _mm_add_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
        b = _mm_add_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2)));
        return _mm_cvtsi128_si32(b);
    }
    static inline EntryType INTRINSIC CONST mul(VectorType a) {
        __m128i b = _mm_mullo_epi32(avx_cast<__m128i>(a), _mm256_extractf128_si256(a, 1));
        b = _mm_mullo_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
        b = _mm_mullo_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2)));
        return _mm_cvtsi128_si32(b);
    }

    static inline VectorType INTRINSIC CONST mul(VectorType a, VectorType b) { return _mm256_mullo_epi32(a, b); }

#undef SUFFIX
#define SUFFIX epi32
    static inline VectorType shiftLeft(VectorType a, int shift) {
        return CAT(_mm256_slli_, SUFFIX)(a, shift);
    }
    static inline VectorType shiftRight(VectorType a, int shift) {
        return CAT(_mm256_srli_, SUFFIX)(a, shift);
    }
    static inline VectorType INTRINSIC CONST set(const unsigned int a) { return CAT(_mm256_set1_, SUFFIX)(a); }
    static inline VectorType INTRINSIC CONST set(const unsigned int a, const unsigned int b, const unsigned int c, const unsigned int d,
            const unsigned int e, const unsigned int f, const unsigned int g, const unsigned int h) {
        return CAT(_mm256_set_, SUFFIX)(a, b, c, d, e, f, g, h); }

    OP(add) OP(sub)
    OPcmp(eq)
    static inline VectorType INTRINSIC CONST cmpneq(const VectorType &a, const VectorType &b) { return _mm256_andnot_si256(cmpeq(a, b), _mm256_setallone_si256()); }

#ifndef USE_INCORRECT_UNSIGNED_COMPARE
    static inline VectorType INTRINSIC CONST cmplt(const VectorType &a, const VectorType &b) {
        return _mm256_cmplt_epu32(a, b);
    }
    static inline VectorType INTRINSIC CONST cmpgt(const VectorType &a, const VectorType &b) {
        return _mm256_cmpgt_epu32(a, b);
    }
#else
    OPcmp(lt)
    OPcmp(gt)
#endif
    static inline VectorType INTRINSIC CONST cmpnlt(const VectorType &a, const VectorType &b) { return _mm256_andnot_si256(cmplt(a, b), _mm256_setallone_si256()); }
    static inline VectorType INTRINSIC CONST cmple (const VectorType &a, const VectorType &b) { return _mm256_andnot_si256(cmpgt(a, b), _mm256_setallone_si256()); }
    static inline VectorType INTRINSIC CONST cmpnle(const VectorType &a, const VectorType &b) { return cmpgt(a, b); }

#undef SUFFIX
    static inline VectorType INTRINSIC CONST round(VectorType a) { return a; }
};
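// Note: SSE/AVX only provide signed integer comparisons. The default branch above uses
// _mm256_cmplt_epu32/_mm256_cmpgt_epu32, presumably unsigned-correct wrappers from
// "intrinsics.h"; defining USE_INCORRECT_UNSIGNED_COMPARE falls back to the signed
// epi32 compares, which give wrong results whenever the operands differ in their most
// significant bit.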

template<> struct VectorHelper<signed short> {
    typedef VectorTypeHelper<signed short>::Type VectorType;
    typedef signed short EntryType;
    typedef int ConcatType;

    static inline VectorType INTRINSIC CONST or_(VectorType a, VectorType b) { return _mm_or_si128(a, b); }
    static inline VectorType INTRINSIC CONST and_(VectorType a, VectorType b) { return _mm_and_si128(a, b); }
    static inline VectorType INTRINSIC CONST xor_(VectorType a, VectorType b) { return _mm_xor_si128(a, b); }
    static inline VectorType INTRINSIC CONST zero() { return _mm_setzero_si128(); }
    static inline VectorType INTRINSIC CONST notMaskedToZero(VectorType a, __m128 mask) { return _mm_and_si128(_mm_castps_si128(mask), a); }

#define SUFFIX epi16
    static inline VectorType INTRINSIC CONST one() { return CAT(_mm_setone_, SUFFIX)(); }

    static inline VectorType shiftLeft(VectorType a, int shift) {
        return CAT(_mm_slli_, SUFFIX)(a, shift);
    }
    static inline VectorType shiftRight(VectorType a, int shift) {
        return CAT(_mm_srai_, SUFFIX)(a, shift);
    }
    static inline VectorType INTRINSIC CONST set(const EntryType a) { return CAT(_mm_set1_, SUFFIX)(a); }
    static inline VectorType INTRINSIC CONST set(const EntryType a, const EntryType b, const EntryType c, const EntryType d,
            const EntryType e, const EntryType f, const EntryType g, const EntryType h) {
        return CAT(_mm_set_, SUFFIX)(a, b, c, d, e, f, g, h);
    }

    static inline void INTRINSIC CONST multiplyAndAdd(VectorType &v1, VectorType v2, VectorType v3) {
        v1 = add(mul(v1, v2), v3); }

    static inline VectorType INTRINSIC CONST abs(VectorType a) { return _mm_abs_epi16(a); }
    static inline VectorType INTRINSIC CONST mul(VectorType a, VectorType b) { return _mm_mullo_epi16(a, b); }
    static inline VectorType INTRINSIC CONST min(VectorType a, VectorType b) { return _mm_min_epi16(a, b); }
    static inline VectorType INTRINSIC CONST max(VectorType a, VectorType b) { return _mm_max_epi16(a, b); }

    static inline EntryType INTRINSIC CONST min(VectorType a) {
        // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
        a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
        return _mm_cvtsi128_si32(a); // & 0xffff is implicit
    }
    static inline EntryType INTRINSIC CONST max(VectorType a) {
        // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
        a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
        return _mm_cvtsi128_si32(a); // & 0xffff is implicit
    }
    static inline EntryType INTRINSIC CONST mul(VectorType a) {
        a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
        return _mm_cvtsi128_si32(a); // & 0xffff is implicit
    }
    static inline EntryType INTRINSIC CONST add(VectorType a) {
        a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
        return _mm_cvtsi128_si32(a); // & 0xffff is implicit
    }

    static inline VectorType INTRINSIC CONST add(VectorType a, VectorType b) { return _mm_add_epi16(a, b); }
    static inline VectorType INTRINSIC CONST sub(VectorType a, VectorType b) { return _mm_sub_epi16(a, b); }
    static inline VectorType INTRINSIC CONST cmpeq(VectorType a, VectorType b) { return _mm_cmpeq_epi16(a, b); }
    static inline VectorType INTRINSIC CONST cmplt(VectorType a, VectorType b) { return _mm_cmplt_epi16(a, b); }
    static inline VectorType INTRINSIC CONST cmpgt(VectorType a, VectorType b) { return _mm_cmpgt_epi16(a, b); }
    static inline VectorType cmpneq(const VectorType &a, const VectorType &b) { __m128i x = cmpeq(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
    static inline VectorType cmpnlt(const VectorType &a, const VectorType &b) { __m128i x = cmplt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
    static inline VectorType cmple (const VectorType &a, const VectorType &b) { __m128i x = cmpgt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
    static inline VectorType cmpnle(const VectorType &a, const VectorType &b) { return cmpgt(a, b); }
#undef SUFFIX
    static inline VectorType round(VectorType a) { return a; }
};
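// Note on the 16-bit reductions above: after the folds with _mm_shuffle_epi32 and
// _mm_shufflelo_epi16 the result sits in the lowest 16-bit word; _mm_cvtsi128_si32
// returns the low 32 bits and the "& 0xffff is implicit" comments refer to the
// truncation that happens when that int is converted to the 16-bit EntryType.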

template<> struct VectorHelper<unsigned short> {
    typedef VectorTypeHelper<unsigned short>::Type VectorType;
    typedef unsigned short EntryType;
    typedef unsigned int ConcatType;

    static inline VectorType INTRINSIC CONST or_(VectorType a, VectorType b) { return _mm_or_si128(a, b); }
    static inline VectorType INTRINSIC CONST and_(VectorType a, VectorType b) { return _mm_and_si128(a, b); }
    static inline VectorType INTRINSIC CONST xor_(VectorType a, VectorType b) { return _mm_xor_si128(a, b); }
    static inline VectorType INTRINSIC CONST zero() { return _mm_setzero_si128(); }
    static inline VectorType INTRINSIC CONST notMaskedToZero(VectorType a, __m128 mask) { return _mm_and_si128(_mm_castps_si128(mask), a); }
    static inline VectorType INTRINSIC CONST one() { return _mm_setone_epu16(); }

    static inline VectorType INTRINSIC CONST mul(VectorType a, VectorType b) { return _mm_mullo_epi16(a, b); }
    static inline VectorType INTRINSIC CONST min(VectorType a, VectorType b) { return _mm_min_epu16(a, b); }
    static inline VectorType INTRINSIC CONST max(VectorType a, VectorType b) { return _mm_max_epu16(a, b); }

#define SUFFIX epi16
    static inline VectorType shiftLeft(VectorType a, int shift) {
        return CAT(_mm_slli_, SUFFIX)(a, shift);
    }
    static inline VectorType shiftRight(VectorType a, int shift) {
        return CAT(_mm_srli_, SUFFIX)(a, shift);
    }
    static inline EntryType INTRINSIC CONST min(VectorType a) {
        // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
        a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
        return _mm_cvtsi128_si32(a); // & 0xffff is implicit
    }
    static inline EntryType INTRINSIC CONST max(VectorType a) {
        // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
        a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
        return _mm_cvtsi128_si32(a); // & 0xffff is implicit
    }
    static inline EntryType INTRINSIC CONST mul(VectorType a) {
        // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
        a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
        return _mm_cvtsi128_si32(a); // & 0xffff is implicit
    }
    static inline EntryType INTRINSIC CONST add(VectorType a) {
        // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
        a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
        return _mm_cvtsi128_si32(a); // & 0xffff is implicit
    }
    static inline VectorType INTRINSIC CONST set(const EntryType a) { return CAT(_mm_set1_, SUFFIX)(a); }
    static inline VectorType INTRINSIC CONST set(const EntryType a, const EntryType b, const EntryType c,
            const EntryType d, const EntryType e, const EntryType f,
            const EntryType g, const EntryType h) {
        return CAT(_mm_set_, SUFFIX)(a, b, c, d, e, f, g, h);
    }

    static inline VectorType INTRINSIC CONST add(VectorType a, VectorType b) { return _mm_add_epi16(a, b); }
    static inline VectorType INTRINSIC CONST sub(VectorType a, VectorType b) { return _mm_sub_epi16(a, b); }
    static inline VectorType INTRINSIC CONST cmpeq(VectorType a, VectorType b) { return _mm_cmpeq_epi16(a, b); }
    static inline VectorType cmpneq(const VectorType &a, const VectorType &b) { return _mm_andnot_si128(cmpeq(a, b), _mm_setallone_si128()); }

#ifndef USE_INCORRECT_UNSIGNED_COMPARE
    static inline VectorType INTRINSIC CONST cmplt(VectorType a, VectorType b) { return _mm_cmplt_epu16(a, b); }
    static inline VectorType INTRINSIC CONST cmpgt(VectorType a, VectorType b) { return _mm_cmpgt_epu16(a, b); }
#else
    static inline VectorType INTRINSIC CONST cmplt(VectorType a, VectorType b) { return _mm_cmplt_epi16(a, b); }
    static inline VectorType INTRINSIC CONST cmpgt(VectorType a, VectorType b) { return _mm_cmpgt_epi16(a, b); }
#endif
    static inline VectorType cmpnlt(const VectorType &a, const VectorType &b) { return _mm_andnot_si128(cmplt(a, b), _mm_setallone_si128()); }
    static inline VectorType cmple (const VectorType &a, const VectorType &b) { return _mm_andnot_si128(cmpgt(a, b), _mm_setallone_si128()); }
    static inline VectorType cmpnle(const VectorType &a, const VectorType &b) { return cmpgt(a, b); }
#undef SUFFIX
    static inline VectorType round(VectorType a) { return a; }
};
#undef OP1
#undef OP
#undef OP_
#undef OPx
#undef OPcmp

template<> struct VectorHelper<char>
{
    typedef VectorTypeHelper<char>::Type VectorType;
    typedef char EntryType;
    typedef short ConcatType;
};

template<> struct VectorHelper<unsigned char>
{
    typedef VectorTypeHelper<unsigned char>::Type VectorType;
    typedef unsigned char EntryType;
    typedef unsigned short ConcatType;
};

} // namespace AVX
} // namespace Vc

#include "vectorhelper.tcc"

#endif // AVX_VECTORHELPER_H