/*  This file is part of the Vc library.

    Copyright (C) 2009-2012 Matthias Kretz <kretz@kde.org>

    Vc is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as
    published by the Free Software Foundation, either version 3 of
    the License, or (at your option) any later version.

    Vc is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with Vc.  If not, see <http://www.gnu.org/licenses/>.

*/

#ifndef SSE_INTRINSICS_H
#define SSE_INTRINSICS_H

#include "../common/windows_fix_intrin.h"
// The GCC xxxintrin.h headers do not make sure that the intrinsics have C linkage. This is not
// really a problem, unless there is another place where the exact same functions are declared.
// Then the linkage must be the same, otherwise it won't compile. Such a case occurs on Windows,
// where the intrin.h header (included indirectly via unistd.h) declares many SSE intrinsics again.
extern "C" {
// MMX
#include <mmintrin.h>
// SSE
#include <xmmintrin.h>
// SSE2
#include <emmintrin.h>
}

#include "../common/fix_clang_emmintrin.h"

#if defined(__GNUC__) && !defined(VC_IMPL_SSE2)
#error "SSE Vector class needs at least SSE2"
#endif

#include "const_data.h"
#include <cstdlib>
#include "macros.h"

#ifdef __3dNOW__
extern "C" {
#include <mm3dnow.h>
}
#endif

namespace AliRoot {
namespace Vc
{
namespace SSE
{
    enum VectorAlignmentEnum { VectorAlignment = 16 };

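    // Presumably a workaround for the shift intrinsics in GCC releases before 4.6 (hence the
    // VC_DONT_FIX_SSE_SHIFT escape hatch): force the plain instruction via inline assembly.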
#if defined(VC_GCC) && VC_GCC < 0x40600 && !defined(VC_DONT_FIX_SSE_SHIFT)
    static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi16(__m128i a, __m128i count) { __asm__("psllw %1,%0" : "+x"(a) : "x"(count)); return a; }
    static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi32(__m128i a, __m128i count) { __asm__("pslld %1,%0" : "+x"(a) : "x"(count)); return a; }
    static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi64(__m128i a, __m128i count) { __asm__("psllq %1,%0" : "+x"(a) : "x"(count)); return a; }
    static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi16(__m128i a, __m128i count) { __asm__("psrlw %1,%0" : "+x"(a) : "x"(count)); return a; }
    static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi32(__m128i a, __m128i count) { __asm__("psrld %1,%0" : "+x"(a) : "x"(count)); return a; }
    static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi64(__m128i a, __m128i count) { __asm__("psrlq %1,%0" : "+x"(a) : "x"(count)); return a; }
#endif

#ifdef VC_GCC
    // Redefine the mul/add/sub intrinsics to use GCC-specific operators instead of builtin
    // functions. This way the fp-contraction optimization step kicks in and creates FMAs! :)
    static Vc_INTRINSIC Vc_CONST __m128d _mm_mul_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) * static_cast<__v2df>(b)); }
    static Vc_INTRINSIC Vc_CONST __m128d _mm_add_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) + static_cast<__v2df>(b)); }
    static Vc_INTRINSIC Vc_CONST __m128d _mm_sub_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) - static_cast<__v2df>(b)); }
    static Vc_INTRINSIC Vc_CONST __m128  _mm_mul_ps(__m128  a, __m128  b) { return static_cast<__m128 >(static_cast<__v4sf>(a) * static_cast<__v4sf>(b)); }
    static Vc_INTRINSIC Vc_CONST __m128  _mm_add_ps(__m128  a, __m128  b) { return static_cast<__m128 >(static_cast<__v4sf>(a) + static_cast<__v4sf>(b)); }
    static Vc_INTRINSIC Vc_CONST __m128  _mm_sub_ps(__m128  a, __m128  b) { return static_cast<__m128 >(static_cast<__v4sf>(a) - static_cast<__v4sf>(b)); }
#endif

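    // A minimal sketch of the intended effect (hypothetical caller code, not part of this
    // header): with fp-contraction enabled and an FMA-capable target, GCC may fuse the two
    // operations below into a single fused multiply-add instruction:
    //   __m128 r = _mm_add_ps(_mm_mul_ps(x, y), z); // mul + add -> one FMA

    // _mm_setallone returns a vector with all 128 bits set.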
#if defined(VC_GNU_ASM) && !defined(NVALGRIND)
    static Vc_INTRINSIC __m128i Vc_CONST _mm_setallone() { __m128i r; __asm__("pcmpeqb %0,%0":"=x"(r)); return r; }
#else
    static Vc_INTRINSIC __m128i Vc_CONST _mm_setallone() { __m128i r = _mm_setzero_si128(); return _mm_cmpeq_epi8(r, r); }
#endif
    static Vc_INTRINSIC __m128i Vc_CONST _mm_setallone_si128() { return _mm_setallone(); }
    static Vc_INTRINSIC __m128d Vc_CONST _mm_setallone_pd() { return _mm_castsi128_pd(_mm_setallone()); }
    static Vc_INTRINSIC __m128  Vc_CONST _mm_setallone_ps() { return _mm_castsi128_ps(_mm_setallone()); }

    static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi8 ()  { return _mm_set1_epi8(1); }
    static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu8 ()  { return _mm_setone_epi8(); }
    static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi16()  { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::one16)); }
    static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu16()  { return _mm_setone_epi16(); }
    static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi32()  { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::one32)); }
    static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu32()  { return _mm_setone_epi32(); }

    static Vc_INTRINSIC __m128  Vc_CONST _mm_setone_ps()     { return _mm_load_ps(c_general::oneFloat); }
    static Vc_INTRINSIC __m128d Vc_CONST _mm_setone_pd()     { return _mm_load_pd(c_general::oneDouble); }

    static Vc_INTRINSIC __m128d Vc_CONST _mm_setabsmask_pd() { return _mm_load_pd(reinterpret_cast<const double *>(c_general::absMaskDouble)); }
    static Vc_INTRINSIC __m128  Vc_CONST _mm_setabsmask_ps() { return _mm_load_ps(reinterpret_cast<const float *>(c_general::absMaskFloat)); }
    static Vc_INTRINSIC __m128d Vc_CONST _mm_setsignmask_pd(){ return _mm_load_pd(reinterpret_cast<const double *>(c_general::signMaskDouble)); }
    static Vc_INTRINSIC __m128  Vc_CONST _mm_setsignmask_ps(){ return _mm_load_ps(reinterpret_cast<const float *>(c_general::signMaskFloat)); }

    //X static Vc_INTRINSIC __m128i Vc_CONST _mm_setmin_epi8 () { return _mm_slli_epi8 (_mm_setallone_si128(),  7); }
    static Vc_INTRINSIC __m128i Vc_CONST _mm_setmin_epi16() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::minShort)); }
    static Vc_INTRINSIC __m128i Vc_CONST _mm_setmin_epi32() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::signMaskFloat)); }

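    // The unsigned compares below are emulated by biasing both operands into the signed
    // domain: XORing with the type's minimum value flips the sign bit, after which the
    // signed comparison yields the unsigned ordering. E.g. for 16-bit operands,
    // 0x0000 ^ 0x8000 = INT16_MIN and 0xFFFF ^ 0x8000 = INT16_MAX, so order is preserved.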
    //X static Vc_INTRINSIC __m128i Vc_CONST _mm_cmplt_epu8 (__m128i a, __m128i b) { return _mm_cmplt_epi8 (
    //X         _mm_xor_si128(a, _mm_setmin_epi8 ()), _mm_xor_si128(b, _mm_setmin_epi8 ())); }
    //X static Vc_INTRINSIC __m128i Vc_CONST _mm_cmpgt_epu8 (__m128i a, __m128i b) { return _mm_cmpgt_epi8 (
    //X         _mm_xor_si128(a, _mm_setmin_epi8 ()), _mm_xor_si128(b, _mm_setmin_epi8 ())); }
    static Vc_INTRINSIC __m128i Vc_CONST _mm_cmplt_epu16(__m128i a, __m128i b) { return _mm_cmplt_epi16(
            _mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16())); }
    static Vc_INTRINSIC __m128i Vc_CONST _mm_cmpgt_epu16(__m128i a, __m128i b) { return _mm_cmpgt_epi16(
            _mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16())); }
    static Vc_INTRINSIC __m128i Vc_CONST _mm_cmplt_epu32(__m128i a, __m128i b) { return _mm_cmplt_epi32(
            _mm_xor_si128(a, _mm_setmin_epi32()), _mm_xor_si128(b, _mm_setmin_epi32())); }
    static Vc_INTRINSIC __m128i Vc_CONST _mm_cmpgt_epu32(__m128i a, __m128i b) { return _mm_cmpgt_epi32(
            _mm_xor_si128(a, _mm_setmin_epi32()), _mm_xor_si128(b, _mm_setmin_epi32())); }
} // namespace SSE
} // namespace Vc
} // namespace AliRoot

// SSE3
#ifdef VC_IMPL_SSE3
extern "C" {
#include <pmmintrin.h>
}
#elif defined _PMMINTRIN_H_INCLUDED
#error "SSE3 was disabled but something includes <pmmintrin.h>. Please fix your code."
#endif
// SSSE3
#ifdef VC_IMPL_SSSE3
extern "C" {
#include <tmmintrin.h>
}
namespace AliRoot {
namespace Vc
{
namespace SSE
{

    // not overriding _mm_set1_epi8 because this one should only be used for non-constants
    static Vc_INTRINSIC __m128i Vc_CONST set1_epi8(int a) {
#if defined(VC_GCC) && VC_GCC < 0x40500
        return _mm_shuffle_epi8(_mm_cvtsi32_si128(a), _mm_setzero_si128());
#else
        // GCC 4.5 knows about the pshufb improvement
        return _mm_set1_epi8(a);
#endif
    }

} // namespace SSE
} // namespace Vc
} // namespace AliRoot
#elif defined _TMMINTRIN_H_INCLUDED
#error "SSSE3 was disabled but something includes <tmmintrin.h>. Please fix your code."
#else
namespace AliRoot {
namespace Vc
{
namespace SSE
{
    static Vc_INTRINSIC __m128i Vc_CONST _mm_abs_epi8 (__m128i a) {
        __m128i negative = _mm_cmplt_epi8 (a, _mm_setzero_si128());
        return _mm_add_epi8 (_mm_xor_si128(a, negative), _mm_and_si128(negative, _mm_setone_epi8()));
    }
    // The abs fallbacks xor with the sign mask and then add 1 where the value was negative:
    // positive value:
    //   negative == 0
    //   a unchanged after xor
    //   a + 0 -> a
    // negative value:
    //   negative == -1
    //   a xor -1 -> -a - 1
    //   -1 >> 15 (or 31) -> 1; for epi8, -1 & 1 -> 1 (there is no 8-bit shift)
    //   -a - 1 + 1 -> -a
    static Vc_INTRINSIC __m128i Vc_CONST _mm_abs_epi16(__m128i a) {
        __m128i negative = _mm_cmplt_epi16(a, _mm_setzero_si128());
        return _mm_add_epi16(_mm_xor_si128(a, negative), _mm_srli_epi16(negative, 15));
    }
    static Vc_INTRINSIC __m128i Vc_CONST _mm_abs_epi32(__m128i a) {
        __m128i negative = _mm_cmplt_epi32(a, _mm_setzero_si128());
        return _mm_add_epi32(_mm_xor_si128(a, negative), _mm_srli_epi32(negative, 31));
    }
    static Vc_INTRINSIC __m128i Vc_CONST set1_epi8(int a) {
        return _mm_set1_epi8(a);
    }
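    // SSSE3 palignr fallback: conceptually concatenates a (high half) and b (low half) into
    // a 32-byte value and shifts it right by s bytes, returning the low 16 bytes. The switch
    // folds to the two byte shifts and an OR when s is a compile-time constant.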
    static Vc_INTRINSIC __m128i Vc_CONST _mm_alignr_epi8(__m128i a, __m128i b, const int s) {
        switch (s) {
            case  0: return b;
            case  1: return _mm_or_si128(_mm_slli_si128(a, 15), _mm_srli_si128(b,  1));
            case  2: return _mm_or_si128(_mm_slli_si128(a, 14), _mm_srli_si128(b,  2));
            case  3: return _mm_or_si128(_mm_slli_si128(a, 13), _mm_srli_si128(b,  3));
            case  4: return _mm_or_si128(_mm_slli_si128(a, 12), _mm_srli_si128(b,  4));
            case  5: return _mm_or_si128(_mm_slli_si128(a, 11), _mm_srli_si128(b,  5));
            case  6: return _mm_or_si128(_mm_slli_si128(a, 10), _mm_srli_si128(b,  6));
            case  7: return _mm_or_si128(_mm_slli_si128(a,  9), _mm_srli_si128(b,  7));
            case  8: return _mm_or_si128(_mm_slli_si128(a,  8), _mm_srli_si128(b,  8));
            case  9: return _mm_or_si128(_mm_slli_si128(a,  7), _mm_srli_si128(b,  9));
            case 10: return _mm_or_si128(_mm_slli_si128(a,  6), _mm_srli_si128(b, 10));
            case 11: return _mm_or_si128(_mm_slli_si128(a,  5), _mm_srli_si128(b, 11));
            case 12: return _mm_or_si128(_mm_slli_si128(a,  4), _mm_srli_si128(b, 12));
            case 13: return _mm_or_si128(_mm_slli_si128(a,  3), _mm_srli_si128(b, 13));
            case 14: return _mm_or_si128(_mm_slli_si128(a,  2), _mm_srli_si128(b, 14));
            case 15: return _mm_or_si128(_mm_slli_si128(a,  1), _mm_srli_si128(b, 15));
            case 16: return a;
            case 17: return _mm_srli_si128(a,  1);
            case 18: return _mm_srli_si128(a,  2);
            case 19: return _mm_srli_si128(a,  3);
            case 20: return _mm_srli_si128(a,  4);
            case 21: return _mm_srli_si128(a,  5);
            case 22: return _mm_srli_si128(a,  6);
            case 23: return _mm_srli_si128(a,  7);
            case 24: return _mm_srli_si128(a,  8);
            case 25: return _mm_srli_si128(a,  9);
            case 26: return _mm_srli_si128(a, 10);
            case 27: return _mm_srli_si128(a, 11);
            case 28: return _mm_srli_si128(a, 12);
            case 29: return _mm_srli_si128(a, 13);
            case 30: return _mm_srli_si128(a, 14);
            case 31: return _mm_srli_si128(a, 15);
        }
        return _mm_setzero_si128();
    }

} // namespace SSE
} // namespace Vc
} // namespace AliRoot

#endif

// SSE4.1
#ifdef VC_IMPL_SSE4_1
extern "C" {
#include <smmintrin.h>
}
#else
#ifdef _SMMINTRIN_H_INCLUDED
#error "SSE4.1 was disabled but something includes <smmintrin.h>. Please fix your code."
#endif
namespace AliRoot {
namespace Vc
{
namespace SSE
{
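    // blendv fallbacks: take bits from b where the mask c is set and from a where it is
    // clear. Unlike the real SSE4.1 instructions, which only inspect the sign bit of each
    // element, these use every mask bit, so pass only all-zeros/all-ones element masks.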
    static Vc_INTRINSIC __m128d _mm_blendv_pd(__m128d a, __m128d b, __m128d c) {
        return _mm_or_pd(_mm_andnot_pd(c, a), _mm_and_pd(c, b));
    }
    static Vc_INTRINSIC __m128  _mm_blendv_ps(__m128  a, __m128  b, __m128  c) {
        return _mm_or_ps(_mm_andnot_ps(c, a), _mm_and_ps(c, b));
    }
    static Vc_INTRINSIC __m128i _mm_blendv_epi8(__m128i a, __m128i b, __m128i c) {
        return _mm_or_si128(_mm_andnot_si128(c, a), _mm_and_si128(c, b));
    }

    // Use the following blend functions only with compile-time constants as the mask and,
    // of course, compile with optimization enabled so the switch folds away.
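    // Mask semantics match the SSE4.1 blends: bit i set selects element i from b, bit i
    // clear selects it from a. A hypothetical usage sketch (not part of this header):
    //   __m128d r = _mm_blend_pd(a, b, 0x2); // -> [ a0, b1 ]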
    static Vc_INTRINSIC __m128d _mm_blend_pd(__m128d a, __m128d b, const int mask) {
        switch (mask) {
        case 0x0:
            return a;
        case 0x1:
            return _mm_shuffle_pd(b, a, 2);
        case 0x2:
            return _mm_shuffle_pd(a, b, 2);
        case 0x3:
            return b;
        default:
            abort();
            return a; // never reached, but MSVC needs it, otherwise it warns about 'not all control paths return a value'
        }
    }
    static Vc_INTRINSIC __m128 _mm_blend_ps(__m128 a, __m128 b, const int mask) {
        __m128i c;
        switch (mask) {
        case 0x0:
            return a;
        case 0x1:
            c = _mm_srli_si128(_mm_setallone_si128(), 12);
            break;
        case 0x2:
            c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 12), 4);
            break;
        case 0x3:
            c = _mm_srli_si128(_mm_setallone_si128(), 8);
            break;
        case 0x4:
            c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 12), 8);
            break;
        case 0x5:
            c = _mm_set_epi32(0, -1, 0, -1);
            break;
        case 0x6:
            c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 8), 4);
            break;
        case 0x7:
            c = _mm_srli_si128(_mm_setallone_si128(), 4);
            break;
        case 0x8:
            c = _mm_slli_si128(_mm_setallone_si128(), 12);
            break;
        case 0x9:
            c = _mm_set_epi32(-1, 0, 0, -1);
            break;
        case 0xa:
            c = _mm_set_epi32(-1, 0, -1, 0);
            break;
        case 0xb:
            c = _mm_set_epi32(-1, 0, -1, -1);
            break;
        case 0xc:
            c = _mm_slli_si128(_mm_setallone_si128(), 8);
            break;
        case 0xd:
            c = _mm_set_epi32(-1, -1, 0, -1);
            break;
        case 0xe:
            c = _mm_slli_si128(_mm_setallone_si128(), 4);
            break;
        case 0xf:
            return b;
        default: // cannot happen with a valid immediate mask
            abort();
            c = _mm_setzero_si128();
            break;
        }
        __m128 _c = _mm_castsi128_ps(c);
        return _mm_or_ps(_mm_andnot_ps(_c, a), _mm_and_ps(_c, b));
    }
    static Vc_INTRINSIC __m128i _mm_blend_epi16(__m128i a, __m128i b, const int mask) {
        __m128i c;
        switch (mask) {
        case 0x00:
            return a;
        case 0x01:
            c = _mm_srli_si128(_mm_setallone_si128(), 14);
            break;
        case 0x03:
            c = _mm_srli_si128(_mm_setallone_si128(), 12);
            break;
        case 0x07:
            c = _mm_srli_si128(_mm_setallone_si128(), 10);
            break;
        case 0x0f:
            return _mm_unpackhi_epi64(_mm_slli_si128(b, 8), a);
        case 0x1f:
            c = _mm_srli_si128(_mm_setallone_si128(), 6);
            break;
        case 0x3f:
            c = _mm_srli_si128(_mm_setallone_si128(), 4);
            break;
        case 0x7f:
            c = _mm_srli_si128(_mm_setallone_si128(), 2);
            break;
        case 0x80:
            c = _mm_slli_si128(_mm_setallone_si128(), 14);
            break;
        case 0xc0:
            c = _mm_slli_si128(_mm_setallone_si128(), 12);
            break;
        case 0xe0:
            c = _mm_slli_si128(_mm_setallone_si128(), 10);
            break;
        case 0xf0:
            c = _mm_slli_si128(_mm_setallone_si128(), 8);
            break;
        case 0xf8:
            c = _mm_slli_si128(_mm_setallone_si128(), 6);
            break;
        case 0xfc:
            c = _mm_slli_si128(_mm_setallone_si128(), 4);
            break;
        case 0xfe:
            c = _mm_slli_si128(_mm_setallone_si128(), 2);
            break;
        case 0xff:
            return b;
        case 0xcc:
            return _mm_unpacklo_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 1, 3, 1)));
        case 0x33:
            return _mm_unpacklo_epi32(_mm_shuffle_epi32(b, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 3, 1)));
        default:
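            // Generic path: multiplying the broadcast mask by this power-of-two table moves
            // mask bit i into the sign bit of word i (word 0 uses -0x7fff == 0x8001 because
            // 0x8000 does not fit in a signed short; the extra +1 cannot reach bit 15), and
            // the arithmetic shift then broadcasts each sign bit into a full 0/-1 word mask.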
            const __m128i shift = _mm_set_epi16(0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000, -0x7fff);
            c = _mm_srai_epi16(_mm_mullo_epi16(_mm_set1_epi16(mask), shift), 15);
            break;
        }
        return _mm_or_si128(_mm_andnot_si128(c, a), _mm_and_si128(c, b));
    }

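    // SSE4.1 min/max fallbacks: build a per-element mask with a compare, then blend so the
    // greater (max) or smaller (min) element of each lane is selected.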
    static Vc_INTRINSIC __m128i Vc_CONST _mm_max_epi8 (__m128i a, __m128i b) {
        return _mm_blendv_epi8(b, a, _mm_cmpgt_epi8 (a, b));
    }
    static Vc_INTRINSIC __m128i Vc_CONST _mm_max_epi32(__m128i a, __m128i b) {
        return _mm_blendv_epi8(b, a, _mm_cmpgt_epi32(a, b));
    }
//X     static Vc_INTRINSIC __m128i Vc_CONST _mm_max_epu8 (__m128i a, __m128i b) {
//X         return _mm_blendv_epi8(b, a, _mm_cmpgt_epu8 (a, b));
//X     }
    static Vc_INTRINSIC __m128i Vc_CONST _mm_max_epu16(__m128i a, __m128i b) {
        return _mm_blendv_epi8(b, a, _mm_cmpgt_epu16(a, b));
    }
    static Vc_INTRINSIC __m128i Vc_CONST _mm_max_epu32(__m128i a, __m128i b) {
        return _mm_blendv_epi8(b, a, _mm_cmpgt_epu32(a, b));
    }
//X     static Vc_INTRINSIC __m128i Vc_CONST _mm_min_epu8 (__m128i a, __m128i b) {
//X         return _mm_blendv_epi8(a, b, _mm_cmpgt_epu8 (a, b));
//X     }
    static Vc_INTRINSIC __m128i Vc_CONST _mm_min_epu16(__m128i a, __m128i b) {
        return _mm_blendv_epi8(a, b, _mm_cmpgt_epu16(a, b));
    }
    static Vc_INTRINSIC __m128i Vc_CONST _mm_min_epu32(__m128i a, __m128i b) {
        return _mm_blendv_epi8(a, b, _mm_cmpgt_epu32(a, b));
    }
    static Vc_INTRINSIC __m128i Vc_CONST _mm_min_epi8 (__m128i a, __m128i b) {
        return _mm_blendv_epi8(a, b, _mm_cmpgt_epi8 (a, b));
    }
    static Vc_INTRINSIC __m128i Vc_CONST _mm_min_epi32(__m128i a, __m128i b) {
        return _mm_blendv_epi8(a, b, _mm_cmpgt_epi32(a, b));
    }
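    // Width-extension fallbacks: zero-extend by interleaving with zeros, sign-extend by
    // interleaving with the all-ones/all-zeros mask from a compare against zero.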
    static Vc_INTRINSIC Vc_CONST __m128i _mm_cvtepu8_epi16(__m128i epu8) {
        return _mm_unpacklo_epi8(epu8, _mm_setzero_si128());
    }
    static Vc_INTRINSIC Vc_CONST __m128i _mm_cvtepi8_epi16(__m128i epi8) {
        return _mm_unpacklo_epi8(epi8, _mm_cmplt_epi8(epi8, _mm_setzero_si128()));
    }
    static Vc_INTRINSIC Vc_CONST __m128i _mm_cvtepu16_epi32(__m128i epu16) {
        return _mm_unpacklo_epi16(epu16, _mm_setzero_si128());
    }
    static Vc_INTRINSIC Vc_CONST __m128i _mm_cvtepi16_epi32(__m128i epi16) {
        return _mm_unpacklo_epi16(epi16, _mm_cmplt_epi16(epi16, _mm_setzero_si128()));
    }
    static Vc_INTRINSIC Vc_CONST __m128i _mm_cvtepu8_epi32(__m128i epu8) {
        return _mm_cvtepu16_epi32(_mm_cvtepu8_epi16(epu8));
    }
    static Vc_INTRINSIC Vc_CONST __m128i _mm_cvtepi8_epi32(__m128i epi8) {
        const __m128i neg = _mm_cmplt_epi8(epi8, _mm_setzero_si128());
        const __m128i epi16 = _mm_unpacklo_epi8(epi8, neg);
        return _mm_unpacklo_epi16(epi16, _mm_unpacklo_epi8(neg, neg));
    }
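    // Without SSE4.1 there is no movntdqa; a normal aligned load is the closest substitute.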
    static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load_si128(__m128i *mem) {
        return _mm_load_si128(mem);
    }

} // namespace SSE
} // namespace Vc
} // namespace AliRoot
#endif

#ifdef VC_IMPL_POPCNT
#include <popcntintrin.h>
#endif

// SSE4.2
#ifdef VC_IMPL_SSE4_2
extern "C" {
#include <nmmintrin.h>
}
#elif defined _NMMINTRIN_H_INCLUDED
#error "SSE4.2 was disabled but something includes <nmmintrin.h>. Please fix your code."
#endif

namespace AliRoot {
namespace Vc
{
namespace SSE
{
    static Vc_INTRINSIC Vc_CONST float extract_float_imm(const __m128 v, const size_t i) {
        float f;
        switch (i) {
        case 0:
            f = _mm_cvtss_f32(v);
            break;
#if defined VC_IMPL_SSE4_1 && !defined VC_MSVC
        default:
#ifdef VC_GCC
            f = __builtin_ia32_vec_ext_v4sf(static_cast<__v4sf>(v), (i));
#else
            // MSVC fails to compile this because it can't optimize i to an immediate
            _MM_EXTRACT_FLOAT(f, v, i);
#endif
            break;
#else
        case 1:
            f = _mm_cvtss_f32(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v), 4)));
            break;
        case 2:
            f = _mm_cvtss_f32(_mm_movehl_ps(v, v));
            break;
        case 3:
            f = _mm_cvtss_f32(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v), 12)));
            break;
#endif
        }
        return f;
    }
    static Vc_INTRINSIC Vc_CONST double extract_double_imm(const __m128d v, const size_t i) {
        if (i == 0) {
            return _mm_cvtsd_f64(v);
        }
        return _mm_cvtsd_f64(_mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(v), _mm_castpd_ps(v))));
    }
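    // extract_float dispatches at compile time where the compiler allows it: a constant
    // index takes the immediate-extract path above, a runtime index falls back to a memory
    // access through an aliasing-safe array view.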
    static Vc_INTRINSIC Vc_CONST float extract_float(const __m128 v, const size_t i) {
#ifdef VC_GCC
        if (__builtin_constant_p(i)) {
            return extract_float_imm(v, i);
//X         if (index <= 1) {
//X             unsigned long long tmp = _mm_cvtsi128_si64(_mm_castps_si128(v));
//X             if (index == 0) tmp &= 0xFFFFFFFFull;
//X             if (index == 1) tmp >>= 32;
//X             return Common::AliasingEntryHelper<EntryType>(tmp);
//X         }
        } else {
            typedef float float4[4] Vc_MAY_ALIAS;
            const float4 &data = reinterpret_cast<const float4 &>(v);
            return data[i];
        }
#else
        union { __m128 v; float m[4]; } u;
        u.v = v;
        return u.m[i];
#endif
    }

    static Vc_INTRINSIC Vc_PURE __m128 _mm_stream_load(const float *mem) {
#ifdef VC_IMPL_SSE4_1
        return _mm_castsi128_ps(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<float *>(mem))));
#else
        return _mm_load_ps(mem);
#endif
    }
    static Vc_INTRINSIC Vc_PURE __m128d _mm_stream_load(const double *mem) {
#ifdef VC_IMPL_SSE4_1
        return _mm_castsi128_pd(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<double *>(mem))));
#else
        return _mm_load_pd(mem);
#endif
    }
    static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const int *mem) {
#ifdef VC_IMPL_SSE4_1
        return _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<int *>(mem)));
#else
        return _mm_load_si128(reinterpret_cast<const __m128i *>(mem));
#endif
    }
    static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned int *mem) {
        return _mm_stream_load(reinterpret_cast<const int *>(mem));
    }
    static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const short *mem) {
        return _mm_stream_load(reinterpret_cast<const int *>(mem));
    }
    static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned short *mem) {
        return _mm_stream_load(reinterpret_cast<const int *>(mem));
    }
    static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const signed char *mem) {
        return _mm_stream_load(reinterpret_cast<const int *>(mem));
    }
    static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned char *mem) {
        return _mm_stream_load(reinterpret_cast<const int *>(mem));
    }
} // namespace SSE
} // namespace Vc
} // namespace AliRoot

// XOP / FMA4
#if defined(VC_IMPL_XOP) || defined(VC_IMPL_FMA4)
extern "C" {
#include <x86intrin.h>
}
#endif

#include "undomacros.h"
#include "shuffle.h"

#endif // SSE_INTRINSICS_H