/*  This file is part of the Vc library.

    Copyright (C) 2009-2012 Matthias Kretz <kretz@kde.org>

    Vc is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as
    published by the Free Software Foundation, either version 3 of
    the License, or (at your option) any later version.

    Vc is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with Vc. If not, see <http://www.gnu.org/licenses/>.

*/
#ifndef SSE_INTRINSICS_H
#define SSE_INTRINSICS_H

#include "../common/windows_fix_intrin.h"
// The GCC xxxintrin.h headers do not make sure that the intrinsics have C linkage. This is not
// really a problem, unless there is another place where the exact same functions are declared.
// Then the linkage must be the same, otherwise it won't compile. Such a case occurs on Windows,
// where the intrin.h header (included indirectly via unistd.h) declares many SSE intrinsics again.
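// Illustration of the conflict (hypothetical signatures): if <intrin.h> has already declared
//   extern "C" __m128 _mm_add_ps(__m128, __m128);
// and a later xxxintrin.h declares the same function with C++ linkage, the compiler rejects the
// redeclaration. Wrapping the includes in extern "C" gives both declarations the same linkage.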
extern "C" {
#include <xmmintrin.h>
#include <emmintrin.h>
}

#include "../common/fix_clang_emmintrin.h"

#if defined(__GNUC__) && !defined(VC_IMPL_SSE2)
#error "SSE Vector class needs at least SSE2"
#endif

#include "const_data.h"
#include <cstdlib>
#include "macros.h"

namespace AliRoot {
namespace Vc {
namespace SSE {
enum VectorAlignmentEnum { VectorAlignment = 16 };

#if defined(VC_GCC) && VC_GCC < 0x40600 && !defined(VC_DONT_FIX_SSE_SHIFT)
// Work around a code-generation problem with the packed shift intrinsics in GCC < 4.6 by
// emitting the instructions directly via inline assembly.
static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi16(__m128i a, __m128i count) { __asm__("psllw %1,%0" : "+x"(a) : "x"(count)); return a; }
static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi32(__m128i a, __m128i count) { __asm__("pslld %1,%0" : "+x"(a) : "x"(count)); return a; }
static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi64(__m128i a, __m128i count) { __asm__("psllq %1,%0" : "+x"(a) : "x"(count)); return a; }
static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi16(__m128i a, __m128i count) { __asm__("psrlw %1,%0" : "+x"(a) : "x"(count)); return a; }
static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi32(__m128i a, __m128i count) { __asm__("psrld %1,%0" : "+x"(a) : "x"(count)); return a; }
static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi64(__m128i a, __m128i count) { __asm__("psrlq %1,%0" : "+x"(a) : "x"(count)); return a; }
#endif
#ifdef VC_GCC
// Redefine the mul/add/sub intrinsics to use GCC-specific operators instead of builtin
// functions. This way the fp-contraction optimization step kicks in and creates FMAs! :)
static Vc_INTRINSIC Vc_CONST __m128d _mm_mul_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) * static_cast<__v2df>(b)); }
static Vc_INTRINSIC Vc_CONST __m128d _mm_add_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) + static_cast<__v2df>(b)); }
static Vc_INTRINSIC Vc_CONST __m128d _mm_sub_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) - static_cast<__v2df>(b)); }
static Vc_INTRINSIC Vc_CONST __m128  _mm_mul_ps(__m128  a, __m128  b) { return static_cast<__m128 >(static_cast<__v4sf>(a) * static_cast<__v4sf>(b)); }
static Vc_INTRINSIC Vc_CONST __m128  _mm_add_ps(__m128  a, __m128  b) { return static_cast<__m128 >(static_cast<__v4sf>(a) + static_cast<__v4sf>(b)); }
static Vc_INTRINSIC Vc_CONST __m128  _mm_sub_ps(__m128  a, __m128  b) { return static_cast<__m128 >(static_cast<__v4sf>(a) - static_cast<__v4sf>(b)); }
#endif
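// Illustration: with the overloads above, an expression like
//   __m128d r = _mm_add_pd(_mm_mul_pd(x, y), z);
// is plain a * b + c arithmetic to GCC, so the fp-contraction pass (subject to the
// -ffp-contract setting and the GCC version) can fuse it into a single FMA
// instruction on targets that have one.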
#if defined(VC_GNU_ASM) && !defined(NVALGRIND)
static Vc_INTRINSIC __m128i Vc_CONST _mm_setallone() { __m128i r; __asm__("pcmpeqb %0,%0":"=x"(r)); return r; }
#else
static Vc_INTRINSIC __m128i Vc_CONST _mm_setallone() { __m128i r = _mm_setzero_si128(); return _mm_cmpeq_epi8(r, r); }
#endif

static Vc_INTRINSIC __m128i Vc_CONST _mm_setallone_si128() { return _mm_setallone(); }
static Vc_INTRINSIC __m128d Vc_CONST _mm_setallone_pd() { return _mm_castsi128_pd(_mm_setallone()); }
static Vc_INTRINSIC __m128  Vc_CONST _mm_setallone_ps() { return _mm_castsi128_ps(_mm_setallone()); }

static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi8 () { return _mm_set1_epi8(1); }
static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu8 () { return _mm_setone_epi8(); }
static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi16() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::one16)); }
static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu16() { return _mm_setone_epi16(); }
static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi32() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::one32)); }
static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu32() { return _mm_setone_epi32(); }

static Vc_INTRINSIC __m128  Vc_CONST _mm_setone_ps() { return _mm_load_ps(c_general::oneFloat); }
static Vc_INTRINSIC __m128d Vc_CONST _mm_setone_pd() { return _mm_load_pd(c_general::oneDouble); }

static Vc_INTRINSIC __m128d Vc_CONST _mm_setabsmask_pd() { return _mm_load_pd(reinterpret_cast<const double *>(c_general::absMaskDouble)); }
static Vc_INTRINSIC __m128  Vc_CONST _mm_setabsmask_ps() { return _mm_load_ps(reinterpret_cast<const float *>(c_general::absMaskFloat)); }
static Vc_INTRINSIC __m128d Vc_CONST _mm_setsignmask_pd(){ return _mm_load_pd(reinterpret_cast<const double *>(c_general::signMaskDouble)); }
static Vc_INTRINSIC __m128  Vc_CONST _mm_setsignmask_ps(){ return _mm_load_ps(reinterpret_cast<const float *>(c_general::signMaskFloat)); }

//X static Vc_INTRINSIC __m128i Vc_CONST _mm_setmin_epi8 () { return _mm_slli_epi8 (_mm_setallone_si128(), 7); }
static Vc_INTRINSIC __m128i Vc_CONST _mm_setmin_epi16() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::minShort)); }
// 0x80000000 is both the float sign mask and INT_MIN, so the sign-mask constant is reused here.
static Vc_INTRINSIC __m128i Vc_CONST _mm_setmin_epi32() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::signMaskFloat)); }
//X static Vc_INTRINSIC __m128i Vc_CONST _mm_cmplt_epu8 (__m128i a, __m128i b) { return _mm_cmplt_epi8 (
//X         _mm_xor_si128(a, _mm_setmin_epi8 ()), _mm_xor_si128(b, _mm_setmin_epi8 ())); }
//X static Vc_INTRINSIC __m128i Vc_CONST _mm_cmpgt_epu8 (__m128i a, __m128i b) { return _mm_cmpgt_epi8 (
//X         _mm_xor_si128(a, _mm_setmin_epi8 ()), _mm_xor_si128(b, _mm_setmin_epi8 ())); }
static Vc_INTRINSIC __m128i Vc_CONST _mm_cmplt_epu16(__m128i a, __m128i b) { return _mm_cmplt_epi16(
        _mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16())); }
static Vc_INTRINSIC __m128i Vc_CONST _mm_cmpgt_epu16(__m128i a, __m128i b) { return _mm_cmpgt_epi16(
        _mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16())); }
static Vc_INTRINSIC __m128i Vc_CONST _mm_cmplt_epu32(__m128i a, __m128i b) { return _mm_cmplt_epi32(
        _mm_xor_si128(a, _mm_setmin_epi32()), _mm_xor_si128(b, _mm_setmin_epi32())); }
static Vc_INTRINSIC __m128i Vc_CONST _mm_cmpgt_epu32(__m128i a, __m128i b) { return _mm_cmpgt_epi32(
        _mm_xor_si128(a, _mm_setmin_epi32()), _mm_xor_si128(b, _mm_setmin_epi32())); }
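// How the unsigned compares above work: SSE2 only has signed compares, but XORing both
// operands with the minimum signed value flips the sign bit and turns an unsigned
// ordering into a signed one. Worked example for epu16 with a = 0xFFFF (65535) and
// b = 0x0001:
//   a ^ 0x8000 = 0x7FFF (+32767),  b ^ 0x8000 = 0x8001 (-32767)
//   cmpgt_epi16(0x7FFF, 0x8001) -> true, matching 65535 > 1 as unsigned.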
} // namespace SSE
} // namespace Vc
} // namespace AliRoot
// SSE3
#ifdef VC_IMPL_SSE3
extern "C" {
#include <pmmintrin.h>
}
#elif defined _PMMINTRIN_H_INCLUDED
#error "SSE3 was disabled but something includes <pmmintrin.h>. Please fix your code."
#endif

// SSSE3
#ifdef VC_IMPL_SSSE3
extern "C" {
#include <tmmintrin.h>
}

namespace AliRoot {
namespace Vc {
namespace SSE {
// not overriding _mm_set1_epi8 because this one should only be used for non-constants
static Vc_INTRINSIC __m128i Vc_CONST set1_epi8(int a) {
#if defined(VC_GCC) && VC_GCC < 0x40500
    return _mm_shuffle_epi8(_mm_cvtsi32_si128(a), _mm_setzero_si128());
#else
    // GCC 4.5 knows about the pshufb improvement
    return _mm_set1_epi8(a);
#endif
}

} // namespace SSE
} // namespace Vc
} // namespace AliRoot
#elif defined _TMMINTRIN_H_INCLUDED
#error "SSSE3 was disabled but something includes <tmmintrin.h>. Please fix your code."
#else

namespace AliRoot {
namespace Vc {
namespace SSE {
static Vc_INTRINSIC __m128i Vc_CONST _mm_abs_epi8 (__m128i a) {
    __m128i negative = _mm_cmplt_epi8 (a, _mm_setzero_si128());
    // no 8-bit shift exists, so the +1 for negative values comes from _mm_setone_epi8() instead
    return _mm_add_epi8 (_mm_xor_si128(a, negative), _mm_and_si128(negative, _mm_setone_epi8()));
}

// positive value:
//   negative == 0
//   a unchanged after xor
//   0 >> 31 -> 0
//   a + 0 -> a
// negative value:
//   negative == -1
//   a xor -1 -> -a - 1
//   -1 >> 31 -> 1
//   -a - 1 + 1 -> -a
static Vc_INTRINSIC __m128i Vc_CONST _mm_abs_epi16(__m128i a) {
    __m128i negative = _mm_cmplt_epi16(a, _mm_setzero_si128());
    return _mm_add_epi16(_mm_xor_si128(a, negative), _mm_srli_epi16(negative, 15));
}
static Vc_INTRINSIC __m128i Vc_CONST _mm_abs_epi32(__m128i a) {
    __m128i negative = _mm_cmplt_epi32(a, _mm_setzero_si128());
    return _mm_add_epi32(_mm_xor_si128(a, negative), _mm_srli_epi32(negative, 31));
}
static Vc_INTRINSIC __m128i Vc_CONST set1_epi8(int a) {
    return _mm_set1_epi8(a);
}
static Vc_INTRINSIC __m128i Vc_CONST _mm_alignr_epi8(__m128i a, __m128i b, const int s) {
    switch (s) {
    case  0: return b;
    case  1: return _mm_or_si128(_mm_slli_si128(a, 15), _mm_srli_si128(b,  1));
    case  2: return _mm_or_si128(_mm_slli_si128(a, 14), _mm_srli_si128(b,  2));
    case  3: return _mm_or_si128(_mm_slli_si128(a, 13), _mm_srli_si128(b,  3));
    case  4: return _mm_or_si128(_mm_slli_si128(a, 12), _mm_srli_si128(b,  4));
    case  5: return _mm_or_si128(_mm_slli_si128(a, 11), _mm_srli_si128(b,  5));
    case  6: return _mm_or_si128(_mm_slli_si128(a, 10), _mm_srli_si128(b,  6));
    case  7: return _mm_or_si128(_mm_slli_si128(a,  9), _mm_srli_si128(b,  7));
    case  8: return _mm_or_si128(_mm_slli_si128(a,  8), _mm_srli_si128(b,  8));
    case  9: return _mm_or_si128(_mm_slli_si128(a,  7), _mm_srli_si128(b,  9));
    case 10: return _mm_or_si128(_mm_slli_si128(a,  6), _mm_srli_si128(b, 10));
    case 11: return _mm_or_si128(_mm_slli_si128(a,  5), _mm_srli_si128(b, 11));
    case 12: return _mm_or_si128(_mm_slli_si128(a,  4), _mm_srli_si128(b, 12));
    case 13: return _mm_or_si128(_mm_slli_si128(a,  3), _mm_srli_si128(b, 13));
    case 14: return _mm_or_si128(_mm_slli_si128(a,  2), _mm_srli_si128(b, 14));
    case 15: return _mm_or_si128(_mm_slli_si128(a,  1), _mm_srli_si128(b, 15));
    case 16: return a;
    case 17: return _mm_srli_si128(a,  1);
    case 18: return _mm_srli_si128(a,  2);
    case 19: return _mm_srli_si128(a,  3);
    case 20: return _mm_srli_si128(a,  4);
    case 21: return _mm_srli_si128(a,  5);
    case 22: return _mm_srli_si128(a,  6);
    case 23: return _mm_srli_si128(a,  7);
    case 24: return _mm_srli_si128(a,  8);
    case 25: return _mm_srli_si128(a,  9);
    case 26: return _mm_srli_si128(a, 10);
    case 27: return _mm_srli_si128(a, 11);
    case 28: return _mm_srli_si128(a, 12);
    case 29: return _mm_srli_si128(a, 13);
    case 30: return _mm_srli_si128(a, 14);
    case 31: return _mm_srli_si128(a, 15);
    }
    return _mm_setzero_si128();
}
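// Example for the _mm_alignr_epi8 fallback above (bytes listed low to high): with
// a = {a0..a15} and b = {b0..b15}, a shift count of 4 concatenates b (low) and a
// (high) into a 32-byte value and keeps bytes 4..19, i.e. {b4..b15, a0..a3}. Counts
// of 32 and above fall through the switch and yield zero, matching the instruction.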
} // namespace SSE
} // namespace Vc
} // namespace AliRoot

#endif
// SSE4.1
#ifdef VC_IMPL_SSE4_1
extern "C" {
#include <smmintrin.h>
}
#else
#ifdef _SMMINTRIN_H_INCLUDED
#error "SSE4.1 was disabled but something includes <smmintrin.h>. Please fix your code."
#endif

namespace AliRoot {
namespace Vc {
namespace SSE {
static Vc_INTRINSIC __m128d _mm_blendv_pd(__m128d a, __m128d b, __m128d c) {
    return _mm_or_pd(_mm_andnot_pd(c, a), _mm_and_pd(c, b));
}
static Vc_INTRINSIC __m128 _mm_blendv_ps(__m128 a, __m128 b, __m128 c) {
    return _mm_or_ps(_mm_andnot_ps(c, a), _mm_and_ps(c, b));
}
static Vc_INTRINSIC __m128i _mm_blendv_epi8(__m128i a, __m128i b, __m128i c) {
    return _mm_or_si128(_mm_andnot_si128(c, a), _mm_and_si128(c, b));
}
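// Mask semantics of the fallbacks above: wherever a bit of c is set, the result takes
// the bit from b, otherwise from a. Note that the real SSE4.1 blendv instructions only
// inspect the top bit of each element, while this bitwise emulation uses every bit, so
// callers must pass masks whose elements are all-zeros or all-ones (as the compare
// intrinsics produce).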
// Only use the following blend functions with immediates as mask and, of course,
// compile with optimization so the switch collapses into a single instruction.
static Vc_INTRINSIC __m128d _mm_blend_pd(__m128d a, __m128d b, const int mask) {
    switch (mask) {
    case 0x0:
        return a;
    case 0x1:
        return _mm_shuffle_pd(b, a, 2);
    case 0x2:
        return _mm_shuffle_pd(a, b, 2);
    case 0x3:
        return b;
    default:
        abort();
        return a; // should never be reached, but MSVC needs it else it warns about 'not all control paths return a value'
    }
}
static Vc_INTRINSIC __m128 _mm_blend_ps(__m128 a, __m128 b, const int mask) {
    __m128i c;
    switch (mask) {
    case 0x0:
        return a;
    case 0x1:
        c = _mm_srli_si128(_mm_setallone_si128(), 12);
        break;
    case 0x2:
        c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 12), 4);
        break;
    case 0x3:
        c = _mm_srli_si128(_mm_setallone_si128(), 8);
        break;
    case 0x4:
        c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 12), 8);
        break;
    case 0x5:
        c = _mm_set_epi32(0, -1, 0, -1);
        break;
    case 0x6:
        c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 8), 4);
        break;
    case 0x7:
        c = _mm_srli_si128(_mm_setallone_si128(), 4);
        break;
    case 0x8:
        c = _mm_slli_si128(_mm_setallone_si128(), 12);
        break;
    case 0x9:
        c = _mm_set_epi32(-1, 0, 0, -1);
        break;
    case 0xa:
        c = _mm_set_epi32(-1, 0, -1, 0);
        break;
    case 0xb:
        c = _mm_set_epi32(-1, 0, -1, -1);
        break;
    case 0xc:
        c = _mm_slli_si128(_mm_setallone_si128(), 8);
        break;
    case 0xd:
        c = _mm_set_epi32(-1, -1, 0, -1);
        break;
    case 0xe:
        c = _mm_slli_si128(_mm_setallone_si128(), 4);
        break;
    case 0xf:
        return b;
    default: // may not happen
        abort();
        c = _mm_setzero_si128();
        break;
    }
    __m128 _c = _mm_castsi128_ps(c);
    return _mm_or_ps(_mm_andnot_ps(_c, a), _mm_and_ps(_c, b));
}
static Vc_INTRINSIC __m128i _mm_blend_epi16(__m128i a, __m128i b, const int mask) {
    __m128i c;
    switch (mask) {
    case 0x00:
        return a;
    case 0x01:
        c = _mm_srli_si128(_mm_setallone_si128(), 14);
        break;
    case 0x03:
        c = _mm_srli_si128(_mm_setallone_si128(), 12);
        break;
    case 0x07:
        c = _mm_srli_si128(_mm_setallone_si128(), 10);
        break;
    case 0x0f:
        return _mm_unpackhi_epi64(_mm_slli_si128(b, 8), a);
    case 0x1f:
        c = _mm_srli_si128(_mm_setallone_si128(), 6);
        break;
    case 0x3f:
        c = _mm_srli_si128(_mm_setallone_si128(), 4);
        break;
    case 0x7f:
        c = _mm_srli_si128(_mm_setallone_si128(), 2);
        break;
    case 0x80:
        c = _mm_slli_si128(_mm_setallone_si128(), 14);
        break;
    case 0xc0:
        c = _mm_slli_si128(_mm_setallone_si128(), 12);
        break;
    case 0xe0:
        c = _mm_slli_si128(_mm_setallone_si128(), 10);
        break;
    case 0xf0:
        c = _mm_slli_si128(_mm_setallone_si128(), 8);
        break;
    case 0xf8:
        c = _mm_slli_si128(_mm_setallone_si128(), 6);
        break;
    case 0xfc:
        c = _mm_slli_si128(_mm_setallone_si128(), 4);
        break;
    case 0xfe:
        c = _mm_slli_si128(_mm_setallone_si128(), 2);
        break;
    case 0xff:
        return b;
    case 0xcc:
        return _mm_unpacklo_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 1, 3, 1)));
    case 0x33:
        return _mm_unpacklo_epi32(_mm_shuffle_epi32(b, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 3, 1)));
    default:
        const __m128i shift = _mm_set_epi16(0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000, -0x7fff);
        c = _mm_srai_epi16(_mm_mullo_epi16(_mm_set1_epi16(mask), shift), 15);
        break;
    }
    return _mm_or_si128(_mm_andnot_si128(c, a), _mm_and_si128(c, b));
}
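// The default case in _mm_blend_epi16 above expands the 8-bit immediate into full
// 16-bit masks without needing SSSE3: word i is multiplied by 1 << (15 - i), which
// moves mask bit i into the sign bit of word i (word 0 uses -0x7fff == 0x8001,
// because 0x8000 does not fit a signed short; the extra low bit only touches bits
// 0-7 and leaves the sign bit alone). _mm_srai_epi16(..., 15) then smears each sign
// bit across its word, giving 0xFFFF where the mask bit was set and 0x0000 elsewhere.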
static Vc_INTRINSIC __m128i Vc_CONST _mm_max_epi8 (__m128i a, __m128i b) {
    return _mm_blendv_epi8(b, a, _mm_cmpgt_epi8 (a, b));
}
static Vc_INTRINSIC __m128i Vc_CONST _mm_max_epi32(__m128i a, __m128i b) {
    return _mm_blendv_epi8(b, a, _mm_cmpgt_epi32(a, b));
}
//X static Vc_INTRINSIC __m128i Vc_CONST _mm_max_epu8 (__m128i a, __m128i b) {
//X     return _mm_blendv_epi8(b, a, _mm_cmpgt_epu8 (a, b));
//X }
static Vc_INTRINSIC __m128i Vc_CONST _mm_max_epu16(__m128i a, __m128i b) {
    return _mm_blendv_epi8(b, a, _mm_cmpgt_epu16(a, b));
}
static Vc_INTRINSIC __m128i Vc_CONST _mm_max_epu32(__m128i a, __m128i b) {
    return _mm_blendv_epi8(b, a, _mm_cmpgt_epu32(a, b));
}
//X static Vc_INTRINSIC __m128i Vc_CONST _mm_min_epu8 (__m128i a, __m128i b) {
//X     return _mm_blendv_epi8(a, b, _mm_cmpgt_epu8 (a, b));
//X }
static Vc_INTRINSIC __m128i Vc_CONST _mm_min_epu16(__m128i a, __m128i b) {
    return _mm_blendv_epi8(a, b, _mm_cmpgt_epu16(a, b));
}
static Vc_INTRINSIC __m128i Vc_CONST _mm_min_epu32(__m128i a, __m128i b) {
    return _mm_blendv_epi8(a, b, _mm_cmpgt_epu32(a, b));
}
static Vc_INTRINSIC __m128i Vc_CONST _mm_min_epi8 (__m128i a, __m128i b) {
    return _mm_blendv_epi8(a, b, _mm_cmpgt_epi8 (a, b));
}
static Vc_INTRINSIC __m128i Vc_CONST _mm_min_epi32(__m128i a, __m128i b) {
    return _mm_blendv_epi8(a, b, _mm_cmpgt_epi32(a, b));
}
static Vc_INTRINSIC Vc_CONST __m128i _mm_cvtepu8_epi16(__m128i epu8) {
    return _mm_unpacklo_epi8(epu8, _mm_setzero_si128());
}
static Vc_INTRINSIC Vc_CONST __m128i _mm_cvtepi8_epi16(__m128i epi8) {
    return _mm_unpacklo_epi8(epi8, _mm_cmplt_epi8(epi8, _mm_setzero_si128()));
}
static Vc_INTRINSIC Vc_CONST __m128i _mm_cvtepu16_epi32(__m128i epu16) {
    return _mm_unpacklo_epi16(epu16, _mm_setzero_si128());
}
static Vc_INTRINSIC Vc_CONST __m128i _mm_cvtepi16_epi32(__m128i epu16) {
    return _mm_unpacklo_epi16(epu16, _mm_cmplt_epi16(epu16, _mm_setzero_si128()));
}
static Vc_INTRINSIC Vc_CONST __m128i _mm_cvtepu8_epi32(__m128i epu8) {
    return _mm_cvtepu16_epi32(_mm_cvtepu8_epi16(epu8));
}
static Vc_INTRINSIC Vc_CONST __m128i _mm_cvtepi8_epi32(__m128i epi8) {
    const __m128i neg = _mm_cmplt_epi8(epi8, _mm_setzero_si128());
    const __m128i epi16 = _mm_unpacklo_epi8(epi8, neg);
    return _mm_unpacklo_epi16(epi16, _mm_unpacklo_epi8(neg, neg));
}
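// Worked example for the sign extensions above: interleaving a value with the mask
// from cmplt(x, 0) appends either 0x00 or 0xFF as the high byte of each element.
// For epi8 -> epi16 with x = 0xFE (-2): neg = 0xFF, and unpacklo produces the bytes
// FE FF, i.e. 0xFFFE == -2 as a 16-bit value; for x = 0x02, neg = 0x00 and the
// result is 0x0002.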
static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load_si128(__m128i *mem) {
    return _mm_load_si128(mem);
}

} // namespace SSE
} // namespace Vc
} // namespace AliRoot

#endif
#ifdef VC_IMPL_POPCNT
#include <popcntintrin.h>
#endif

// SSE4.2
#ifdef VC_IMPL_SSE4_2
extern "C" {
#include <nmmintrin.h>
}
#elif defined _NMMINTRIN_H_INCLUDED
#error "SSE4.2 was disabled but something includes <nmmintrin.h>. Please fix your code."
#endif

namespace AliRoot {
namespace Vc {
namespace SSE {
static Vc_INTRINSIC Vc_CONST float extract_float_imm(const __m128 v, const size_t i) {
    float f;
    switch (i) {
    case 0:
        f = _mm_cvtss_f32(v);
        break;
#if defined VC_IMPL_SSE4_1 && !defined VC_MSVC
    default:
#ifdef VC_GCC
        f = __builtin_ia32_vec_ext_v4sf(static_cast<__v4sf>(v), (i));
#else
        // MSVC fails to compile this because it can't optimize i to an immediate
        _MM_EXTRACT_FLOAT(f, v, i);
#endif
        break;
#else
    case 1:
        f = _mm_cvtss_f32(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v), 4)));
        break;
    case 2:
        f = _mm_cvtss_f32(_mm_movehl_ps(v, v));
        break;
    case 3:
        f = _mm_cvtss_f32(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v), 12)));
        break;
#endif
    }
    return f;
}
static Vc_INTRINSIC Vc_CONST double extract_double_imm(const __m128d v, const size_t i) {
    if (i == 0) {
        return _mm_cvtsd_f64(v);
    }
    return _mm_cvtsd_f64(_mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(v), _mm_castpd_ps(v))));
}
static Vc_INTRINSIC Vc_CONST float extract_float(const __m128 v, const size_t i) {
#ifdef VC_GCC
    if (__builtin_constant_p(i)) {
        return extract_float_imm(v, i);
//X     if (index <= 1) {
//X         unsigned long long tmp = _mm_cvtsi128_si64(_mm_castps_si128(v));
//X         if (index == 0) tmp &= 0xFFFFFFFFull;
//X         if (index == 1) tmp >>= 32;
//X         return Common::AliasingEntryHelper<EntryType>(tmp);
//X     }
    } else {
        typedef float float4[4] Vc_MAY_ALIAS;
        const float4 &data = reinterpret_cast<const float4 &>(v);
        return data[i];
    }
#else
    union { __m128 v; float m[4]; } u;
    u.v = v;
    return u.m[i];
#endif
}
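// Design note: the GCC branch above uses a may_alias typedef so the reinterpret_cast
// does not violate strict aliasing, while the fallback relies on union type punning,
// which GCC explicitly documents as well-defined and MSVC handles the same way in
// practice. Example (hypothetical values): extract_float(_mm_set_ps(3, 2, 1, 0), 2)
// yields 2.0f, since _mm_set_ps lists elements from index 3 down to index 0.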
static Vc_INTRINSIC Vc_PURE __m128 _mm_stream_load(const float *mem) {
#ifdef VC_IMPL_SSE4_1
    return _mm_castsi128_ps(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<float *>(mem))));
#else
    return _mm_load_ps(mem);
#endif
}
static Vc_INTRINSIC Vc_PURE __m128d _mm_stream_load(const double *mem) {
#ifdef VC_IMPL_SSE4_1
    return _mm_castsi128_pd(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<double *>(mem))));
#else
    return _mm_load_pd(mem);
#endif
}
static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const int *mem) {
#ifdef VC_IMPL_SSE4_1
    return _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<int *>(mem)));
#else
    return _mm_load_si128(reinterpret_cast<const __m128i *>(mem));
#endif
}
static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned int *mem) {
    return _mm_stream_load(reinterpret_cast<const int *>(mem));
}
static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const short *mem) {
    return _mm_stream_load(reinterpret_cast<const int *>(mem));
}
static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned short *mem) {
    return _mm_stream_load(reinterpret_cast<const int *>(mem));
}
static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const signed char *mem) {
    return _mm_stream_load(reinterpret_cast<const int *>(mem));
}
static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned char *mem) {
    return _mm_stream_load(reinterpret_cast<const int *>(mem));
}
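// Usage note: with SSE4.1 the overloads above compile to movntdqa, a non-temporal
// load that can bypass the cache; it only pays off on write-combining memory (e.g.
// mapped video memory) and behaves like a normal aligned 16-byte load on ordinary
// write-back memory. All overloads require mem to be 16-byte aligned.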
} // namespace SSE
} // namespace Vc
} // namespace AliRoot
// XOP / FMA4
#if defined(VC_IMPL_XOP) || defined(VC_IMPL_FMA4)
extern "C" {
#include <x86intrin.h>
}
#endif

#include "undomacros.h"

#endif // SSE_INTRINSICS_H