/*  This file is part of the Vc library.

    Copyright (C) 2009-2012 Matthias Kretz <kretz@kde.org>

    Vc is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as
    published by the Free Software Foundation, either version 3 of
    the License, or (at your option) any later version.

    Vc is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with Vc. If not, see <http://www.gnu.org/licenses/>.

*/

#ifndef SSE_INTRINSICS_H
#define SSE_INTRINSICS_H

#include "../common/windows_fix_intrin.h"

// MMX
#include <mmintrin.h>
// SSE
#include <xmmintrin.h>
// SSE2
#include <emmintrin.h>

#if defined(__GNUC__) && !defined(VC_IMPL_SSE2)
#error "SSE Vector class needs at least SSE2"
#endif

#include "const_data.h"
#include "macros.h"
#include <cstdlib>

#ifdef __3dNOW__
#include <mm3dnow.h>
#endif

namespace Vc
{
namespace SSE
{
    enum VectorAlignmentEnum { VectorAlignment = 16 };

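    // Workaround, apparently for code-generation problems with the variable-count vector shift
    // intrinsics in GCC releases before 4.6: the intrinsics are shadowed inside namespace
    // Vc::SSE by inline-assembly wrappers so that the corresponding shift instructions are
    // emitted directly. Define VC_DONT_FIX_SSE_SHIFT to keep the compiler's own intrinsics.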
#if defined(VC_GCC) && VC_GCC < 0x40600 && !defined(VC_DONT_FIX_SSE_SHIFT)
    static inline __m128i CONST _mm_sll_epi16(__m128i a, __m128i count) { __asm__("psllw %1,%0" : "+x"(a) : "x"(count)); return a; }
    static inline __m128i CONST _mm_sll_epi32(__m128i a, __m128i count) { __asm__("pslld %1,%0" : "+x"(a) : "x"(count)); return a; }
    static inline __m128i CONST _mm_sll_epi64(__m128i a, __m128i count) { __asm__("psllq %1,%0" : "+x"(a) : "x"(count)); return a; }
    static inline __m128i CONST _mm_srl_epi16(__m128i a, __m128i count) { __asm__("psrlw %1,%0" : "+x"(a) : "x"(count)); return a; }
    static inline __m128i CONST _mm_srl_epi32(__m128i a, __m128i count) { __asm__("psrld %1,%0" : "+x"(a) : "x"(count)); return a; }
    static inline __m128i CONST _mm_srl_epi64(__m128i a, __m128i count) { __asm__("psrlq %1,%0" : "+x"(a) : "x"(count)); return a; }
#endif

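    // _mm_setallone returns a vector with every bit set. Depending on the build configuration it
    // is produced either by the pcmpeqb-register-with-itself idiom in inline assembly or via
    // _mm_cmpeq_epi8 on a zero-initialised register; both yield 0xff in every byte. The choice is
    // additionally tied to the NVALGRIND macro because the assembly variant reads a register that
    // is never written, which is presumably relevant for Valgrind's uninitialised-value tracking.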
#if defined(VC_GNU_ASM) && !defined(NVALGRIND)
    static inline __m128i CONST _mm_setallone() { __m128i r; __asm__("pcmpeqb %0,%0":"=x"(r)); return r; }
#else
    static inline __m128i CONST _mm_setallone() { __m128i r = _mm_setzero_si128(); return _mm_cmpeq_epi8(r, r); }
#endif
    static inline __m128i CONST _mm_setallone_si128() { return _mm_setallone(); }
    static inline __m128d CONST _mm_setallone_pd() { return _mm_castsi128_pd(_mm_setallone()); }
    static inline __m128  CONST _mm_setallone_ps() { return _mm_castsi128_ps(_mm_setallone()); }

    static inline __m128i CONST _mm_setone_epi8 () { return _mm_set1_epi8(1); }
    static inline __m128i CONST _mm_setone_epu8 () { return _mm_setone_epi8(); }
    static inline __m128i CONST _mm_setone_epi16() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::one16)); }
    static inline __m128i CONST _mm_setone_epu16() { return _mm_setone_epi16(); }
    static inline __m128i CONST _mm_setone_epi32() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::one32)); }
    static inline __m128i CONST _mm_setone_epu32() { return _mm_setone_epi32(); }

    static inline __m128  CONST _mm_setone_ps() { return _mm_load_ps(c_general::oneFloat); }
    static inline __m128d CONST _mm_setone_pd() { return _mm_load_pd(c_general::oneDouble); }

    static inline __m128d CONST _mm_setabsmask_pd() { return _mm_load_pd(reinterpret_cast<const double *>(c_general::absMaskDouble)); }
    static inline __m128  CONST _mm_setabsmask_ps() { return _mm_load_ps(reinterpret_cast<const float *>(c_general::absMaskFloat)); }
    static inline __m128d CONST _mm_setsignmask_pd(){ return _mm_load_pd(reinterpret_cast<const double *>(c_general::signMaskDouble)); }
    static inline __m128  CONST _mm_setsignmask_ps(){ return _mm_load_ps(reinterpret_cast<const float *>(c_general::signMaskFloat)); }

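    // SSE has no unsigned integer comparisons, so they are emulated with the usual bias trick:
    // XOR both operands with the minimum signed value (_mm_setmin_epi16/_mm_setmin_epi32), which
    // maps the unsigned range onto the signed range while preserving the ordering, then compare
    // signed. Illustrative example (assuming SSE2):
    //     __m128i a = _mm_set1_epi16(-0x8000);   // bit pattern 0x8000, i.e. 32768 as unsigned
    //     __m128i b = _mm_set1_epi16(1);
    //     _mm_cmpgt_epu16(a, b);                 // all ones:  32768 > 1 as unsigned
    //     _mm_cmpgt_epi16(a, b);                 // all zeros: -32768 < 1 as signed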
//X     static inline __m128i CONST _mm_setmin_epi8 () { return _mm_slli_epi8 (_mm_setallone_si128(), 7); }
    static inline __m128i CONST _mm_setmin_epi16() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::minShort)); }
    static inline __m128i CONST _mm_setmin_epi32() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::signMaskFloat)); }

//X     static inline __m128i CONST _mm_cmplt_epu8 (__m128i a, __m128i b) { return _mm_cmplt_epi8 (
//X             _mm_xor_si128(a, _mm_setmin_epi8 ()), _mm_xor_si128(b, _mm_setmin_epi8 ())); }
//X     static inline __m128i CONST _mm_cmpgt_epu8 (__m128i a, __m128i b) { return _mm_cmpgt_epi8 (
//X             _mm_xor_si128(a, _mm_setmin_epi8 ()), _mm_xor_si128(b, _mm_setmin_epi8 ())); }
    static inline __m128i CONST _mm_cmplt_epu16(__m128i a, __m128i b) { return _mm_cmplt_epi16(
            _mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16())); }
    static inline __m128i CONST _mm_cmpgt_epu16(__m128i a, __m128i b) { return _mm_cmpgt_epi16(
            _mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16())); }
    static inline __m128i CONST _mm_cmplt_epu32(__m128i a, __m128i b) { return _mm_cmplt_epi32(
            _mm_xor_si128(a, _mm_setmin_epi32()), _mm_xor_si128(b, _mm_setmin_epi32())); }
    static inline __m128i CONST _mm_cmpgt_epu32(__m128i a, __m128i b) { return _mm_cmpgt_epi32(
            _mm_xor_si128(a, _mm_setmin_epi32()), _mm_xor_si128(b, _mm_setmin_epi32())); }
} // namespace SSE
} // namespace Vc

// SSE3
#ifdef VC_IMPL_SSE3
#include <pmmintrin.h>
#elif defined _PMMINTRIN_H_INCLUDED
#error "SSE3 was disabled but something includes <pmmintrin.h>. Please fix your code."
#endif
// SSSE3
#ifdef VC_IMPL_SSSE3
#include <tmmintrin.h>
namespace Vc
{
namespace SSE
{

    // not overriding _mm_set1_epi8 because this one should only be used for non-constants
    static inline __m128i CONST set1_epi8(int a) {
#if defined(VC_GCC) && VC_GCC < 0x40500
        return _mm_shuffle_epi8(_mm_cvtsi32_si128(a), _mm_setzero_si128());
#else
        // GCC 4.5 knows about the pshufb improvement
        return _mm_set1_epi8(a);
#endif
    }
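    // Usage note: set1_epi8 is meant for run-time values, e.g. set1_epi8(some_runtime_int)
    // (hypothetical variable); for a literal such as _mm_set1_epi8(1) the plain intrinsic is
    // fine, because the compiler can materialise the constant directly.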

} // namespace SSE
} // namespace Vc
#elif defined _TMMINTRIN_H_INCLUDED
#error "SSSE3 was disabled but something includes <tmmintrin.h>. Please fix your code."
#else
namespace Vc
{
namespace SSE
{
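    // SSSE3 is not available here, so the following provides SSE2 implementations of the SSSE3
    // intrinsics used below: _mm_abs_epi8/16/32, set1_epi8 and _mm_alignr_epi8.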
    static inline __m128i CONST _mm_abs_epi8 (__m128i a) {
        __m128i negative = _mm_cmplt_epi8 (a, _mm_setzero_si128());
        return _mm_add_epi8 (_mm_xor_si128(a, negative), _mm_and_si128(negative, _mm_setone_epi8()));
    }
    // positive value:
    //     negative == 0
    //     a unchanged after xor
    //     0 >> 31 -> 0
    //     a + 0 -> a
    // negative value:
    //     negative == -1
    //     a xor -1 -> -a - 1
    //     -1 >> 31 -> 1
    //     -a - 1 + 1 -> -a
    // (The 8-bit variant masks with _mm_setone_epi8() instead of shifting because SSE has no
    // shift instruction with 8-bit granularity; the effect is the same +1 for negative lanes.)
    static inline __m128i CONST _mm_abs_epi16(__m128i a) {
        __m128i negative = _mm_cmplt_epi16(a, _mm_setzero_si128());
        return _mm_add_epi16(_mm_xor_si128(a, negative), _mm_srli_epi16(negative, 15));
    }
    static inline __m128i CONST _mm_abs_epi32(__m128i a) {
        __m128i negative = _mm_cmplt_epi32(a, _mm_setzero_si128());
        return _mm_add_epi32(_mm_xor_si128(a, negative), _mm_srli_epi32(negative, 31));
    }
    static inline __m128i CONST set1_epi8(int a) {
        return _mm_set1_epi8(a);
    }
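
    // Emulates SSSE3 palignr: conceptually concatenate a (high part) and b (low part) into a
    // 32-byte value, shift it right by s bytes and return the low 16 bytes, built here from two
    // whole-register byte shifts and an OR. Intended for a compile-time-constant s, so that with
    // optimization enabled the switch collapses to the two shifts and the OR.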
    static inline __m128i CONST _mm_alignr_epi8(__m128i a, __m128i b, const int s) {
        switch (s) {
            case  0: return b;
            case  1: return _mm_or_si128(_mm_slli_si128(a, 15), _mm_srli_si128(b,  1));
            case  2: return _mm_or_si128(_mm_slli_si128(a, 14), _mm_srli_si128(b,  2));
            case  3: return _mm_or_si128(_mm_slli_si128(a, 13), _mm_srli_si128(b,  3));
            case  4: return _mm_or_si128(_mm_slli_si128(a, 12), _mm_srli_si128(b,  4));
            case  5: return _mm_or_si128(_mm_slli_si128(a, 11), _mm_srli_si128(b,  5));
            case  6: return _mm_or_si128(_mm_slli_si128(a, 10), _mm_srli_si128(b,  6));
            case  7: return _mm_or_si128(_mm_slli_si128(a,  9), _mm_srli_si128(b,  7));
            case  8: return _mm_or_si128(_mm_slli_si128(a,  8), _mm_srli_si128(b,  8));
            case  9: return _mm_or_si128(_mm_slli_si128(a,  7), _mm_srli_si128(b,  9));
            case 10: return _mm_or_si128(_mm_slli_si128(a,  6), _mm_srli_si128(b, 10));
            case 11: return _mm_or_si128(_mm_slli_si128(a,  5), _mm_srli_si128(b, 11));
            case 12: return _mm_or_si128(_mm_slli_si128(a,  4), _mm_srli_si128(b, 12));
            case 13: return _mm_or_si128(_mm_slli_si128(a,  3), _mm_srli_si128(b, 13));
            case 14: return _mm_or_si128(_mm_slli_si128(a,  2), _mm_srli_si128(b, 14));
            case 15: return _mm_or_si128(_mm_slli_si128(a,  1), _mm_srli_si128(b, 15));
            case 16: return a;
            case 17: return _mm_srli_si128(a,  1);
            case 18: return _mm_srli_si128(a,  2);
            case 19: return _mm_srli_si128(a,  3);
            case 20: return _mm_srli_si128(a,  4);
            case 21: return _mm_srli_si128(a,  5);
            case 22: return _mm_srli_si128(a,  6);
            case 23: return _mm_srli_si128(a,  7);
            case 24: return _mm_srli_si128(a,  8);
            case 25: return _mm_srli_si128(a,  9);
            case 26: return _mm_srli_si128(a, 10);
            case 27: return _mm_srli_si128(a, 11);
            case 28: return _mm_srli_si128(a, 12);
            case 29: return _mm_srli_si128(a, 13);
            case 30: return _mm_srli_si128(a, 14);
            case 31: return _mm_srli_si128(a, 15);
        }
        return _mm_setzero_si128();
    }
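    // Example: _mm_alignr_epi8(a, b, 4) returns bytes 4..15 of b in its low 12 bytes followed by
    // bytes 0..3 of a in its high 4 bytes, matching the semantics of the palignr instruction.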

} // namespace SSE
} // namespace Vc

#endif

// SSE4.1
#ifdef VC_IMPL_SSE4_1
#include <smmintrin.h>
#else
#ifdef _SMMINTRIN_H_INCLUDED
#error "SSE4.1 was disabled but something includes <smmintrin.h>. Please fix your code."
#endif
namespace Vc
{
namespace SSE
{
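    // Fallbacks for the SSE4.1 blend intrinsics. The blendv emulations below select with a full
    // bitwise mask (andnot/and/or), so each mask element must be all zeros or all ones; the real
    // SSE4.1 blendv instructions inspect only the most significant bit of each element.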
    static inline __m128d INTRINSIC _mm_blendv_pd(__m128d a, __m128d b, __m128d c) {
        return _mm_or_pd(_mm_andnot_pd(c, a), _mm_and_pd(c, b));
    }
    static inline __m128 INTRINSIC _mm_blendv_ps(__m128 a, __m128 b, __m128 c) {
        return _mm_or_ps(_mm_andnot_ps(c, a), _mm_and_ps(c, b));
    }
    static inline __m128i INTRINSIC _mm_blendv_epi8(__m128i a, __m128i b, __m128i c) {
        return _mm_or_si128(_mm_andnot_si128(c, a), _mm_and_si128(c, b));
    }

    // Only use the following blend functions with a compile-time-constant mask and, of course,
    // with optimization enabled, so that the switch collapses to the intended instruction
    // sequence.
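    // Example: _mm_blend_ps(a, b, 0x5) takes elements 0 and 2 from b and elements 1 and 3 from a
    // (a set mask bit i selects element i of b).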
    static inline __m128d INTRINSIC _mm_blend_pd(__m128d a, __m128d b, const int mask) {
        switch (mask) {
        case 0x0:
            return a;
        case 0x1:
            return _mm_shuffle_pd(b, a, 2);
        case 0x2:
            return _mm_shuffle_pd(a, b, 2);
        case 0x3:
            return b;
        default:
            abort();
        }
    }
    static inline __m128 INTRINSIC _mm_blend_ps(__m128 a, __m128 b, const int mask) {
        __m128i c;
        switch (mask) {
        case 0x0:
            return a;
        case 0x1:
            c = _mm_srli_si128(_mm_setallone_si128(), 12);
            break;
        case 0x2:
            c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 12), 4);
            break;
        case 0x3:
            c = _mm_srli_si128(_mm_setallone_si128(), 8);
            break;
        case 0x4:
            c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 12), 8);
            break;
        case 0x5:
            c = _mm_set_epi32(0, -1, 0, -1);
            break;
        case 0x6:
            c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 8), 4);
            break;
        case 0x7:
            c = _mm_srli_si128(_mm_setallone_si128(), 4);
            break;
        case 0x8:
            c = _mm_slli_si128(_mm_setallone_si128(), 12);
            break;
        case 0x9:
            c = _mm_set_epi32(-1, 0, 0, -1);
            break;
        case 0xa:
            c = _mm_set_epi32(-1, 0, -1, 0);
            break;
        case 0xb:
            c = _mm_set_epi32(-1, 0, -1, -1);
            break;
        case 0xc:
            c = _mm_slli_si128(_mm_setallone_si128(), 8);
            break;
        case 0xd:
            c = _mm_set_epi32(-1, -1, 0, -1);
            break;
        case 0xe:
            c = _mm_slli_si128(_mm_setallone_si128(), 4);
            break;
        case 0xf:
            return b;
        default: // may not happen
            abort();
            c = _mm_setzero_si128();
            break;
        }
        __m128 _c = _mm_castsi128_ps(c);
        return _mm_or_ps(_mm_andnot_ps(_c, a), _mm_and_ps(_c, b));
    }
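    // _mm_blend_epi16 fallback: masks selecting a contiguous run of lanes are built with
    // whole-register byte shifts, a few other patterns have dedicated cases, and the default
    // case broadcasts each mask bit to its 16-bit lane by multiplying with a power of two
    // (moving the bit into the sign position) and shifting arithmetically right by 15.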
    static inline __m128i INTRINSIC _mm_blend_epi16(__m128i a, __m128i b, const int mask) {
        __m128i c;
        switch (mask) {
        case 0x00:
            return a;
        case 0x01:
            c = _mm_srli_si128(_mm_setallone_si128(), 14);
            break;
        case 0x03:
            c = _mm_srli_si128(_mm_setallone_si128(), 12);
            break;
        case 0x07:
            c = _mm_srli_si128(_mm_setallone_si128(), 10);
            break;
        case 0x0f:
            return _mm_unpackhi_epi64(_mm_slli_si128(b, 8), a);
        case 0x1f:
            c = _mm_srli_si128(_mm_setallone_si128(), 6);
            break;
        case 0x3f:
            c = _mm_srli_si128(_mm_setallone_si128(), 4);
            break;
        case 0x7f:
            c = _mm_srli_si128(_mm_setallone_si128(), 2);
            break;
        case 0x80:
            c = _mm_slli_si128(_mm_setallone_si128(), 14);
            break;
        case 0xc0:
            c = _mm_slli_si128(_mm_setallone_si128(), 12);
            break;
        case 0xe0:
            c = _mm_slli_si128(_mm_setallone_si128(), 10);
            break;
        case 0xf0:
            c = _mm_slli_si128(_mm_setallone_si128(), 8);
            break;
        case 0xf8:
            c = _mm_slli_si128(_mm_setallone_si128(), 6);
            break;
        case 0xfc:
            c = _mm_slli_si128(_mm_setallone_si128(), 4);
            break;
        case 0xfe:
            c = _mm_slli_si128(_mm_setallone_si128(), 2);
            break;
        case 0xff:
            return b;
        case 0xcc:
            return _mm_unpacklo_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 1, 3, 1)));
        case 0x33:
            return _mm_unpacklo_epi32(_mm_shuffle_epi32(b, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 3, 1)));
        default:
            const __m128i shift = _mm_set_epi16(0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000, -0x7fff);
            c = _mm_srai_epi16(_mm_mullo_epi16(_mm_set1_epi16(mask), shift), 15);
            break;
        }
        return _mm_or_si128(_mm_andnot_si128(c, a), _mm_and_si128(c, b));
    }
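
    // min/max fallbacks for the element types that SSE2 does not cover natively, built from a
    // compare plus blendv; the unsigned variants use the biased unsigned compares defined above.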
    static inline __m128i CONST _mm_max_epi8 (__m128i a, __m128i b) {
        return _mm_blendv_epi8(b, a, _mm_cmpgt_epi8 (a, b));
    }
    static inline __m128i CONST _mm_max_epi32(__m128i a, __m128i b) {
        return _mm_blendv_epi8(b, a, _mm_cmpgt_epi32(a, b));
    }
//X     static inline __m128i CONST _mm_max_epu8 (__m128i a, __m128i b) {
//X         return _mm_blendv_epi8(b, a, _mm_cmpgt_epu8 (a, b));
//X     }
    static inline __m128i CONST _mm_max_epu16(__m128i a, __m128i b) {
        return _mm_blendv_epi8(b, a, _mm_cmpgt_epu16(a, b));
    }
    static inline __m128i CONST _mm_max_epu32(__m128i a, __m128i b) {
        return _mm_blendv_epi8(b, a, _mm_cmpgt_epu32(a, b));
    }
//X     static inline __m128i CONST _mm_min_epu8 (__m128i a, __m128i b) {
//X         return _mm_blendv_epi8(a, b, _mm_cmpgt_epu8 (a, b));
//X     }
    static inline __m128i CONST _mm_min_epu16(__m128i a, __m128i b) {
        return _mm_blendv_epi8(a, b, _mm_cmpgt_epu16(a, b));
    }
    static inline __m128i CONST _mm_min_epu32(__m128i a, __m128i b) {
        return _mm_blendv_epi8(a, b, _mm_cmpgt_epu32(a, b));
    }
    static inline __m128i CONST _mm_min_epi8 (__m128i a, __m128i b) {
        return _mm_blendv_epi8(a, b, _mm_cmpgt_epi8 (a, b));
    }
    static inline __m128i CONST _mm_min_epi32(__m128i a, __m128i b) {
        return _mm_blendv_epi8(a, b, _mm_cmpgt_epi32(a, b));
    }
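    // Fallbacks for the SSE4.1 sign-/zero-extension intrinsics: unpack the low half of the
    // register against zero (zero extension) or against the 'value < 0' mask (sign extension),
    // which replicates the sign bits into the widened lanes.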
    static inline __m128i INTRINSIC _mm_cvtepu8_epi16(__m128i epu8) {
        return _mm_unpacklo_epi8(epu8, _mm_setzero_si128());
    }
    static inline __m128i INTRINSIC _mm_cvtepi8_epi16(__m128i epi8) {
        return _mm_unpacklo_epi8(epi8, _mm_cmplt_epi8(epi8, _mm_setzero_si128()));
    }
    static inline __m128i INTRINSIC _mm_cvtepu16_epi32(__m128i epu16) {
        return _mm_unpacklo_epi16(epu16, _mm_setzero_si128());
    }
    static inline __m128i INTRINSIC _mm_cvtepi16_epi32(__m128i epu16) {
        return _mm_unpacklo_epi16(epu16, _mm_cmplt_epi16(epu16, _mm_setzero_si128()));
    }
    static inline __m128i INTRINSIC _mm_cvtepu8_epi32(__m128i epu8) {
        return _mm_cvtepu16_epi32(_mm_cvtepu8_epi16(epu8));
    }
    static inline __m128i INTRINSIC _mm_cvtepi8_epi32(__m128i epi8) {
        const __m128i neg = _mm_cmplt_epi8(epi8, _mm_setzero_si128());
        const __m128i epi16 = _mm_unpacklo_epi8(epi8, neg);
        return _mm_unpacklo_epi16(epi16, _mm_unpacklo_epi8(neg, neg));
    }
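    // Streaming load fallback: without SSE4.1 there is no movntdqa, so use an ordinary aligned
    // load instead.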
    static inline __m128i INTRINSIC _mm_stream_load_si128(__m128i *mem) {
        return _mm_load_si128(mem);
    }

} // namespace SSE
} // namespace Vc
#endif

// SSE4.2
#ifdef VC_IMPL_SSE4_2
#include <nmmintrin.h>
#elif defined _NMMINTRIN_H_INCLUDED
#error "SSE4.2 was disabled but something includes <nmmintrin.h>. Please fix your code."
#endif

namespace Vc
{
namespace SSE
{
    static inline float INTRINSIC extract_float_imm(const __m128 v, const size_t i) {
        float f;
        switch (i) {
        case 0:
            f = _mm_cvtss_f32(v);
            break;
#if defined VC_IMPL_SSE4_1 && !defined VC_MSVC
        default:
#ifdef VC_GCC
            f = __builtin_ia32_vec_ext_v4sf(static_cast<__v4sf>(v), (i));
#else
            // MSVC fails to compile this because it can't optimize i to an immediate
            _MM_EXTRACT_FLOAT(f, v, i);
#endif
            break;
#else
        case 1:
            f = _mm_cvtss_f32(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v), 4)));
            break;
        case 2:
            f = _mm_cvtss_f32(_mm_movehl_ps(v, v));
            break;
        case 3:
            f = _mm_cvtss_f32(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v), 12)));
            break;
#endif
        }
        return f;
    }
    static inline double INTRINSIC extract_double_imm(const __m128d v, const size_t i) {
        if (i == 0) {
            return _mm_cvtsd_f64(v);
        }
        return _mm_cvtsd_f64(_mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(v), _mm_castpd_ps(v))));
    }
    static inline float INTRINSIC extract_float(const __m128 v, const size_t i) {
#ifdef VC_GCC
        if (__builtin_constant_p(i)) {
            return extract_float_imm(v, i);
//X         if (index <= 1) {
//X             unsigned long long tmp = _mm_cvtsi128_si64(_mm_castps_si128(v));
//X             if (index == 0) tmp &= 0xFFFFFFFFull;
//X             if (index == 1) tmp >>= 32;
//X             return Common::AliasingEntryHelper<EntryType>(tmp);
//X         }
        } else {
            typedef float float4[4] MAY_ALIAS;
            const float4 &data = reinterpret_cast<const float4 &>(v);
            return data[i];
        }
#else
        union { __m128 v; float m[4]; } u;
        u.v = v;
        return u.m[i];
#endif
    }
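    // Example: extract_float(v, 2) returns the third element of v; with GCC and a
    // compile-time-constant index the call dispatches to extract_float_imm above.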

    // Streaming loads: with SSE4.1 these use movntdqa (_mm_stream_load_si128), otherwise they
    // fall back to ordinary aligned loads; in both cases mem must be 16-byte aligned.
    static inline __m128 INTRINSIC _mm_stream_load(const float *mem) {
#ifdef VC_IMPL_SSE4_1
        return _mm_castsi128_ps(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<float *>(mem))));
#else
        return _mm_load_ps(mem);
#endif
    }
    static inline __m128d INTRINSIC _mm_stream_load(const double *mem) {
#ifdef VC_IMPL_SSE4_1
        return _mm_castsi128_pd(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<double *>(mem))));
#else
        return _mm_load_pd(mem);
#endif
    }
    static inline __m128i INTRINSIC _mm_stream_load(const int *mem) {
#ifdef VC_IMPL_SSE4_1
        return _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<int *>(mem)));
#else
        return _mm_load_si128(reinterpret_cast<const __m128i *>(mem));
#endif
    }
    static inline __m128i INTRINSIC _mm_stream_load(const unsigned int *mem) {
        return _mm_stream_load(reinterpret_cast<const int *>(mem));
    }
    static inline __m128i INTRINSIC _mm_stream_load(const short *mem) {
        return _mm_stream_load(reinterpret_cast<const int *>(mem));
    }
    static inline __m128i INTRINSIC _mm_stream_load(const unsigned short *mem) {
        return _mm_stream_load(reinterpret_cast<const int *>(mem));
    }
    static inline __m128i INTRINSIC _mm_stream_load(const signed char *mem) {
        return _mm_stream_load(reinterpret_cast<const int *>(mem));
    }
    static inline __m128i INTRINSIC _mm_stream_load(const unsigned char *mem) {
        return _mm_stream_load(reinterpret_cast<const int *>(mem));
    }
} // namespace SSE
} // namespace Vc

#include "shuffle.h"

#endif // SSE_INTRINSICS_H