// NOTE: removed stray git-blame export residue ("]>" and a "Commit | Line | Data"
// table header) that preceded the file's license header and is not part of the source.
f22341db | 1 | /* This file is part of the Vc library. |
2 | ||
3 | Copyright (C) 2009-2012 Matthias Kretz <kretz@kde.org> | |
4 | ||
5 | Vc is free software: you can redistribute it and/or modify | |
6 | it under the terms of the GNU Lesser General Public License as | |
7 | published by the Free Software Foundation, either version 3 of | |
8 | the License, or (at your option) any later version. | |
9 | ||
10 | Vc is distributed in the hope that it will be useful, but | |
11 | WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 | GNU Lesser General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU Lesser General Public | |
16 | License along with Vc. If not, see <http://www.gnu.org/licenses/>. | |
17 | ||
18 | */ | |
19 | ||
20 | #ifndef SSE_INTRINSICS_H | |
21 | #define SSE_INTRINSICS_H | |
22 | ||
23 | #include "../common/windows_fix_intrin.h" | |
24 | ||
25 | // MMX | |
26 | #include <mmintrin.h> | |
27 | // SSE | |
28 | #include <xmmintrin.h> | |
29 | // SSE2 | |
30 | #include <emmintrin.h> | |
31 | ||
32 | #if defined(__GNUC__) && !defined(VC_IMPL_SSE2) | |
33 | #error "SSE Vector class needs at least SSE2" | |
34 | #endif | |
35 | ||
36 | #include "const_data.h" | |
37 | #include "macros.h" | |
38 | #include <cstdlib> | |
39 | ||
40 | #ifdef __3dNOW__ | |
41 | #include <mm3dnow.h> | |
42 | #endif | |
43 | ||
44 | namespace Vc | |
45 | { | |
46 | namespace SSE | |
47 | { | |
48 | enum VectorAlignmentEnum { VectorAlignment = 16 }; | |
49 | ||
50 | #if defined(VC_GCC) && VC_GCC < 0x40600 && !defined(VC_DONT_FIX_SSE_SHIFT) | |
51 | static inline __m128i CONST _mm_sll_epi16(__m128i a, __m128i count) { __asm__("psllw %1,%0" : "+x"(a) : "x"(count)); return a; } | |
52 | static inline __m128i CONST _mm_sll_epi32(__m128i a, __m128i count) { __asm__("pslld %1,%0" : "+x"(a) : "x"(count)); return a; } | |
53 | static inline __m128i CONST _mm_sll_epi64(__m128i a, __m128i count) { __asm__("psllq %1,%0" : "+x"(a) : "x"(count)); return a; } | |
54 | static inline __m128i CONST _mm_srl_epi16(__m128i a, __m128i count) { __asm__("psrlw %1,%0" : "+x"(a) : "x"(count)); return a; } | |
55 | static inline __m128i CONST _mm_srl_epi32(__m128i a, __m128i count) { __asm__("psrld %1,%0" : "+x"(a) : "x"(count)); return a; } | |
56 | static inline __m128i CONST _mm_srl_epi64(__m128i a, __m128i count) { __asm__("psrlq %1,%0" : "+x"(a) : "x"(count)); return a; } | |
57 | #endif | |
58 | ||
59 | #if defined(VC_GNU_ASM) && !defined(NVALGRIND) | |
60 | static inline __m128i CONST _mm_setallone() { __m128i r; __asm__("pcmpeqb %0,%0":"=x"(r)); return r; } | |
61 | #else | |
62 | static inline __m128i CONST _mm_setallone() { __m128i r = _mm_setzero_si128(); return _mm_cmpeq_epi8(r, r); } | |
63 | #endif | |
64 | static inline __m128i CONST _mm_setallone_si128() { return _mm_setallone(); } | |
65 | static inline __m128d CONST _mm_setallone_pd() { return _mm_castsi128_pd(_mm_setallone()); } | |
66 | static inline __m128 CONST _mm_setallone_ps() { return _mm_castsi128_ps(_mm_setallone()); } | |
67 | ||
68 | static inline __m128i CONST _mm_setone_epi8 () { return _mm_set1_epi8(1); } | |
69 | static inline __m128i CONST _mm_setone_epu8 () { return _mm_setone_epi8(); } | |
70 | static inline __m128i CONST _mm_setone_epi16() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::one16)); } | |
71 | static inline __m128i CONST _mm_setone_epu16() { return _mm_setone_epi16(); } | |
72 | static inline __m128i CONST _mm_setone_epi32() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::one32)); } | |
73 | static inline __m128i CONST _mm_setone_epu32() { return _mm_setone_epi32(); } | |
74 | ||
75 | static inline __m128 CONST _mm_setone_ps() { return _mm_load_ps(c_general::oneFloat); } | |
76 | static inline __m128d CONST _mm_setone_pd() { return _mm_load_pd(c_general::oneDouble); } | |
77 | ||
78 | static inline __m128d CONST _mm_setabsmask_pd() { return _mm_load_pd(reinterpret_cast<const double *>(c_general::absMaskDouble)); } | |
79 | static inline __m128 CONST _mm_setabsmask_ps() { return _mm_load_ps(reinterpret_cast<const float *>(c_general::absMaskFloat)); } | |
80 | static inline __m128d CONST _mm_setsignmask_pd(){ return _mm_load_pd(reinterpret_cast<const double *>(c_general::signMaskDouble)); } | |
81 | static inline __m128 CONST _mm_setsignmask_ps(){ return _mm_load_ps(reinterpret_cast<const float *>(c_general::signMaskFloat)); } | |
82 | ||
83 | //X static inline __m128i CONST _mm_setmin_epi8 () { return _mm_slli_epi8 (_mm_setallone_si128(), 7); } | |
84 | static inline __m128i CONST _mm_setmin_epi16() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::minShort)); } | |
85 | static inline __m128i CONST _mm_setmin_epi32() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::signMaskFloat)); } | |
86 | ||
87 | //X static inline __m128i CONST _mm_cmplt_epu8 (__m128i a, __m128i b) { return _mm_cmplt_epi8 ( | |
88 | //X _mm_xor_si128(a, _mm_setmin_epi8 ()), _mm_xor_si128(b, _mm_setmin_epi8 ())); } | |
89 | //X static inline __m128i CONST _mm_cmpgt_epu8 (__m128i a, __m128i b) { return _mm_cmpgt_epi8 ( | |
90 | //X _mm_xor_si128(a, _mm_setmin_epi8 ()), _mm_xor_si128(b, _mm_setmin_epi8 ())); } | |
91 | static inline __m128i CONST _mm_cmplt_epu16(__m128i a, __m128i b) { return _mm_cmplt_epi16( | |
92 | _mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16())); } | |
93 | static inline __m128i CONST _mm_cmpgt_epu16(__m128i a, __m128i b) { return _mm_cmpgt_epi16( | |
94 | _mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16())); } | |
95 | static inline __m128i CONST _mm_cmplt_epu32(__m128i a, __m128i b) { return _mm_cmplt_epi32( | |
96 | _mm_xor_si128(a, _mm_setmin_epi32()), _mm_xor_si128(b, _mm_setmin_epi32())); } | |
97 | static inline __m128i CONST _mm_cmpgt_epu32(__m128i a, __m128i b) { return _mm_cmpgt_epi32( | |
98 | _mm_xor_si128(a, _mm_setmin_epi32()), _mm_xor_si128(b, _mm_setmin_epi32())); } | |
99 | } // namespace SSE | |
100 | } // namespace Vc | |
101 | ||
102 | // SSE3 | |
103 | #ifdef VC_IMPL_SSE3 | |
104 | #include <pmmintrin.h> | |
105 | #elif defined _PMMINTRIN_H_INCLUDED | |
106 | #error "SSE3 was disabled but something includes <pmmintrin.h>. Please fix your code." | |
107 | #endif | |
108 | // SSSE3 | |
109 | #ifdef VC_IMPL_SSSE3 | |
110 | #include <tmmintrin.h> | |
111 | namespace Vc | |
112 | { | |
113 | namespace SSE | |
114 | { | |
115 | ||
116 | // not overriding _mm_set1_epi8 because this one should only be used for non-constants | |
117 | static inline __m128i CONST set1_epi8(int a) { | |
118 | #if defined(VC_GCC) && VC_GCC < 0x40500 | |
119 | return _mm_shuffle_epi8(_mm_cvtsi32_si128(a), _mm_setzero_si128()); | |
120 | #else | |
121 | // GCC 4.5 nows about the pshufb improvement | |
122 | return _mm_set1_epi8(a); | |
123 | #endif | |
124 | } | |
125 | ||
126 | } // namespace SSE | |
127 | } // namespace Vc | |
128 | #elif defined _TMMINTRIN_H_INCLUDED | |
129 | #error "SSSE3 was disabled but something includes <tmmintrin.h>. Please fix your code." | |
130 | #else | |
131 | namespace Vc | |
132 | { | |
133 | namespace SSE | |
134 | { | |
135 | static inline __m128i CONST _mm_abs_epi8 (__m128i a) { | |
136 | __m128i negative = _mm_cmplt_epi8 (a, _mm_setzero_si128()); | |
137 | return _mm_add_epi8 (_mm_xor_si128(a, negative), _mm_and_si128(negative, _mm_setone_epi8())); | |
138 | } | |
139 | // positive value: | |
140 | // negative == 0 | |
141 | // a unchanged after xor | |
142 | // 0 >> 31 -> 0 | |
143 | // a + 0 -> a | |
144 | // negative value: | |
145 | // negative == -1 | |
146 | // a xor -1 -> -a - 1 | |
147 | // -1 >> 31 -> 1 | |
148 | // -a - 1 + 1 -> -a | |
149 | static inline __m128i CONST _mm_abs_epi16(__m128i a) { | |
150 | __m128i negative = _mm_cmplt_epi16(a, _mm_setzero_si128()); | |
151 | return _mm_add_epi16(_mm_xor_si128(a, negative), _mm_srli_epi16(negative, 15)); | |
152 | } | |
153 | static inline __m128i CONST _mm_abs_epi32(__m128i a) { | |
154 | __m128i negative = _mm_cmplt_epi32(a, _mm_setzero_si128()); | |
155 | return _mm_add_epi32(_mm_xor_si128(a, negative), _mm_srli_epi32(negative, 31)); | |
156 | } | |
157 | static inline __m128i CONST set1_epi8(int a) { | |
158 | return _mm_set1_epi8(a); | |
159 | } | |
160 | static inline __m128i CONST _mm_alignr_epi8(__m128i a, __m128i b, const int s) { | |
161 | switch (s) { | |
162 | case 0: return b; | |
163 | case 1: return _mm_or_si128(_mm_slli_si128(a, 15), _mm_srli_si128(b, 1)); | |
164 | case 2: return _mm_or_si128(_mm_slli_si128(a, 14), _mm_srli_si128(b, 2)); | |
165 | case 3: return _mm_or_si128(_mm_slli_si128(a, 13), _mm_srli_si128(b, 3)); | |
166 | case 4: return _mm_or_si128(_mm_slli_si128(a, 12), _mm_srli_si128(b, 4)); | |
167 | case 5: return _mm_or_si128(_mm_slli_si128(a, 11), _mm_srli_si128(b, 5)); | |
168 | case 6: return _mm_or_si128(_mm_slli_si128(a, 10), _mm_srli_si128(b, 6)); | |
169 | case 7: return _mm_or_si128(_mm_slli_si128(a, 9), _mm_srli_si128(b, 7)); | |
170 | case 8: return _mm_or_si128(_mm_slli_si128(a, 8), _mm_srli_si128(b, 8)); | |
171 | case 9: return _mm_or_si128(_mm_slli_si128(a, 7), _mm_srli_si128(b, 9)); | |
172 | case 10: return _mm_or_si128(_mm_slli_si128(a, 6), _mm_srli_si128(b, 10)); | |
173 | case 11: return _mm_or_si128(_mm_slli_si128(a, 5), _mm_srli_si128(b, 11)); | |
174 | case 12: return _mm_or_si128(_mm_slli_si128(a, 4), _mm_srli_si128(b, 12)); | |
175 | case 13: return _mm_or_si128(_mm_slli_si128(a, 3), _mm_srli_si128(b, 13)); | |
176 | case 14: return _mm_or_si128(_mm_slli_si128(a, 2), _mm_srli_si128(b, 14)); | |
177 | case 15: return _mm_or_si128(_mm_slli_si128(a, 1), _mm_srli_si128(b, 15)); | |
178 | case 16: return a; | |
179 | case 17: return _mm_srli_si128(a, 1); | |
180 | case 18: return _mm_srli_si128(a, 2); | |
181 | case 19: return _mm_srli_si128(a, 3); | |
182 | case 20: return _mm_srli_si128(a, 4); | |
183 | case 21: return _mm_srli_si128(a, 5); | |
184 | case 22: return _mm_srli_si128(a, 6); | |
185 | case 23: return _mm_srli_si128(a, 7); | |
186 | case 24: return _mm_srli_si128(a, 8); | |
187 | case 25: return _mm_srli_si128(a, 9); | |
188 | case 26: return _mm_srli_si128(a, 10); | |
189 | case 27: return _mm_srli_si128(a, 11); | |
190 | case 28: return _mm_srli_si128(a, 12); | |
191 | case 29: return _mm_srli_si128(a, 13); | |
192 | case 30: return _mm_srli_si128(a, 14); | |
193 | case 31: return _mm_srli_si128(a, 15); | |
194 | } | |
195 | return _mm_setzero_si128(); | |
196 | } | |
197 | ||
198 | } // namespace SSE | |
199 | } // namespace Vc | |
200 | ||
201 | #endif | |
202 | ||
203 | // SSE4.1 | |
204 | #ifdef VC_IMPL_SSE4_1 | |
205 | #include <smmintrin.h> | |
206 | #else | |
207 | #ifdef _SMMINTRIN_H_INCLUDED | |
208 | #error "SSE4.1 was disabled but something includes <smmintrin.h>. Please fix your code." | |
209 | #endif | |
210 | namespace Vc | |
211 | { | |
212 | namespace SSE | |
213 | { | |
214 | static inline __m128d INTRINSIC _mm_blendv_pd(__m128d a, __m128d b, __m128d c) { | |
215 | return _mm_or_pd(_mm_andnot_pd(c, a), _mm_and_pd(c, b)); | |
216 | } | |
217 | static inline __m128 INTRINSIC _mm_blendv_ps(__m128 a, __m128 b, __m128 c) { | |
218 | return _mm_or_ps(_mm_andnot_ps(c, a), _mm_and_ps(c, b)); | |
219 | } | |
220 | static inline __m128i INTRINSIC _mm_blendv_epi8(__m128i a, __m128i b, __m128i c) { | |
221 | return _mm_or_si128(_mm_andnot_si128(c, a), _mm_and_si128(c, b)); | |
222 | } | |
223 | ||
224 | // only use the following blend functions with immediates as mask and, of course, compiling | |
225 | // with optimization | |
226 | static inline __m128d INTRINSIC _mm_blend_pd(__m128d a, __m128d b, const int mask) { | |
227 | switch (mask) { | |
228 | case 0x0: | |
229 | return a; | |
230 | case 0x1: | |
231 | return _mm_shuffle_pd(b, a, 2); | |
232 | case 0x2: | |
233 | return _mm_shuffle_pd(a, b, 2); | |
234 | case 0x3: | |
235 | return b; | |
236 | default: | |
237 | abort(); | |
238 | } | |
239 | } | |
240 | static inline __m128 INTRINSIC _mm_blend_ps(__m128 a, __m128 b, const int mask) { | |
241 | __m128i c; | |
242 | switch (mask) { | |
243 | case 0x0: | |
244 | return a; | |
245 | case 0x1: | |
246 | c = _mm_srli_si128(_mm_setallone_si128(), 12); | |
247 | break; | |
248 | case 0x2: | |
249 | c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 12), 4); | |
250 | break; | |
251 | case 0x3: | |
252 | c = _mm_srli_si128(_mm_setallone_si128(), 8); | |
253 | break; | |
254 | case 0x4: | |
255 | c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 12), 8); | |
256 | break; | |
257 | case 0x5: | |
258 | c = _mm_set_epi32(0, -1, 0, -1); | |
259 | break; | |
260 | case 0x6: | |
261 | c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 8), 4); | |
262 | break; | |
263 | case 0x7: | |
264 | c = _mm_srli_si128(_mm_setallone_si128(), 4); | |
265 | break; | |
266 | case 0x8: | |
267 | c = _mm_slli_si128(_mm_setallone_si128(), 12); | |
268 | break; | |
269 | case 0x9: | |
270 | c = _mm_set_epi32(-1, 0, 0, -1); | |
271 | break; | |
272 | case 0xa: | |
273 | c = _mm_set_epi32(-1, 0, -1, 0); | |
274 | break; | |
275 | case 0xb: | |
276 | c = _mm_set_epi32(-1, 0, -1, -1); | |
277 | break; | |
278 | case 0xc: | |
279 | c = _mm_slli_si128(_mm_setallone_si128(), 8); | |
280 | break; | |
281 | case 0xd: | |
282 | c = _mm_set_epi32(-1, -1, 0, -1); | |
283 | break; | |
284 | case 0xe: | |
285 | c = _mm_slli_si128(_mm_setallone_si128(), 4); | |
286 | break; | |
287 | case 0xf: | |
288 | return b; | |
289 | default: // may not happen | |
290 | abort(); | |
291 | c = _mm_setzero_si128(); | |
292 | break; | |
293 | } | |
294 | __m128 _c = _mm_castsi128_ps(c); | |
295 | return _mm_or_ps(_mm_andnot_ps(_c, a), _mm_and_ps(_c, b)); | |
296 | } | |
297 | static inline __m128i INTRINSIC _mm_blend_epi16(__m128i a, __m128i b, const int mask) { | |
298 | __m128i c; | |
299 | switch (mask) { | |
300 | case 0x00: | |
301 | return a; | |
302 | case 0x01: | |
303 | c = _mm_srli_si128(_mm_setallone_si128(), 14); | |
304 | break; | |
305 | case 0x03: | |
306 | c = _mm_srli_si128(_mm_setallone_si128(), 12); | |
307 | break; | |
308 | case 0x07: | |
309 | c = _mm_srli_si128(_mm_setallone_si128(), 10); | |
310 | break; | |
311 | case 0x0f: | |
312 | return _mm_unpackhi_epi64(_mm_slli_si128(b, 8), a); | |
313 | case 0x1f: | |
314 | c = _mm_srli_si128(_mm_setallone_si128(), 6); | |
315 | break; | |
316 | case 0x3f: | |
317 | c = _mm_srli_si128(_mm_setallone_si128(), 4); | |
318 | break; | |
319 | case 0x7f: | |
320 | c = _mm_srli_si128(_mm_setallone_si128(), 2); | |
321 | break; | |
322 | case 0x80: | |
323 | c = _mm_slli_si128(_mm_setallone_si128(), 14); | |
324 | break; | |
325 | case 0xc0: | |
326 | c = _mm_slli_si128(_mm_setallone_si128(), 12); | |
327 | break; | |
328 | case 0xe0: | |
329 | c = _mm_slli_si128(_mm_setallone_si128(), 10); | |
330 | break; | |
331 | case 0xf0: | |
332 | c = _mm_slli_si128(_mm_setallone_si128(), 8); | |
333 | break; | |
334 | case 0xf8: | |
335 | c = _mm_slli_si128(_mm_setallone_si128(), 6); | |
336 | break; | |
337 | case 0xfc: | |
338 | c = _mm_slli_si128(_mm_setallone_si128(), 4); | |
339 | break; | |
340 | case 0xfe: | |
341 | c = _mm_slli_si128(_mm_setallone_si128(), 2); | |
342 | break; | |
343 | case 0xff: | |
344 | return b; | |
345 | case 0xcc: | |
346 | return _mm_unpacklo_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 1, 3, 1))); | |
347 | case 0x33: | |
348 | return _mm_unpacklo_epi32(_mm_shuffle_epi32(b, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 3, 1))); | |
349 | default: | |
350 | const __m128i shift = _mm_set_epi16(0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000, -0x7fff); | |
351 | c = _mm_srai_epi16(_mm_mullo_epi16(_mm_set1_epi16(mask), shift), 15); | |
352 | break; | |
353 | } | |
354 | return _mm_or_si128(_mm_andnot_si128(c, a), _mm_and_si128(c, b)); | |
355 | } | |
356 | ||
357 | static inline __m128i CONST _mm_max_epi8 (__m128i a, __m128i b) { | |
358 | return _mm_blendv_epi8(b, a, _mm_cmpgt_epi8 (a, b)); | |
359 | } | |
360 | static inline __m128i CONST _mm_max_epi32(__m128i a, __m128i b) { | |
361 | return _mm_blendv_epi8(b, a, _mm_cmpgt_epi32(a, b)); | |
362 | } | |
363 | //X static inline __m128i CONST _mm_max_epu8 (__m128i a, __m128i b) { | |
364 | //X return _mm_blendv_epi8(b, a, _mm_cmpgt_epu8 (a, b)); | |
365 | //X } | |
366 | static inline __m128i CONST _mm_max_epu16(__m128i a, __m128i b) { | |
367 | return _mm_blendv_epi8(b, a, _mm_cmpgt_epu16(a, b)); | |
368 | } | |
369 | static inline __m128i CONST _mm_max_epu32(__m128i a, __m128i b) { | |
370 | return _mm_blendv_epi8(b, a, _mm_cmpgt_epu32(a, b)); | |
371 | } | |
372 | //X static inline __m128i CONST _mm_min_epu8 (__m128i a, __m128i b) { | |
373 | //X return _mm_blendv_epi8(a, b, _mm_cmpgt_epu8 (a, b)); | |
374 | //X } | |
375 | static inline __m128i CONST _mm_min_epu16(__m128i a, __m128i b) { | |
376 | return _mm_blendv_epi8(a, b, _mm_cmpgt_epu16(a, b)); | |
377 | } | |
378 | static inline __m128i CONST _mm_min_epu32(__m128i a, __m128i b) { | |
379 | return _mm_blendv_epi8(a, b, _mm_cmpgt_epu32(a, b)); | |
380 | } | |
381 | static inline __m128i CONST _mm_min_epi8 (__m128i a, __m128i b) { | |
382 | return _mm_blendv_epi8(a, b, _mm_cmpgt_epi8 (a, b)); | |
383 | } | |
384 | static inline __m128i CONST _mm_min_epi32(__m128i a, __m128i b) { | |
385 | return _mm_blendv_epi8(a, b, _mm_cmpgt_epi32(a, b)); | |
386 | } | |
387 | static inline __m128i INTRINSIC _mm_cvtepu8_epi16(__m128i epu8) { | |
388 | return _mm_unpacklo_epi8(epu8, _mm_setzero_si128()); | |
389 | } | |
390 | static inline __m128i INTRINSIC _mm_cvtepi8_epi16(__m128i epi8) { | |
391 | return _mm_unpacklo_epi8(epi8, _mm_cmplt_epi8(epi8, _mm_setzero_si128())); | |
392 | } | |
393 | static inline __m128i INTRINSIC _mm_cvtepu16_epi32(__m128i epu16) { | |
394 | return _mm_unpacklo_epi16(epu16, _mm_setzero_si128()); | |
395 | } | |
396 | static inline __m128i INTRINSIC _mm_cvtepi16_epi32(__m128i epu16) { | |
397 | return _mm_unpacklo_epi16(epu16, _mm_cmplt_epi16(epu16, _mm_setzero_si128())); | |
398 | } | |
399 | static inline __m128i INTRINSIC _mm_cvtepu8_epi32(__m128i epu8) { | |
400 | return _mm_cvtepu16_epi32(_mm_cvtepu8_epi16(epu8)); | |
401 | } | |
402 | static inline __m128i INTRINSIC _mm_cvtepi8_epi32(__m128i epi8) { | |
403 | const __m128i neg = _mm_cmplt_epi8(epi8, _mm_setzero_si128()); | |
404 | const __m128i epi16 = _mm_unpacklo_epi8(epi8, neg); | |
405 | return _mm_unpacklo_epi16(epi16, _mm_unpacklo_epi8(neg, neg)); | |
406 | } | |
407 | static inline __m128i INTRINSIC _mm_stream_load_si128(__m128i *mem) { | |
408 | return _mm_load_si128(mem); | |
409 | } | |
410 | ||
411 | } // namespace SSE | |
412 | } // namespace Vc | |
413 | #endif | |
414 | ||
415 | // SSE4.2 | |
416 | #ifdef VC_IMPL_SSE4_2 | |
417 | #include <nmmintrin.h> | |
418 | #elif defined _NMMINTRIN_H_INCLUDED | |
419 | #error "SSE4.2 was disabled but something includes <nmmintrin.h>. Please fix your code." | |
420 | #endif | |
421 | ||
422 | namespace Vc | |
423 | { | |
424 | namespace SSE | |
425 | { | |
426 | static inline float INTRINSIC extract_float_imm(const __m128 v, const size_t i) { | |
427 | float f; | |
428 | switch (i) { | |
429 | case 0: | |
430 | f = _mm_cvtss_f32(v); | |
431 | break; | |
432 | #if defined VC_IMPL_SSE4_1 && !defined VC_MSVC | |
433 | default: | |
434 | #ifdef VC_GCC | |
435 | f = __builtin_ia32_vec_ext_v4sf(static_cast<__v4sf>(v), (i)); | |
436 | #else | |
437 | // MSVC fails to compile this because it can't optimize i to an immediate | |
438 | _MM_EXTRACT_FLOAT(f, v, i); | |
439 | #endif | |
440 | break; | |
441 | #else | |
442 | case 1: | |
443 | f = _mm_cvtss_f32(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v), 4))); | |
444 | break; | |
445 | case 2: | |
446 | f = _mm_cvtss_f32(_mm_movehl_ps(v, v)); | |
447 | break; | |
448 | case 3: | |
449 | f = _mm_cvtss_f32(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v), 12))); | |
450 | break; | |
451 | #endif | |
452 | } | |
453 | return f; | |
454 | } | |
455 | static inline double INTRINSIC extract_double_imm(const __m128d v, const size_t i) { | |
456 | if (i == 0) { | |
457 | return _mm_cvtsd_f64(v); | |
458 | } | |
459 | return _mm_cvtsd_f64(_mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(v), _mm_castpd_ps(v)))); | |
460 | } | |
461 | static inline float INTRINSIC extract_float(const __m128 v, const size_t i) { | |
462 | #ifdef VC_GCC | |
463 | if (__builtin_constant_p(i)) { | |
464 | return extract_float_imm(v, i); | |
465 | //X if (index <= 1) { | |
466 | //X unsigned long long tmp = _mm_cvtsi128_si64(_mm_castps_si128(v)); | |
467 | //X if (index == 0) tmp &= 0xFFFFFFFFull; | |
468 | //X if (index == 1) tmp >>= 32; | |
469 | //X return Common::AliasingEntryHelper<EntryType>(tmp); | |
470 | //X } | |
471 | } else { | |
472 | typedef float float4[4] MAY_ALIAS; | |
473 | const float4 &data = reinterpret_cast<const float4 &>(v); | |
474 | return data[i]; | |
475 | } | |
476 | #else | |
477 | union { __m128 v; float m[4]; } u; | |
478 | u.v = v; | |
479 | return u.m[i]; | |
480 | #endif | |
481 | } | |
482 | ||
483 | static inline __m128 INTRINSIC _mm_stream_load(const float *mem) { | |
484 | #ifdef VC_IMPL_SSE4_1 | |
485 | return _mm_castsi128_ps(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<float *>(mem)))); | |
486 | #else | |
487 | return _mm_load_ps(mem); | |
488 | #endif | |
489 | } | |
490 | static inline __m128d INTRINSIC _mm_stream_load(const double *mem) { | |
491 | #ifdef VC_IMPL_SSE4_1 | |
492 | return _mm_castsi128_pd(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<double *>(mem)))); | |
493 | #else | |
494 | return _mm_load_pd(mem); | |
495 | #endif | |
496 | } | |
497 | static inline __m128i INTRINSIC _mm_stream_load(const int *mem) { | |
498 | #ifdef VC_IMPL_SSE4_1 | |
499 | return _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<int *>(mem))); | |
500 | #else | |
501 | return _mm_load_si128(reinterpret_cast<const __m128i *>(mem)); | |
502 | #endif | |
503 | } | |
504 | static inline __m128i INTRINSIC _mm_stream_load(const unsigned int *mem) { | |
505 | return _mm_stream_load(reinterpret_cast<const int *>(mem)); | |
506 | } | |
507 | static inline __m128i INTRINSIC _mm_stream_load(const short *mem) { | |
508 | return _mm_stream_load(reinterpret_cast<const int *>(mem)); | |
509 | } | |
510 | static inline __m128i INTRINSIC _mm_stream_load(const unsigned short *mem) { | |
511 | return _mm_stream_load(reinterpret_cast<const int *>(mem)); | |
512 | } | |
513 | static inline __m128i INTRINSIC _mm_stream_load(const signed char *mem) { | |
514 | return _mm_stream_load(reinterpret_cast<const int *>(mem)); | |
515 | } | |
516 | static inline __m128i INTRINSIC _mm_stream_load(const unsigned char *mem) { | |
517 | return _mm_stream_load(reinterpret_cast<const int *>(mem)); | |
518 | } | |
519 | } // namespace SSE | |
520 | } // namespace Vc | |
521 | ||
522 | #include "shuffle.h" | |
523 | ||
524 | #endif // SSE_INTRINSICS_H |