/*  This file is part of the Vc library.

    Copyright (C) 2009-2012 Matthias Kretz <kretz@kde.org>

    Vc is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as
    published by the Free Software Foundation, either version 3 of
    the License, or (at your option) any later version.

    Vc is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with Vc. If not, see <http://www.gnu.org/licenses/>.

*/

#ifndef SSE_INTRINSICS_H
#define SSE_INTRINSICS_H

#include "../common/windows_fix_intrin.h"

// The GCC xxxintrin.h headers do not make sure that the intrinsics have C linkage. This is not
// really a problem, unless there is another place where the exact same functions are declared.
// Then the linkage must be the same, otherwise it won't compile. Such a case occurs on Windows,
// where the intrin.h header (included indirectly via unistd.h) declares many SSE intrinsics again.
extern "C" {
// MMX
#include <mmintrin.h>
// SSE
#include <xmmintrin.h>
// SSE2
#include <emmintrin.h>
}

#include "../common/fix_clang_emmintrin.h"

#if defined(__GNUC__) && !defined(VC_IMPL_SSE2)
#error "SSE Vector class needs at least SSE2"
#endif

#include "const_data.h"
#include <cstdlib>
#include "macros.h"

#ifdef __3dNOW__
extern "C" {
#include <mm3dnow.h>
}
#endif

namespace AliRoot {
namespace Vc
{
namespace SSE
{
    enum VectorAlignmentEnum { VectorAlignment = 16 };

#if defined(VC_GCC) && VC_GCC < 0x40600 && !defined(VC_DONT_FIX_SSE_SHIFT)
    static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi16(__m128i a, __m128i count) { __asm__("psllw %1,%0" : "+x"(a) : "x"(count)); return a; }
    static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi32(__m128i a, __m128i count) { __asm__("pslld %1,%0" : "+x"(a) : "x"(count)); return a; }
    static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi64(__m128i a, __m128i count) { __asm__("psllq %1,%0" : "+x"(a) : "x"(count)); return a; }
    static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi16(__m128i a, __m128i count) { __asm__("psrlw %1,%0" : "+x"(a) : "x"(count)); return a; }
    static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi32(__m128i a, __m128i count) { __asm__("psrld %1,%0" : "+x"(a) : "x"(count)); return a; }
    static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi64(__m128i a, __m128i count) { __asm__("psrlq %1,%0" : "+x"(a) : "x"(count)); return a; }
#endif

#ifdef VC_GCC
    // Redefine the mul/add/sub intrinsics to use GCC-specific operators instead of builtin
    // functions. This way the fp-contraction optimization step kicks in and creates FMAs! :)
    static Vc_INTRINSIC Vc_CONST __m128d _mm_mul_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) * static_cast<__v2df>(b)); }
    static Vc_INTRINSIC Vc_CONST __m128d _mm_add_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) + static_cast<__v2df>(b)); }
    static Vc_INTRINSIC Vc_CONST __m128d _mm_sub_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) - static_cast<__v2df>(b)); }
    static Vc_INTRINSIC Vc_CONST __m128  _mm_mul_ps(__m128  a, __m128  b) { return static_cast<__m128 >(static_cast<__v4sf>(a) * static_cast<__v4sf>(b)); }
    static Vc_INTRINSIC Vc_CONST __m128  _mm_add_ps(__m128  a, __m128  b) { return static_cast<__m128 >(static_cast<__v4sf>(a) + static_cast<__v4sf>(b)); }
    static Vc_INTRINSIC Vc_CONST __m128  _mm_sub_ps(__m128  a, __m128  b) { return static_cast<__m128 >(static_cast<__v4sf>(a) - static_cast<__v4sf>(b)); }
#endif
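
    // Illustrative sketch (added note, not part of the original header): with the operator-based
    // definitions above and fp contraction enabled (e.g. GCC's -ffp-contract=fast), a separate
    // multiply and add written with these intrinsics may be contracted into a single fused
    // multiply-add when the target supports it (e.g. compiling with -mfma):
    //
    //     __m128 fma_sketch(__m128 a, __m128 b, __m128 c) {
    //         return _mm_add_ps(_mm_mul_ps(a, b), c); // may become one vfmadd instead of mulps + addps
    //     }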
80 | ||
81 | #if defined(VC_GNU_ASM) && !defined(NVALGRIND) | |
c017a39f | 82 | static Vc_INTRINSIC __m128i Vc_CONST _mm_setallone() { __m128i r; __asm__("pcmpeqb %0,%0":"=x"(r)); return r; } |
f22341db | 83 | #else |
c017a39f | 84 | static Vc_INTRINSIC __m128i Vc_CONST _mm_setallone() { __m128i r = _mm_setzero_si128(); return _mm_cmpeq_epi8(r, r); } |
f22341db | 85 | #endif |
c017a39f | 86 | static Vc_INTRINSIC __m128i Vc_CONST _mm_setallone_si128() { return _mm_setallone(); } |
87 | static Vc_INTRINSIC __m128d Vc_CONST _mm_setallone_pd() { return _mm_castsi128_pd(_mm_setallone()); } | |
88 | static Vc_INTRINSIC __m128 Vc_CONST _mm_setallone_ps() { return _mm_castsi128_ps(_mm_setallone()); } | |
89 | ||
90 | static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi8 () { return _mm_set1_epi8(1); } | |
91 | static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu8 () { return _mm_setone_epi8(); } | |
92 | static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi16() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::one16)); } | |
93 | static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu16() { return _mm_setone_epi16(); } | |
94 | static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi32() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::one32)); } | |
95 | static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu32() { return _mm_setone_epi32(); } | |
96 | ||
97 | static Vc_INTRINSIC __m128 Vc_CONST _mm_setone_ps() { return _mm_load_ps(c_general::oneFloat); } | |
98 | static Vc_INTRINSIC __m128d Vc_CONST _mm_setone_pd() { return _mm_load_pd(c_general::oneDouble); } | |
99 | ||
100 | static Vc_INTRINSIC __m128d Vc_CONST _mm_setabsmask_pd() { return _mm_load_pd(reinterpret_cast<const double *>(c_general::absMaskDouble)); } | |
101 | static Vc_INTRINSIC __m128 Vc_CONST _mm_setabsmask_ps() { return _mm_load_ps(reinterpret_cast<const float *>(c_general::absMaskFloat)); } | |
102 | static Vc_INTRINSIC __m128d Vc_CONST _mm_setsignmask_pd(){ return _mm_load_pd(reinterpret_cast<const double *>(c_general::signMaskDouble)); } | |
103 | static Vc_INTRINSIC __m128 Vc_CONST _mm_setsignmask_ps(){ return _mm_load_ps(reinterpret_cast<const float *>(c_general::signMaskFloat)); } | |
104 | ||
105 | //X static Vc_INTRINSIC __m128i Vc_CONST _mm_setmin_epi8 () { return _mm_slli_epi8 (_mm_setallone_si128(), 7); } | |
106 | static Vc_INTRINSIC __m128i Vc_CONST _mm_setmin_epi16() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::minShort)); } | |
107 | static Vc_INTRINSIC __m128i Vc_CONST _mm_setmin_epi32() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::signMaskFloat)); } | |
108 | ||
    //X static Vc_INTRINSIC __m128i Vc_CONST _mm_cmplt_epu8 (__m128i a, __m128i b) { return _mm_cmplt_epi8 (
    //X         _mm_xor_si128(a, _mm_setmin_epi8 ()), _mm_xor_si128(b, _mm_setmin_epi8 ())); }
    //X static Vc_INTRINSIC __m128i Vc_CONST _mm_cmpgt_epu8 (__m128i a, __m128i b) { return _mm_cmpgt_epi8 (
    //X         _mm_xor_si128(a, _mm_setmin_epi8 ()), _mm_xor_si128(b, _mm_setmin_epi8 ())); }
    static Vc_INTRINSIC __m128i Vc_CONST _mm_cmplt_epu16(__m128i a, __m128i b) { return _mm_cmplt_epi16(
            _mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16())); }
    static Vc_INTRINSIC __m128i Vc_CONST _mm_cmpgt_epu16(__m128i a, __m128i b) { return _mm_cmpgt_epi16(
            _mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16())); }
    static Vc_INTRINSIC __m128i Vc_CONST _mm_cmplt_epu32(__m128i a, __m128i b) { return _mm_cmplt_epi32(
            _mm_xor_si128(a, _mm_setmin_epi32()), _mm_xor_si128(b, _mm_setmin_epi32())); }
    static Vc_INTRINSIC __m128i Vc_CONST _mm_cmpgt_epu32(__m128i a, __m128i b) { return _mm_cmpgt_epi32(
            _mm_xor_si128(a, _mm_setmin_epi32()), _mm_xor_si128(b, _mm_setmin_epi32())); }
} // namespace SSE
} // namespace Vc
} // namespace AliRoot

// SSE3
#ifdef VC_IMPL_SSE3
extern "C" {
#include <pmmintrin.h>
}
#elif defined _PMMINTRIN_H_INCLUDED
#error "SSE3 was disabled but something includes <pmmintrin.h>. Please fix your code."
#endif
// SSSE3
#ifdef VC_IMPL_SSSE3
extern "C" {
#include <tmmintrin.h>
}
namespace AliRoot {
namespace Vc
{
namespace SSE
{

    // not overriding _mm_set1_epi8 because this one should only be used for non-constants
    static Vc_INTRINSIC __m128i Vc_CONST set1_epi8(int a) {
#if defined(VC_GCC) && VC_GCC < 0x40500
        return _mm_shuffle_epi8(_mm_cvtsi32_si128(a), _mm_setzero_si128());
#else
        // GCC 4.5 knows about the pshufb improvement
        return _mm_set1_epi8(a);
#endif
    }

} // namespace SSE
} // namespace Vc
} // namespace AliRoot
#elif defined _TMMINTRIN_H_INCLUDED
#error "SSSE3 was disabled but something includes <tmmintrin.h>. Please fix your code."
#else
namespace AliRoot {
namespace Vc
{
namespace SSE
{
    static Vc_INTRINSIC __m128i Vc_CONST _mm_abs_epi8 (__m128i a) {
        __m128i negative = _mm_cmplt_epi8 (a, _mm_setzero_si128());
        return _mm_add_epi8 (_mm_xor_si128(a, negative), _mm_and_si128(negative, _mm_setone_epi8()));
    }
    // positive value:
    //   negative == 0
    //   a unchanged after xor
    //   0 >> 31 -> 0
    //   a + 0 -> a
    // negative value:
    //   negative == -1
    //   a xor -1 -> -a - 1
    //   -1 >> 31 -> 1
    //   -a - 1 + 1 -> -a
    static Vc_INTRINSIC __m128i Vc_CONST _mm_abs_epi16(__m128i a) {
        __m128i negative = _mm_cmplt_epi16(a, _mm_setzero_si128());
        return _mm_add_epi16(_mm_xor_si128(a, negative), _mm_srli_epi16(negative, 15));
    }
    static Vc_INTRINSIC __m128i Vc_CONST _mm_abs_epi32(__m128i a) {
        __m128i negative = _mm_cmplt_epi32(a, _mm_setzero_si128());
        return _mm_add_epi32(_mm_xor_si128(a, negative), _mm_srli_epi32(negative, 31));
    }
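    // Added usage sketch (illustrative only; assumes an SSE2-only build where these fallbacks
    // are active, plus <climits> for INT_MIN):
    //
    //     __m128i x = _mm_set_epi32(-7, 3, 0, INT_MIN);
    //     __m128i y = _mm_abs_epi32(x); // abs of {-7, 3, 0, INT_MIN} is {7, 3, 0, INT_MIN};
    //                                   // INT_MIN has no positive counterpart, matching SSSE3 pabsd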
    static Vc_INTRINSIC __m128i Vc_CONST set1_epi8(int a) {
        return _mm_set1_epi8(a);
    }
    static Vc_INTRINSIC __m128i Vc_CONST _mm_alignr_epi8(__m128i a, __m128i b, const int s) {
        switch (s) {
            case  0: return b;
            case  1: return _mm_or_si128(_mm_slli_si128(a, 15), _mm_srli_si128(b,  1));
            case  2: return _mm_or_si128(_mm_slli_si128(a, 14), _mm_srli_si128(b,  2));
            case  3: return _mm_or_si128(_mm_slli_si128(a, 13), _mm_srli_si128(b,  3));
            case  4: return _mm_or_si128(_mm_slli_si128(a, 12), _mm_srli_si128(b,  4));
            case  5: return _mm_or_si128(_mm_slli_si128(a, 11), _mm_srli_si128(b,  5));
            case  6: return _mm_or_si128(_mm_slli_si128(a, 10), _mm_srli_si128(b,  6));
            case  7: return _mm_or_si128(_mm_slli_si128(a,  9), _mm_srli_si128(b,  7));
            case  8: return _mm_or_si128(_mm_slli_si128(a,  8), _mm_srli_si128(b,  8));
            case  9: return _mm_or_si128(_mm_slli_si128(a,  7), _mm_srli_si128(b,  9));
            case 10: return _mm_or_si128(_mm_slli_si128(a,  6), _mm_srli_si128(b, 10));
            case 11: return _mm_or_si128(_mm_slli_si128(a,  5), _mm_srli_si128(b, 11));
            case 12: return _mm_or_si128(_mm_slli_si128(a,  4), _mm_srli_si128(b, 12));
            case 13: return _mm_or_si128(_mm_slli_si128(a,  3), _mm_srli_si128(b, 13));
            case 14: return _mm_or_si128(_mm_slli_si128(a,  2), _mm_srli_si128(b, 14));
            case 15: return _mm_or_si128(_mm_slli_si128(a,  1), _mm_srli_si128(b, 15));
            case 16: return a;
            case 17: return _mm_srli_si128(a,  1);
            case 18: return _mm_srli_si128(a,  2);
            case 19: return _mm_srli_si128(a,  3);
            case 20: return _mm_srli_si128(a,  4);
            case 21: return _mm_srli_si128(a,  5);
            case 22: return _mm_srli_si128(a,  6);
            case 23: return _mm_srli_si128(a,  7);
            case 24: return _mm_srli_si128(a,  8);
            case 25: return _mm_srli_si128(a,  9);
            case 26: return _mm_srli_si128(a, 10);
            case 27: return _mm_srli_si128(a, 11);
            case 28: return _mm_srli_si128(a, 12);
            case 29: return _mm_srli_si128(a, 13);
            case 30: return _mm_srli_si128(a, 14);
            case 31: return _mm_srli_si128(a, 15);
        }
        return _mm_setzero_si128();
    }
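    // Added usage sketch (illustrative only): like SSSE3 palignr, this returns 16 consecutive
    // bytes from the 32-byte concatenation a:b, starting at byte offset s within b. The switch
    // collapses to a single shift/or pair when s is a compile-time constant and optimization is
    // enabled. Here `data` is a hypothetical buffer of at least 32 unsigned chars:
    //
    //     __m128i lo = _mm_loadu_si128(reinterpret_cast<const __m128i *>(data));      // bytes  0..15
    //     __m128i hi = _mm_loadu_si128(reinterpret_cast<const __m128i *>(data + 16)); // bytes 16..31
    //     __m128i w  = _mm_alignr_epi8(hi, lo, 4);                                    // bytes  4..19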
227 | ||
228 | } // namespace SSE | |
229 | } // namespace Vc | |
c017a39f | 230 | } // namespace AliRoot |
f22341db | 231 | |
232 | #endif | |
233 | ||
234 | // SSE4.1 | |
235 | #ifdef VC_IMPL_SSE4_1 | |
c017a39f | 236 | extern "C" { |
f22341db | 237 | #include <smmintrin.h> |
c017a39f | 238 | } |
f22341db | 239 | #else |
240 | #ifdef _SMMINTRIN_H_INCLUDED | |
241 | #error "SSE4.1 was disabled but something includes <smmintrin.h>. Please fix your code." | |
242 | #endif | |
c017a39f | 243 | namespace AliRoot { |
f22341db | 244 | namespace Vc |
245 | { | |
246 | namespace SSE | |
247 | { | |
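    // Added note: these are fallbacks for the SSE4.1 blend intrinsics. One semantic difference
    // to the hardware instructions: blendv(ps/pd/epi8) selects on the sign bit (MSB) of each
    // element of c only, whereas the and/andnot/or emulations below expect c to be a full
    // per-element mask (all bits set or all bits clear), as produced by the SSE compare intrinsics.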
    static Vc_INTRINSIC __m128d _mm_blendv_pd(__m128d a, __m128d b, __m128d c) {
        return _mm_or_pd(_mm_andnot_pd(c, a), _mm_and_pd(c, b));
    }
    static Vc_INTRINSIC __m128  _mm_blendv_ps(__m128  a, __m128  b, __m128  c) {
        return _mm_or_ps(_mm_andnot_ps(c, a), _mm_and_ps(c, b));
    }
    static Vc_INTRINSIC __m128i _mm_blendv_epi8(__m128i a, __m128i b, __m128i c) {
        return _mm_or_si128(_mm_andnot_si128(c, a), _mm_and_si128(c, b));
    }

    // Only use the following blend functions with an immediate (compile-time constant) mask and,
    // of course, with optimization enabled (see the usage sketch after these functions).
    static Vc_INTRINSIC __m128d _mm_blend_pd(__m128d a, __m128d b, const int mask) {
        switch (mask) {
        case 0x0:
            return a;
        case 0x1:
            return _mm_shuffle_pd(b, a, 2);
        case 0x2:
            return _mm_shuffle_pd(a, b, 2);
        case 0x3:
            return b;
        default:
            abort();
            return a; // should never be reached, but MSVC needs it else it warns about 'not all control paths return a value'
        }
    }
    static Vc_INTRINSIC __m128 _mm_blend_ps(__m128 a, __m128 b, const int mask) {
        __m128i c;
        switch (mask) {
        case 0x0:
            return a;
        case 0x1:
            c = _mm_srli_si128(_mm_setallone_si128(), 12);
            break;
        case 0x2:
            c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 12), 4);
            break;
        case 0x3:
            c = _mm_srli_si128(_mm_setallone_si128(), 8);
            break;
        case 0x4:
            c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 12), 8);
            break;
        case 0x5:
            c = _mm_set_epi32(0, -1, 0, -1);
            break;
        case 0x6:
            c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 8), 4);
            break;
        case 0x7:
            c = _mm_srli_si128(_mm_setallone_si128(), 4);
            break;
        case 0x8:
            c = _mm_slli_si128(_mm_setallone_si128(), 12);
            break;
        case 0x9:
            c = _mm_set_epi32(-1, 0, 0, -1);
            break;
        case 0xa:
            c = _mm_set_epi32(-1, 0, -1, 0);
            break;
        case 0xb:
            c = _mm_set_epi32(-1, 0, -1, -1);
            break;
        case 0xc:
            c = _mm_slli_si128(_mm_setallone_si128(), 8);
            break;
        case 0xd:
            c = _mm_set_epi32(-1, -1, 0, -1);
            break;
        case 0xe:
            c = _mm_slli_si128(_mm_setallone_si128(), 4);
            break;
        case 0xf:
            return b;
        default: // may not happen
            abort();
            c = _mm_setzero_si128();
            break;
        }
        __m128 _c = _mm_castsi128_ps(c);
        return _mm_or_ps(_mm_andnot_ps(_c, a), _mm_and_ps(_c, b));
    }
    static Vc_INTRINSIC __m128i _mm_blend_epi16(__m128i a, __m128i b, const int mask) {
        __m128i c;
        switch (mask) {
        case 0x00:
            return a;
        case 0x01:
            c = _mm_srli_si128(_mm_setallone_si128(), 14);
            break;
        case 0x03:
            c = _mm_srli_si128(_mm_setallone_si128(), 12);
            break;
        case 0x07:
            c = _mm_srli_si128(_mm_setallone_si128(), 10);
            break;
        case 0x0f:
            return _mm_unpackhi_epi64(_mm_slli_si128(b, 8), a);
        case 0x1f:
            c = _mm_srli_si128(_mm_setallone_si128(), 6);
            break;
        case 0x3f:
            c = _mm_srli_si128(_mm_setallone_si128(), 4);
            break;
        case 0x7f:
            c = _mm_srli_si128(_mm_setallone_si128(), 2);
            break;
        case 0x80:
            c = _mm_slli_si128(_mm_setallone_si128(), 14);
            break;
        case 0xc0:
            c = _mm_slli_si128(_mm_setallone_si128(), 12);
            break;
        case 0xe0:
            c = _mm_slli_si128(_mm_setallone_si128(), 10);
            break;
        case 0xf0:
            c = _mm_slli_si128(_mm_setallone_si128(), 8);
            break;
        case 0xf8:
            c = _mm_slli_si128(_mm_setallone_si128(), 6);
            break;
        case 0xfc:
            c = _mm_slli_si128(_mm_setallone_si128(), 4);
            break;
        case 0xfe:
            c = _mm_slli_si128(_mm_setallone_si128(), 2);
            break;
        case 0xff:
            return b;
        case 0xcc:
            return _mm_unpacklo_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 1, 3, 1)));
        case 0x33:
            return _mm_unpacklo_epi32(_mm_shuffle_epi32(b, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 3, 1)));
        default:
            const __m128i shift = _mm_set_epi16(0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000, -0x7fff);
            c = _mm_srai_epi16(_mm_mullo_epi16(_mm_set1_epi16(mask), shift), 15);
            break;
        }
        return _mm_or_si128(_mm_andnot_si128(c, a), _mm_and_si128(c, b));
    }
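
    // Added usage sketch (illustrative only): with a compile-time constant mask and optimization
    // enabled, the switch above collapses to the precomputed mask sequence, mirroring SSE4.1
    // blendps, which takes element i from b where mask bit i is set:
    //
    //     __m128 a = _mm_set_ps(4.f, 3.f, 2.f, 1.f); // {1, 2, 3, 4}
    //     __m128 b = _mm_set_ps(8.f, 7.f, 6.f, 5.f); // {5, 6, 7, 8}
    //     __m128 r = _mm_blend_ps(a, b, 0x5);        // {5, 2, 7, 4}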
391 | ||
c017a39f | 392 | static Vc_INTRINSIC __m128i Vc_CONST _mm_max_epi8 (__m128i a, __m128i b) { |
f22341db | 393 | return _mm_blendv_epi8(b, a, _mm_cmpgt_epi8 (a, b)); |
394 | } | |
c017a39f | 395 | static Vc_INTRINSIC __m128i Vc_CONST _mm_max_epi32(__m128i a, __m128i b) { |
f22341db | 396 | return _mm_blendv_epi8(b, a, _mm_cmpgt_epi32(a, b)); |
397 | } | |
c017a39f | 398 | //X static Vc_INTRINSIC __m128i Vc_CONST _mm_max_epu8 (__m128i a, __m128i b) { |
f22341db | 399 | //X return _mm_blendv_epi8(b, a, _mm_cmpgt_epu8 (a, b)); |
400 | //X } | |
c017a39f | 401 | static Vc_INTRINSIC __m128i Vc_CONST _mm_max_epu16(__m128i a, __m128i b) { |
f22341db | 402 | return _mm_blendv_epi8(b, a, _mm_cmpgt_epu16(a, b)); |
403 | } | |
c017a39f | 404 | static Vc_INTRINSIC __m128i Vc_CONST _mm_max_epu32(__m128i a, __m128i b) { |
f22341db | 405 | return _mm_blendv_epi8(b, a, _mm_cmpgt_epu32(a, b)); |
406 | } | |
c017a39f | 407 | //X static Vc_INTRINSIC __m128i Vc_CONST _mm_min_epu8 (__m128i a, __m128i b) { |
f22341db | 408 | //X return _mm_blendv_epi8(a, b, _mm_cmpgt_epu8 (a, b)); |
409 | //X } | |
c017a39f | 410 | static Vc_INTRINSIC __m128i Vc_CONST _mm_min_epu16(__m128i a, __m128i b) { |
f22341db | 411 | return _mm_blendv_epi8(a, b, _mm_cmpgt_epu16(a, b)); |
412 | } | |
c017a39f | 413 | static Vc_INTRINSIC __m128i Vc_CONST _mm_min_epu32(__m128i a, __m128i b) { |
f22341db | 414 | return _mm_blendv_epi8(a, b, _mm_cmpgt_epu32(a, b)); |
415 | } | |
c017a39f | 416 | static Vc_INTRINSIC __m128i Vc_CONST _mm_min_epi8 (__m128i a, __m128i b) { |
f22341db | 417 | return _mm_blendv_epi8(a, b, _mm_cmpgt_epi8 (a, b)); |
418 | } | |
c017a39f | 419 | static Vc_INTRINSIC __m128i Vc_CONST _mm_min_epi32(__m128i a, __m128i b) { |
f22341db | 420 | return _mm_blendv_epi8(a, b, _mm_cmpgt_epi32(a, b)); |
421 | } | |
c017a39f | 422 | static Vc_INTRINSIC Vc_CONST __m128i _mm_cvtepu8_epi16(__m128i epu8) { |
f22341db | 423 | return _mm_unpacklo_epi8(epu8, _mm_setzero_si128()); |
424 | } | |
c017a39f | 425 | static Vc_INTRINSIC Vc_CONST __m128i _mm_cvtepi8_epi16(__m128i epi8) { |
f22341db | 426 | return _mm_unpacklo_epi8(epi8, _mm_cmplt_epi8(epi8, _mm_setzero_si128())); |
427 | } | |
c017a39f | 428 | static Vc_INTRINSIC Vc_CONST __m128i _mm_cvtepu16_epi32(__m128i epu16) { |
f22341db | 429 | return _mm_unpacklo_epi16(epu16, _mm_setzero_si128()); |
430 | } | |
c017a39f | 431 | static Vc_INTRINSIC Vc_CONST __m128i _mm_cvtepi16_epi32(__m128i epu16) { |
f22341db | 432 | return _mm_unpacklo_epi16(epu16, _mm_cmplt_epi16(epu16, _mm_setzero_si128())); |
433 | } | |
c017a39f | 434 | static Vc_INTRINSIC Vc_CONST __m128i _mm_cvtepu8_epi32(__m128i epu8) { |
f22341db | 435 | return _mm_cvtepu16_epi32(_mm_cvtepu8_epi16(epu8)); |
436 | } | |
c017a39f | 437 | static Vc_INTRINSIC Vc_CONST __m128i _mm_cvtepi8_epi32(__m128i epi8) { |
f22341db | 438 | const __m128i neg = _mm_cmplt_epi8(epi8, _mm_setzero_si128()); |
439 | const __m128i epi16 = _mm_unpacklo_epi8(epi8, neg); | |
440 | return _mm_unpacklo_epi16(epi16, _mm_unpacklo_epi8(neg, neg)); | |
441 | } | |
c017a39f | 442 | static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load_si128(__m128i *mem) { |
f22341db | 443 | return _mm_load_si128(mem); |
444 | } | |
445 | ||
446 | } // namespace SSE | |
447 | } // namespace Vc | |
c017a39f | 448 | } // namespace AliRoot |
449 | #endif | |
450 | ||
451 | #ifdef VC_IMPL_POPCNT | |
452 | #include <popcntintrin.h> | |
f22341db | 453 | #endif |
454 | ||
455 | // SSE4.2 | |
456 | #ifdef VC_IMPL_SSE4_2 | |
c017a39f | 457 | extern "C" { |
f22341db | 458 | #include <nmmintrin.h> |
c017a39f | 459 | } |
f22341db | 460 | #elif defined _NMMINTRIN_H_INCLUDED |
461 | #error "SSE4.2 was disabled but something includes <nmmintrin.h>. Please fix your code." | |
462 | #endif | |
463 | ||
c017a39f | 464 | namespace AliRoot { |
f22341db | 465 | namespace Vc |
466 | { | |
467 | namespace SSE | |
468 | { | |
c017a39f | 469 | static Vc_INTRINSIC Vc_CONST float extract_float_imm(const __m128 v, const size_t i) { |
f22341db | 470 | float f; |
471 | switch (i) { | |
472 | case 0: | |
473 | f = _mm_cvtss_f32(v); | |
474 | break; | |
475 | #if defined VC_IMPL_SSE4_1 && !defined VC_MSVC | |
476 | default: | |
477 | #ifdef VC_GCC | |
478 | f = __builtin_ia32_vec_ext_v4sf(static_cast<__v4sf>(v), (i)); | |
479 | #else | |
480 | // MSVC fails to compile this because it can't optimize i to an immediate | |
481 | _MM_EXTRACT_FLOAT(f, v, i); | |
482 | #endif | |
483 | break; | |
484 | #else | |
485 | case 1: | |
486 | f = _mm_cvtss_f32(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v), 4))); | |
487 | break; | |
488 | case 2: | |
489 | f = _mm_cvtss_f32(_mm_movehl_ps(v, v)); | |
490 | break; | |
491 | case 3: | |
492 | f = _mm_cvtss_f32(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v), 12))); | |
493 | break; | |
494 | #endif | |
495 | } | |
496 | return f; | |
497 | } | |
c017a39f | 498 | static Vc_INTRINSIC Vc_CONST double extract_double_imm(const __m128d v, const size_t i) { |
f22341db | 499 | if (i == 0) { |
500 | return _mm_cvtsd_f64(v); | |
501 | } | |
502 | return _mm_cvtsd_f64(_mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(v), _mm_castpd_ps(v)))); | |
503 | } | |
c017a39f | 504 | static Vc_INTRINSIC Vc_CONST float extract_float(const __m128 v, const size_t i) { |
f22341db | 505 | #ifdef VC_GCC |
506 | if (__builtin_constant_p(i)) { | |
507 | return extract_float_imm(v, i); | |
508 | //X if (index <= 1) { | |
509 | //X unsigned long long tmp = _mm_cvtsi128_si64(_mm_castps_si128(v)); | |
510 | //X if (index == 0) tmp &= 0xFFFFFFFFull; | |
511 | //X if (index == 1) tmp >>= 32; | |
512 | //X return Common::AliasingEntryHelper<EntryType>(tmp); | |
513 | //X } | |
514 | } else { | |
c017a39f | 515 | typedef float float4[4] Vc_MAY_ALIAS; |
f22341db | 516 | const float4 &data = reinterpret_cast<const float4 &>(v); |
517 | return data[i]; | |
518 | } | |
519 | #else | |
520 | union { __m128 v; float m[4]; } u; | |
521 | u.v = v; | |
522 | return u.m[i]; | |
523 | #endif | |
524 | } | |
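
    // Added note (illustrative): extract_float takes the immediate-index path when GCC can prove
    // the index is a compile-time constant and falls back to an aliasing array access otherwise:
    //
    //     __m128 v = _mm_set_ps(3.f, 2.f, 1.f, 0.f);
    //     float  x = extract_float(v, 2); // 2.f, via extract_float_imm when 2 is seen as constant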
525 | ||
c017a39f | 526 | static Vc_INTRINSIC Vc_PURE __m128 _mm_stream_load(const float *mem) { |
f22341db | 527 | #ifdef VC_IMPL_SSE4_1 |
528 | return _mm_castsi128_ps(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<float *>(mem)))); | |
529 | #else | |
530 | return _mm_load_ps(mem); | |
531 | #endif | |
532 | } | |
c017a39f | 533 | static Vc_INTRINSIC Vc_PURE __m128d _mm_stream_load(const double *mem) { |
f22341db | 534 | #ifdef VC_IMPL_SSE4_1 |
535 | return _mm_castsi128_pd(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<double *>(mem)))); | |
536 | #else | |
537 | return _mm_load_pd(mem); | |
538 | #endif | |
539 | } | |
c017a39f | 540 | static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const int *mem) { |
f22341db | 541 | #ifdef VC_IMPL_SSE4_1 |
542 | return _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<int *>(mem))); | |
543 | #else | |
544 | return _mm_load_si128(reinterpret_cast<const __m128i *>(mem)); | |
545 | #endif | |
546 | } | |
c017a39f | 547 | static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned int *mem) { |
f22341db | 548 | return _mm_stream_load(reinterpret_cast<const int *>(mem)); |
549 | } | |
c017a39f | 550 | static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const short *mem) { |
f22341db | 551 | return _mm_stream_load(reinterpret_cast<const int *>(mem)); |
552 | } | |
c017a39f | 553 | static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned short *mem) { |
f22341db | 554 | return _mm_stream_load(reinterpret_cast<const int *>(mem)); |
555 | } | |
c017a39f | 556 | static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const signed char *mem) { |
f22341db | 557 | return _mm_stream_load(reinterpret_cast<const int *>(mem)); |
558 | } | |
c017a39f | 559 | static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned char *mem) { |
f22341db | 560 | return _mm_stream_load(reinterpret_cast<const int *>(mem)); |
561 | } | |
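
    // Added note: the SSE4.1 path above uses movntdqa, which requires 16-byte aligned addresses
    // and is mainly beneficial on write-combining (e.g. uncached/video) memory; on ordinary
    // write-back memory, and in the non-SSE4.1 fallback, it behaves like a normal aligned load.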
} // namespace SSE
} // namespace Vc
} // namespace AliRoot

// XOP / FMA4
#if defined(VC_IMPL_XOP) || defined(VC_IMPL_FMA4)
extern "C" {
#include <x86intrin.h>
}
#endif

#include "undomacros.h"
#include "shuffle.h"

#endif // SSE_INTRINSICS_H