]>
Commit | Line | Data |
---|---|---|
/*  This file is part of the Vc library.

    Copyright (C) 2009-2012 Matthias Kretz <kretz@kde.org>

    Vc is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as
    published by the Free Software Foundation, either version 3 of
    the License, or (at your option) any later version.

    Vc is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with Vc.  If not, see <http://www.gnu.org/licenses/>.

*/
19 | ||
20 | #ifndef AVX_CASTS_H | |
21 | #define AVX_CASTS_H | |
22 | ||
23 | #include "intrinsics.h" | |
24 | #include "types.h" | |
c017a39f | 25 | #include "macros.h" |
f22341db | 26 | |
c017a39f | 27 | namespace AliRoot { |
f22341db | 28 | namespace Vc |
29 | { | |
30 | namespace AVX | |
31 | { | |
c017a39f | 32 | template<typename T> static Vc_INTRINSIC_L T avx_cast(param128 v) Vc_INTRINSIC_R; |
33 | template<typename T> static Vc_INTRINSIC_L T avx_cast(param128i v) Vc_INTRINSIC_R; | |
34 | template<typename T> static Vc_INTRINSIC_L T avx_cast(param128d v) Vc_INTRINSIC_R; | |
35 | template<typename T> static Vc_INTRINSIC_L T avx_cast(param256 v) Vc_INTRINSIC_R; | |
36 | template<typename T> static Vc_INTRINSIC_L T avx_cast(param256i v) Vc_INTRINSIC_R; | |
37 | template<typename T> static Vc_INTRINSIC_L T avx_cast(param256d v) Vc_INTRINSIC_R; | |
38 | ||
39 | #ifdef VC_UNCONDITIONAL_AVX2_INTRINSICS | |
40 | template<typename T> static Vc_INTRINSIC T avx_cast(__m128 v) { return avx_cast<T>(param128 (v)); } | |
41 | template<typename T> static Vc_INTRINSIC T avx_cast(__m128i v) { return avx_cast<T>(param128i(v)); } | |
42 | template<typename T> static Vc_INTRINSIC T avx_cast(__m128d v) { return avx_cast<T>(param128d(v)); } | |
43 | template<typename T> static Vc_INTRINSIC T avx_cast(__m256 v) { return avx_cast<T>(param256 (v)); } | |
44 | template<typename T> static Vc_INTRINSIC T avx_cast(__m256i v) { return avx_cast<T>(param256i(v)); } | |
45 | template<typename T> static Vc_INTRINSIC T avx_cast(__m256d v) { return avx_cast<T>(param256d(v)); } | |
46 | #endif | |
f22341db | 47 | |
48 | // 128 -> 128 | |
c017a39f | 49 | template<> Vc_INTRINSIC m128 avx_cast(param128 v) { return v; } |
50 | template<> Vc_INTRINSIC m128 avx_cast(param128i v) { return _mm_castsi128_ps(v); } | |
51 | template<> Vc_INTRINSIC m128 avx_cast(param128d v) { return _mm_castpd_ps(v); } | |
52 | template<> Vc_INTRINSIC m128i avx_cast(param128 v) { return _mm_castps_si128(v); } | |
53 | template<> Vc_INTRINSIC m128i avx_cast(param128i v) { return v; } | |
54 | template<> Vc_INTRINSIC m128i avx_cast(param128d v) { return _mm_castpd_si128(v); } | |
55 | template<> Vc_INTRINSIC m128d avx_cast(param128 v) { return _mm_castps_pd(v); } | |
56 | template<> Vc_INTRINSIC m128d avx_cast(param128i v) { return _mm_castsi128_pd(v); } | |
57 | template<> Vc_INTRINSIC m128d avx_cast(param128d v) { return v; } | |
f22341db | 58 | |
59 | // 128 -> 256 | |
c017a39f | 60 | // FIXME: the following casts leave the upper 128bits undefined. With GCC and ICC I've never |
61 | // seen the cast not do what I want though: after a VEX-coded SSE instruction the register's | |
62 | // upper 128bits are zero. Thus using the same register as AVX register will have the upper | |
63 | // 128bits zeroed. MSVC, though, implements _mm256_castxx128_xx256 with a 128bit move to memory | |
64 | // + 256bit load. Thus the upper 128bits are really undefined. But there is no intrinsic to do | |
65 | // what I want (i.e. alias the register, disallowing the move to memory in-between). I'm stuck, | |
66 | // do we really want to rely on specific compiler behavior here? | |
67 | template<> Vc_INTRINSIC m256 avx_cast(param128 v) { return _mm256_castps128_ps256(v); } | |
68 | template<> Vc_INTRINSIC m256 avx_cast(param128i v) { return _mm256_castps128_ps256(_mm_castsi128_ps(v)); } | |
69 | template<> Vc_INTRINSIC m256 avx_cast(param128d v) { return _mm256_castps128_ps256(_mm_castpd_ps(v)); } | |
70 | template<> Vc_INTRINSIC m256i avx_cast(param128 v) { return _mm256_castsi128_si256(_mm_castps_si128(v)); } | |
71 | template<> Vc_INTRINSIC m256i avx_cast(param128i v) { return _mm256_castsi128_si256(v); } | |
72 | template<> Vc_INTRINSIC m256i avx_cast(param128d v) { return _mm256_castsi128_si256(_mm_castpd_si128(v)); } | |
73 | template<> Vc_INTRINSIC m256d avx_cast(param128 v) { return _mm256_castpd128_pd256(_mm_castps_pd(v)); } | |
74 | template<> Vc_INTRINSIC m256d avx_cast(param128i v) { return _mm256_castpd128_pd256(_mm_castsi128_pd(v)); } | |
75 | template<> Vc_INTRINSIC m256d avx_cast(param128d v) { return _mm256_castpd128_pd256(v); } | |
76 | ||
77 | #ifdef VC_MSVC | |
78 | static Vc_INTRINSIC Vc_CONST m256 zeroExtend(param128 v) { return _mm256_permute2f128_ps (_mm256_castps128_ps256(v), _mm256_castps128_ps256(v), 0x80); } | |
79 | static Vc_INTRINSIC Vc_CONST m256i zeroExtend(param128i v) { return _mm256_permute2f128_si256(_mm256_castsi128_si256(v), _mm256_castsi128_si256(v), 0x80); } | |
80 | static Vc_INTRINSIC Vc_CONST m256d zeroExtend(param128d v) { return _mm256_permute2f128_pd (_mm256_castpd128_pd256(v), _mm256_castpd128_pd256(v), 0x80); } | |
81 | #else | |
82 | static Vc_INTRINSIC Vc_CONST m256 zeroExtend(param128 v) { return _mm256_castps128_ps256(v); } | |
83 | static Vc_INTRINSIC Vc_CONST m256i zeroExtend(param128i v) { return _mm256_castsi128_si256(v); } | |
84 | static Vc_INTRINSIC Vc_CONST m256d zeroExtend(param128d v) { return _mm256_castpd128_pd256(v); } | |
85 | #ifdef VC_ICC | |
86 | static Vc_INTRINSIC Vc_CONST m256 zeroExtend(__m128 v) { return _mm256_castps128_ps256(v); } | |
87 | static Vc_INTRINSIC Vc_CONST m256i zeroExtend(__m128i v) { return _mm256_castsi128_si256(v); } | |
88 | static Vc_INTRINSIC Vc_CONST m256d zeroExtend(__m128d v) { return _mm256_castpd128_pd256(v); } | |
89 | #endif | |
90 | #endif | |
f22341db | 91 | |
92 | // 256 -> 128 | |
c017a39f | 93 | template<> Vc_INTRINSIC m128 avx_cast(param256 v) { return _mm256_castps256_ps128(v); } |
94 | template<> Vc_INTRINSIC m128 avx_cast(param256i v) { return _mm256_castps256_ps128(_mm256_castsi256_ps(v)); } | |
95 | template<> Vc_INTRINSIC m128 avx_cast(param256d v) { return _mm256_castps256_ps128(_mm256_castpd_ps(v)); } | |
96 | template<> Vc_INTRINSIC m128i avx_cast(param256 v) { return _mm256_castsi256_si128(_mm256_castps_si256(v)); } | |
97 | template<> Vc_INTRINSIC m128i avx_cast(param256i v) { return _mm256_castsi256_si128(v); } | |
98 | template<> Vc_INTRINSIC m128i avx_cast(param256d v) { return _mm256_castsi256_si128(_mm256_castpd_si256(v)); } | |
99 | template<> Vc_INTRINSIC m128d avx_cast(param256 v) { return _mm256_castpd256_pd128(_mm256_castps_pd(v)); } | |
100 | template<> Vc_INTRINSIC m128d avx_cast(param256i v) { return _mm256_castpd256_pd128(_mm256_castsi256_pd(v)); } | |
101 | template<> Vc_INTRINSIC m128d avx_cast(param256d v) { return _mm256_castpd256_pd128(v); } | |
f22341db | 102 | |
103 | // 256 -> 256 | |
c017a39f | 104 | template<> Vc_INTRINSIC m256 avx_cast(param256 v) { return v; } |
105 | template<> Vc_INTRINSIC m256 avx_cast(param256i v) { return _mm256_castsi256_ps(v); } | |
106 | template<> Vc_INTRINSIC m256 avx_cast(param256d v) { return _mm256_castpd_ps(v); } | |
107 | template<> Vc_INTRINSIC m256i avx_cast(param256 v) { return _mm256_castps_si256(v); } | |
108 | template<> Vc_INTRINSIC m256i avx_cast(param256i v) { return v; } | |
109 | template<> Vc_INTRINSIC m256i avx_cast(param256d v) { return _mm256_castpd_si256(v); } | |
110 | template<> Vc_INTRINSIC m256d avx_cast(param256 v) { return _mm256_castps_pd(v); } | |
111 | template<> Vc_INTRINSIC m256d avx_cast(param256i v) { return _mm256_castsi256_pd(v); } | |
112 | template<> Vc_INTRINSIC m256d avx_cast(param256d v) { return v; } | |
f22341db | 113 | |
114 | // simplify splitting 256-bit registers in 128-bit registers | |
c017a39f | 115 | Vc_INTRINSIC Vc_CONST m128 lo128(param256 v) { return avx_cast<m128>(v); } |
116 | Vc_INTRINSIC Vc_CONST m128d lo128(param256d v) { return avx_cast<m128d>(v); } | |
117 | Vc_INTRINSIC Vc_CONST m128i lo128(param256i v) { return avx_cast<m128i>(v); } | |
118 | Vc_INTRINSIC Vc_CONST m128 hi128(param256 v) { return _mm256_extractf128_ps(v, 1); } | |
119 | Vc_INTRINSIC Vc_CONST m128d hi128(param256d v) { return _mm256_extractf128_pd(v, 1); } | |
120 | Vc_INTRINSIC Vc_CONST m128i hi128(param256i v) { return _mm256_extractf128_si256(v, 1); } | |
f22341db | 121 | |
122 | // simplify combining 128-bit registers in 256-bit registers | |
c017a39f | 123 | Vc_INTRINSIC Vc_CONST m256 concat(param128 a, param128 b) { return _mm256_insertf128_ps (avx_cast<m256 >(a), b, 1); } |
124 | Vc_INTRINSIC Vc_CONST m256d concat(param128d a, param128d b) { return _mm256_insertf128_pd (avx_cast<m256d>(a), b, 1); } | |
125 | Vc_INTRINSIC Vc_CONST m256i concat(param128i a, param128i b) { return _mm256_insertf128_si256(avx_cast<m256i>(a), b, 1); } | |
126 | #ifdef VC_UNCONDITIONAL_AVX2_INTRINSICS | |
127 | Vc_INTRINSIC Vc_CONST m256 concat(__m128 a, param128 b) { return _mm256_insertf128_ps (avx_cast<m256 >(a), b, 1); } | |
128 | Vc_INTRINSIC Vc_CONST m256d concat(__m128d a, param128d b) { return _mm256_insertf128_pd (avx_cast<m256d>(a), b, 1); } | |
129 | Vc_INTRINSIC Vc_CONST m256i concat(__m128i a, param128i b) { return _mm256_insertf128_si256(avx_cast<m256i>(a), b, 1); } | |
130 | Vc_INTRINSIC Vc_CONST m256 concat(param128 a, __m128 b) { return _mm256_insertf128_ps (avx_cast<m256 >(a), b, 1); } | |
131 | Vc_INTRINSIC Vc_CONST m256d concat(param128d a, __m128d b) { return _mm256_insertf128_pd (avx_cast<m256d>(a), b, 1); } | |
132 | Vc_INTRINSIC Vc_CONST m256i concat(param128i a, __m128i b) { return _mm256_insertf128_si256(avx_cast<m256i>(a), b, 1); } | |
133 | Vc_INTRINSIC Vc_CONST m256 concat(__m128 a, __m128 b) { return _mm256_insertf128_ps (avx_cast<m256 >(a), b, 1); } | |
134 | Vc_INTRINSIC Vc_CONST m256d concat(__m128d a, __m128d b) { return _mm256_insertf128_pd (avx_cast<m256d>(a), b, 1); } | |
135 | Vc_INTRINSIC Vc_CONST m256i concat(__m128i a, __m128i b) { return _mm256_insertf128_si256(avx_cast<m256i>(a), b, 1); } | |
136 | #endif | |
f22341db | 137 | |
138 | template<typename From, typename To> struct StaticCastHelper {}; | |
c017a39f | 139 | template<> struct StaticCastHelper<float , int > { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256 v) { return _mm256_cvttps_epi32(v); } }; |
140 | template<> struct StaticCastHelper<double , int > { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256d v) { return avx_cast<m256i>(_mm256_cvttpd_epi32(v)); } }; | |
141 | template<> struct StaticCastHelper<int , int > { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256i v) { return v; } }; | |
142 | template<> struct StaticCastHelper<unsigned int , int > { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256i v) { return v; } }; | |
143 | template<> struct StaticCastHelper<short , int > { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param128i v) { return concat(_mm_srai_epi32(_mm_unpacklo_epi16(v, v), 16), _mm_srai_epi32(_mm_unpackhi_epi16(v, v), 16)); } }; | |
144 | template<> struct StaticCastHelper<float , unsigned int > { static inline Vc_CONST m256i cast(param256 v) { | |
f22341db | 145 | return _mm256_castps_si256(_mm256_blendv_ps( |
146 | _mm256_castsi256_ps(_mm256_cvttps_epi32(v)), | |
c017a39f | 147 | _mm256_castsi256_ps(_mm256_add_epi32(m256i(_mm256_cvttps_epi32(_mm256_sub_ps(v, _mm256_set2power31_ps()))), _mm256_set2power31_epu32())), |
f22341db | 148 | _mm256_cmpge_ps(v, _mm256_set2power31_ps()) |
149 | )); | |
150 | ||
151 | } }; | |
c017a39f | 152 | template<> struct StaticCastHelper<double , unsigned int > { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256d v) { return avx_cast<m256i>(_mm256_cvttpd_epi32(v)); } }; |
153 | template<> struct StaticCastHelper<int , unsigned int > { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256i v) { return v; } }; | |
154 | template<> struct StaticCastHelper<unsigned int , unsigned int > { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256i v) { return v; } }; | |
155 | template<> struct StaticCastHelper<unsigned short, unsigned int > { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param128i v) { return concat(_mm_srli_epi32(_mm_unpacklo_epi16(v, v), 16), _mm_srli_epi32(_mm_unpackhi_epi16(v, v), 16)); } }; | |
156 | template<> struct StaticCastHelper<float , float > { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param256 v) { return v; } }; | |
157 | template<> struct StaticCastHelper<double , float > { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param256d v) { return avx_cast<m256>(_mm256_cvtpd_ps(v)); } }; | |
158 | template<> struct StaticCastHelper<int , float > { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param256i v) { return _mm256_cvtepi32_ps(v); } }; | |
159 | template<> struct StaticCastHelper<unsigned int , float > { static inline Vc_CONST m256 cast(param256i v) { | |
f22341db | 160 | return _mm256_blendv_ps( |
161 | _mm256_cvtepi32_ps(v), | |
162 | _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_sub_epi32(v, _mm256_set2power31_epu32())), _mm256_set2power31_ps()), | |
163 | _mm256_castsi256_ps(_mm256_cmplt_epi32(v, _mm256_setzero_si256())) | |
164 | ); | |
165 | } }; | |
c017a39f | 166 | template<> struct StaticCastHelper<short , float > { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param128i v) { return _mm256_cvtepi32_ps(StaticCastHelper<short, int>::cast(v)); } }; |
167 | template<> struct StaticCastHelper<unsigned short, float > { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param128i v) { return _mm256_cvtepi32_ps(StaticCastHelper<unsigned short, unsigned int>::cast(v)); } }; | |
168 | template<> struct StaticCastHelper<float , double > { static Vc_ALWAYS_INLINE Vc_CONST m256d cast(param256 v) { return _mm256_cvtps_pd(avx_cast<m128>(v)); } }; | |
169 | template<> struct StaticCastHelper<double , double > { static Vc_ALWAYS_INLINE Vc_CONST m256d cast(param256d v) { return v; } }; | |
170 | template<> struct StaticCastHelper<int , double > { static Vc_ALWAYS_INLINE Vc_CONST m256d cast(param256i v) { return _mm256_cvtepi32_pd(avx_cast<m128i>(v)); } }; | |
171 | template<> struct StaticCastHelper<unsigned int , double > { static Vc_ALWAYS_INLINE Vc_CONST m256d cast(param256i v) { return _mm256_cvtepi32_pd(avx_cast<m128i>(v)); } }; | |
172 | template<> struct StaticCastHelper<int , short > { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256i v) { return _mm_packs_epi32(lo128(v), hi128(v)); } }; | |
173 | template<> struct StaticCastHelper<float , short > { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256 v) { return StaticCastHelper<int, short>::cast(StaticCastHelper<float, int>::cast(v)); } }; | |
174 | template<> struct StaticCastHelper<short , short > { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param128i v) { return v; } }; | |
175 | template<> struct StaticCastHelper<unsigned short, short > { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param128i v) { return v; } }; | |
176 | template<> struct StaticCastHelper<unsigned int , unsigned short> { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256i v) { return _mm_packus_epi32(lo128(v), hi128(v)); } }; | |
177 | template<> struct StaticCastHelper<float , unsigned short> { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256 v) { return StaticCastHelper<unsigned int, unsigned short>::cast(StaticCastHelper<float, unsigned int>::cast(v)); } }; | |
178 | template<> struct StaticCastHelper<short , unsigned short> { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param128i v) { return v; } }; | |
179 | template<> struct StaticCastHelper<unsigned short, unsigned short> { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param128i v) { return v; } }; | |
180 | template<> struct StaticCastHelper<sfloat , short > { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256 v) { return StaticCastHelper<int, short>::cast(StaticCastHelper<float, int>::cast(v)); } }; | |
181 | template<> struct StaticCastHelper<sfloat , unsigned short> { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256 v) { return StaticCastHelper<unsigned int, unsigned short>::cast(StaticCastHelper<float, unsigned int>::cast(v)); } }; | |
182 | template<> struct StaticCastHelper<short , sfloat > { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param128i v) { return _mm256_cvtepi32_ps(StaticCastHelper<short, int>::cast(v)); } }; | |
183 | template<> struct StaticCastHelper<unsigned short, sfloat > { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param128i v) { return _mm256_cvtepi32_ps(StaticCastHelper<unsigned short, unsigned int>::cast(v)); } }; | |
f22341db | 184 | } // namespace AVX |
185 | } // namespace Vc | |
c017a39f | 186 | } // namespace AliRoot |
187 | ||
188 | #include "undomacros.h" | |
f22341db | 189 | |
190 | #endif // AVX_CASTS_H |