]> git.uio.no Git - u/mrichter/AliRoot.git/blame - Vc/include/Vc/avx/casts.h
update to Vc 0.7.3-dev
[u/mrichter/AliRoot.git] / Vc / include / Vc / avx / casts.h
CommitLineData
f22341db 1/* This file is part of the Vc library.
2
3 Copyright (C) 2009-2012 Matthias Kretz <kretz@kde.org>
4
5 Vc is free software: you can redistribute it and/or modify
6 it under the terms of the GNU Lesser General Public License as
7 published by the Free Software Foundation, either version 3 of
8 the License, or (at your option) any later version.
9
10 Vc is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with Vc. If not, see <http://www.gnu.org/licenses/>.
17
18*/
19
20#ifndef AVX_CASTS_H
21#define AVX_CASTS_H
22
23#include "intrinsics.h"
24#include "types.h"
c017a39f 25#include "macros.h"
f22341db 26
c017a39f 27namespace AliRoot {
f22341db 28namespace Vc
29{
30namespace AVX
31{
c017a39f 32 template<typename T> static Vc_INTRINSIC_L T avx_cast(param128 v) Vc_INTRINSIC_R;
33 template<typename T> static Vc_INTRINSIC_L T avx_cast(param128i v) Vc_INTRINSIC_R;
34 template<typename T> static Vc_INTRINSIC_L T avx_cast(param128d v) Vc_INTRINSIC_R;
35 template<typename T> static Vc_INTRINSIC_L T avx_cast(param256 v) Vc_INTRINSIC_R;
36 template<typename T> static Vc_INTRINSIC_L T avx_cast(param256i v) Vc_INTRINSIC_R;
37 template<typename T> static Vc_INTRINSIC_L T avx_cast(param256d v) Vc_INTRINSIC_R;
38
#ifdef VC_UNCONDITIONAL_AVX2_INTRINSICS
// Overloads taking the raw intrinsic register types. Each one converts its
// argument to the corresponding paramXXX type and forwards to the avx_cast
// overload declared for that type.
template<typename T>
static Vc_INTRINSIC T avx_cast(__m128 v)
{
    return avx_cast<T>(param128(v));
}
template<typename T>
static Vc_INTRINSIC T avx_cast(__m128i v)
{
    return avx_cast<T>(param128i(v));
}
template<typename T>
static Vc_INTRINSIC T avx_cast(__m128d v)
{
    return avx_cast<T>(param128d(v));
}
template<typename T>
static Vc_INTRINSIC T avx_cast(__m256 v)
{
    return avx_cast<T>(param256(v));
}
template<typename T>
static Vc_INTRINSIC T avx_cast(__m256i v)
{
    return avx_cast<T>(param256i(v));
}
template<typename T>
static Vc_INTRINSIC T avx_cast(__m256d v)
{
    return avx_cast<T>(param256d(v));
}
#endif
f22341db 47
48 // 128 -> 128
c017a39f 49 template<> Vc_INTRINSIC m128 avx_cast(param128 v) { return v; }
50 template<> Vc_INTRINSIC m128 avx_cast(param128i v) { return _mm_castsi128_ps(v); }
51 template<> Vc_INTRINSIC m128 avx_cast(param128d v) { return _mm_castpd_ps(v); }
52 template<> Vc_INTRINSIC m128i avx_cast(param128 v) { return _mm_castps_si128(v); }
53 template<> Vc_INTRINSIC m128i avx_cast(param128i v) { return v; }
54 template<> Vc_INTRINSIC m128i avx_cast(param128d v) { return _mm_castpd_si128(v); }
55 template<> Vc_INTRINSIC m128d avx_cast(param128 v) { return _mm_castps_pd(v); }
56 template<> Vc_INTRINSIC m128d avx_cast(param128i v) { return _mm_castsi128_pd(v); }
57 template<> Vc_INTRINSIC m128d avx_cast(param128d v) { return v; }
f22341db 58
59 // 128 -> 256
c017a39f 60 // FIXME: the following casts leave the upper 128bits undefined. With GCC and ICC I've never
61 // seen the cast not do what I want though: after a VEX-coded SSE instruction the register's
62 // upper 128bits are zero. Thus using the same register as AVX register will have the upper
63 // 128bits zeroed. MSVC, though, implements _mm256_castxx128_xx256 with a 128bit move to memory
64 // + 256bit load. Thus the upper 128bits are really undefined. But there is no intrinsic to do
65 // what I want (i.e. alias the register, disallowing the move to memory in-between). I'm stuck,
66 // do we really want to rely on specific compiler behavior here?
67 template<> Vc_INTRINSIC m256 avx_cast(param128 v) { return _mm256_castps128_ps256(v); }
68 template<> Vc_INTRINSIC m256 avx_cast(param128i v) { return _mm256_castps128_ps256(_mm_castsi128_ps(v)); }
69 template<> Vc_INTRINSIC m256 avx_cast(param128d v) { return _mm256_castps128_ps256(_mm_castpd_ps(v)); }
70 template<> Vc_INTRINSIC m256i avx_cast(param128 v) { return _mm256_castsi128_si256(_mm_castps_si128(v)); }
71 template<> Vc_INTRINSIC m256i avx_cast(param128i v) { return _mm256_castsi128_si256(v); }
72 template<> Vc_INTRINSIC m256i avx_cast(param128d v) { return _mm256_castsi128_si256(_mm_castpd_si128(v)); }
73 template<> Vc_INTRINSIC m256d avx_cast(param128 v) { return _mm256_castpd128_pd256(_mm_castps_pd(v)); }
74 template<> Vc_INTRINSIC m256d avx_cast(param128i v) { return _mm256_castpd128_pd256(_mm_castsi128_pd(v)); }
75 template<> Vc_INTRINSIC m256d avx_cast(param128d v) { return _mm256_castpd128_pd256(v); }
76
77#ifdef VC_MSVC
78 static Vc_INTRINSIC Vc_CONST m256 zeroExtend(param128 v) { return _mm256_permute2f128_ps (_mm256_castps128_ps256(v), _mm256_castps128_ps256(v), 0x80); }
79 static Vc_INTRINSIC Vc_CONST m256i zeroExtend(param128i v) { return _mm256_permute2f128_si256(_mm256_castsi128_si256(v), _mm256_castsi128_si256(v), 0x80); }
80 static Vc_INTRINSIC Vc_CONST m256d zeroExtend(param128d v) { return _mm256_permute2f128_pd (_mm256_castpd128_pd256(v), _mm256_castpd128_pd256(v), 0x80); }
81#else
82 static Vc_INTRINSIC Vc_CONST m256 zeroExtend(param128 v) { return _mm256_castps128_ps256(v); }
83 static Vc_INTRINSIC Vc_CONST m256i zeroExtend(param128i v) { return _mm256_castsi128_si256(v); }
84 static Vc_INTRINSIC Vc_CONST m256d zeroExtend(param128d v) { return _mm256_castpd128_pd256(v); }
85#ifdef VC_ICC
86 static Vc_INTRINSIC Vc_CONST m256 zeroExtend(__m128 v) { return _mm256_castps128_ps256(v); }
87 static Vc_INTRINSIC Vc_CONST m256i zeroExtend(__m128i v) { return _mm256_castsi128_si256(v); }
88 static Vc_INTRINSIC Vc_CONST m256d zeroExtend(__m128d v) { return _mm256_castpd128_pd256(v); }
89#endif
90#endif
f22341db 91
92 // 256 -> 128
c017a39f 93 template<> Vc_INTRINSIC m128 avx_cast(param256 v) { return _mm256_castps256_ps128(v); }
94 template<> Vc_INTRINSIC m128 avx_cast(param256i v) { return _mm256_castps256_ps128(_mm256_castsi256_ps(v)); }
95 template<> Vc_INTRINSIC m128 avx_cast(param256d v) { return _mm256_castps256_ps128(_mm256_castpd_ps(v)); }
96 template<> Vc_INTRINSIC m128i avx_cast(param256 v) { return _mm256_castsi256_si128(_mm256_castps_si256(v)); }
97 template<> Vc_INTRINSIC m128i avx_cast(param256i v) { return _mm256_castsi256_si128(v); }
98 template<> Vc_INTRINSIC m128i avx_cast(param256d v) { return _mm256_castsi256_si128(_mm256_castpd_si256(v)); }
99 template<> Vc_INTRINSIC m128d avx_cast(param256 v) { return _mm256_castpd256_pd128(_mm256_castps_pd(v)); }
100 template<> Vc_INTRINSIC m128d avx_cast(param256i v) { return _mm256_castpd256_pd128(_mm256_castsi256_pd(v)); }
101 template<> Vc_INTRINSIC m128d avx_cast(param256d v) { return _mm256_castpd256_pd128(v); }
f22341db 102
103 // 256 -> 256
c017a39f 104 template<> Vc_INTRINSIC m256 avx_cast(param256 v) { return v; }
105 template<> Vc_INTRINSIC m256 avx_cast(param256i v) { return _mm256_castsi256_ps(v); }
106 template<> Vc_INTRINSIC m256 avx_cast(param256d v) { return _mm256_castpd_ps(v); }
107 template<> Vc_INTRINSIC m256i avx_cast(param256 v) { return _mm256_castps_si256(v); }
108 template<> Vc_INTRINSIC m256i avx_cast(param256i v) { return v; }
109 template<> Vc_INTRINSIC m256i avx_cast(param256d v) { return _mm256_castpd_si256(v); }
110 template<> Vc_INTRINSIC m256d avx_cast(param256 v) { return _mm256_castps_pd(v); }
111 template<> Vc_INTRINSIC m256d avx_cast(param256i v) { return _mm256_castsi256_pd(v); }
112 template<> Vc_INTRINSIC m256d avx_cast(param256d v) { return v; }
f22341db 113
114 // simplify splitting 256-bit registers in 128-bit registers
c017a39f 115 Vc_INTRINSIC Vc_CONST m128 lo128(param256 v) { return avx_cast<m128>(v); }
116 Vc_INTRINSIC Vc_CONST m128d lo128(param256d v) { return avx_cast<m128d>(v); }
117 Vc_INTRINSIC Vc_CONST m128i lo128(param256i v) { return avx_cast<m128i>(v); }
118 Vc_INTRINSIC Vc_CONST m128 hi128(param256 v) { return _mm256_extractf128_ps(v, 1); }
119 Vc_INTRINSIC Vc_CONST m128d hi128(param256d v) { return _mm256_extractf128_pd(v, 1); }
120 Vc_INTRINSIC Vc_CONST m128i hi128(param256i v) { return _mm256_extractf128_si256(v, 1); }
f22341db 121
122 // simplify combining 128-bit registers in 256-bit registers
c017a39f 123 Vc_INTRINSIC Vc_CONST m256 concat(param128 a, param128 b) { return _mm256_insertf128_ps (avx_cast<m256 >(a), b, 1); }
124 Vc_INTRINSIC Vc_CONST m256d concat(param128d a, param128d b) { return _mm256_insertf128_pd (avx_cast<m256d>(a), b, 1); }
125 Vc_INTRINSIC Vc_CONST m256i concat(param128i a, param128i b) { return _mm256_insertf128_si256(avx_cast<m256i>(a), b, 1); }
126#ifdef VC_UNCONDITIONAL_AVX2_INTRINSICS
127 Vc_INTRINSIC Vc_CONST m256 concat(__m128 a, param128 b) { return _mm256_insertf128_ps (avx_cast<m256 >(a), b, 1); }
128 Vc_INTRINSIC Vc_CONST m256d concat(__m128d a, param128d b) { return _mm256_insertf128_pd (avx_cast<m256d>(a), b, 1); }
129 Vc_INTRINSIC Vc_CONST m256i concat(__m128i a, param128i b) { return _mm256_insertf128_si256(avx_cast<m256i>(a), b, 1); }
130 Vc_INTRINSIC Vc_CONST m256 concat(param128 a, __m128 b) { return _mm256_insertf128_ps (avx_cast<m256 >(a), b, 1); }
131 Vc_INTRINSIC Vc_CONST m256d concat(param128d a, __m128d b) { return _mm256_insertf128_pd (avx_cast<m256d>(a), b, 1); }
132 Vc_INTRINSIC Vc_CONST m256i concat(param128i a, __m128i b) { return _mm256_insertf128_si256(avx_cast<m256i>(a), b, 1); }
133 Vc_INTRINSIC Vc_CONST m256 concat(__m128 a, __m128 b) { return _mm256_insertf128_ps (avx_cast<m256 >(a), b, 1); }
134 Vc_INTRINSIC Vc_CONST m256d concat(__m128d a, __m128d b) { return _mm256_insertf128_pd (avx_cast<m256d>(a), b, 1); }
135 Vc_INTRINSIC Vc_CONST m256i concat(__m128i a, __m128i b) { return _mm256_insertf128_si256(avx_cast<m256i>(a), b, 1); }
136#endif
f22341db 137
138 template<typename From, typename To> struct StaticCastHelper {};
c017a39f 139 template<> struct StaticCastHelper<float , int > { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256 v) { return _mm256_cvttps_epi32(v); } };
140 template<> struct StaticCastHelper<double , int > { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256d v) { return avx_cast<m256i>(_mm256_cvttpd_epi32(v)); } };
141 template<> struct StaticCastHelper<int , int > { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256i v) { return v; } };
142 template<> struct StaticCastHelper<unsigned int , int > { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256i v) { return v; } };
143 template<> struct StaticCastHelper<short , int > { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param128i v) { return concat(_mm_srai_epi32(_mm_unpacklo_epi16(v, v), 16), _mm_srai_epi32(_mm_unpackhi_epi16(v, v), 16)); } };
144 template<> struct StaticCastHelper<float , unsigned int > { static inline Vc_CONST m256i cast(param256 v) {
f22341db 145 return _mm256_castps_si256(_mm256_blendv_ps(
146 _mm256_castsi256_ps(_mm256_cvttps_epi32(v)),
c017a39f 147 _mm256_castsi256_ps(_mm256_add_epi32(m256i(_mm256_cvttps_epi32(_mm256_sub_ps(v, _mm256_set2power31_ps()))), _mm256_set2power31_epu32())),
f22341db 148 _mm256_cmpge_ps(v, _mm256_set2power31_ps())
149 ));
150
151 } };
c017a39f 152 template<> struct StaticCastHelper<double , unsigned int > { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256d v) { return avx_cast<m256i>(_mm256_cvttpd_epi32(v)); } };
153 template<> struct StaticCastHelper<int , unsigned int > { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256i v) { return v; } };
154 template<> struct StaticCastHelper<unsigned int , unsigned int > { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256i v) { return v; } };
155 template<> struct StaticCastHelper<unsigned short, unsigned int > { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param128i v) { return concat(_mm_srli_epi32(_mm_unpacklo_epi16(v, v), 16), _mm_srli_epi32(_mm_unpackhi_epi16(v, v), 16)); } };
156 template<> struct StaticCastHelper<float , float > { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param256 v) { return v; } };
157 template<> struct StaticCastHelper<double , float > { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param256d v) { return avx_cast<m256>(_mm256_cvtpd_ps(v)); } };
158 template<> struct StaticCastHelper<int , float > { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param256i v) { return _mm256_cvtepi32_ps(v); } };
159 template<> struct StaticCastHelper<unsigned int , float > { static inline Vc_CONST m256 cast(param256i v) {
f22341db 160 return _mm256_blendv_ps(
161 _mm256_cvtepi32_ps(v),
162 _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_sub_epi32(v, _mm256_set2power31_epu32())), _mm256_set2power31_ps()),
163 _mm256_castsi256_ps(_mm256_cmplt_epi32(v, _mm256_setzero_si256()))
164 );
165 } };
c017a39f 166 template<> struct StaticCastHelper<short , float > { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param128i v) { return _mm256_cvtepi32_ps(StaticCastHelper<short, int>::cast(v)); } };
167 template<> struct StaticCastHelper<unsigned short, float > { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param128i v) { return _mm256_cvtepi32_ps(StaticCastHelper<unsigned short, unsigned int>::cast(v)); } };
168 template<> struct StaticCastHelper<float , double > { static Vc_ALWAYS_INLINE Vc_CONST m256d cast(param256 v) { return _mm256_cvtps_pd(avx_cast<m128>(v)); } };
169 template<> struct StaticCastHelper<double , double > { static Vc_ALWAYS_INLINE Vc_CONST m256d cast(param256d v) { return v; } };
170 template<> struct StaticCastHelper<int , double > { static Vc_ALWAYS_INLINE Vc_CONST m256d cast(param256i v) { return _mm256_cvtepi32_pd(avx_cast<m128i>(v)); } };
171 template<> struct StaticCastHelper<unsigned int , double > { static Vc_ALWAYS_INLINE Vc_CONST m256d cast(param256i v) { return _mm256_cvtepi32_pd(avx_cast<m128i>(v)); } };
172 template<> struct StaticCastHelper<int , short > { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256i v) { return _mm_packs_epi32(lo128(v), hi128(v)); } };
173 template<> struct StaticCastHelper<float , short > { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256 v) { return StaticCastHelper<int, short>::cast(StaticCastHelper<float, int>::cast(v)); } };
174 template<> struct StaticCastHelper<short , short > { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param128i v) { return v; } };
175 template<> struct StaticCastHelper<unsigned short, short > { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param128i v) { return v; } };
176 template<> struct StaticCastHelper<unsigned int , unsigned short> { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256i v) { return _mm_packus_epi32(lo128(v), hi128(v)); } };
177 template<> struct StaticCastHelper<float , unsigned short> { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256 v) { return StaticCastHelper<unsigned int, unsigned short>::cast(StaticCastHelper<float, unsigned int>::cast(v)); } };
178 template<> struct StaticCastHelper<short , unsigned short> { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param128i v) { return v; } };
179 template<> struct StaticCastHelper<unsigned short, unsigned short> { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param128i v) { return v; } };
180 template<> struct StaticCastHelper<sfloat , short > { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256 v) { return StaticCastHelper<int, short>::cast(StaticCastHelper<float, int>::cast(v)); } };
181 template<> struct StaticCastHelper<sfloat , unsigned short> { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256 v) { return StaticCastHelper<unsigned int, unsigned short>::cast(StaticCastHelper<float, unsigned int>::cast(v)); } };
182 template<> struct StaticCastHelper<short , sfloat > { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param128i v) { return _mm256_cvtepi32_ps(StaticCastHelper<short, int>::cast(v)); } };
183 template<> struct StaticCastHelper<unsigned short, sfloat > { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param128i v) { return _mm256_cvtepi32_ps(StaticCastHelper<unsigned short, unsigned int>::cast(v)); } };
f22341db 184} // namespace AVX
185} // namespace Vc
c017a39f 186} // namespace AliRoot
187
188#include "undomacros.h"
f22341db 189
190#endif // AVX_CASTS_H