]>
Commit | Line | Data |
---|---|---|
/*  This file is part of the Vc library.

    Copyright (C) 2009-2012 Matthias Kretz <kretz@kde.org>

    Vc is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as
    published by the Free Software Foundation, either version 3 of
    the License, or (at your option) any later version.

    Vc is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with Vc.  If not, see <http://www.gnu.org/licenses/>.

*/
19 | ||
20 | #ifndef AVX_CASTS_H | |
21 | #define AVX_CASTS_H | |
22 | ||
23 | #include "intrinsics.h" | |
24 | #include "types.h" | |
c017a39f | 25 | #include "macros.h" |
f22341db | 26 | |
c017a39f | 27 | namespace AliRoot { |
f22341db | 28 | namespace Vc |
29 | { | |
30 | namespace AVX | |
31 | { | |
c017a39f | 32 | template<typename T> static Vc_INTRINSIC_L T avx_cast(param128 v) Vc_INTRINSIC_R; |
33 | template<typename T> static Vc_INTRINSIC_L T avx_cast(param128i v) Vc_INTRINSIC_R; | |
34 | template<typename T> static Vc_INTRINSIC_L T avx_cast(param128d v) Vc_INTRINSIC_R; | |
35 | template<typename T> static Vc_INTRINSIC_L T avx_cast(param256 v) Vc_INTRINSIC_R; | |
36 | template<typename T> static Vc_INTRINSIC_L T avx_cast(param256i v) Vc_INTRINSIC_R; | |
37 | template<typename T> static Vc_INTRINSIC_L T avx_cast(param256d v) Vc_INTRINSIC_R; | |
38 | ||
39 | #ifdef VC_UNCONDITIONAL_AVX2_INTRINSICS | |
40 | template<typename T> static Vc_INTRINSIC T avx_cast(__m128 v) { return avx_cast<T>(param128 (v)); } | |
41 | template<typename T> static Vc_INTRINSIC T avx_cast(__m128i v) { return avx_cast<T>(param128i(v)); } | |
42 | template<typename T> static Vc_INTRINSIC T avx_cast(__m128d v) { return avx_cast<T>(param128d(v)); } | |
43 | template<typename T> static Vc_INTRINSIC T avx_cast(__m256 v) { return avx_cast<T>(param256 (v)); } | |
44 | template<typename T> static Vc_INTRINSIC T avx_cast(__m256i v) { return avx_cast<T>(param256i(v)); } | |
45 | template<typename T> static Vc_INTRINSIC T avx_cast(__m256d v) { return avx_cast<T>(param256d(v)); } | |
46 | #endif | |
f22341db | 47 | |
48 | // 128 -> 128 | |
c017a39f | 49 | template<> Vc_INTRINSIC m128 avx_cast(param128 v) { return v; } |
50 | template<> Vc_INTRINSIC m128 avx_cast(param128i v) { return _mm_castsi128_ps(v); } | |
51 | template<> Vc_INTRINSIC m128 avx_cast(param128d v) { return _mm_castpd_ps(v); } | |
52 | template<> Vc_INTRINSIC m128i avx_cast(param128 v) { return _mm_castps_si128(v); } | |
53 | template<> Vc_INTRINSIC m128i avx_cast(param128i v) { return v; } | |
54 | template<> Vc_INTRINSIC m128i avx_cast(param128d v) { return _mm_castpd_si128(v); } | |
55 | template<> Vc_INTRINSIC m128d avx_cast(param128 v) { return _mm_castps_pd(v); } | |
56 | template<> Vc_INTRINSIC m128d avx_cast(param128i v) { return _mm_castsi128_pd(v); } | |
57 | template<> Vc_INTRINSIC m128d avx_cast(param128d v) { return v; } | |
f22341db | 58 | |
59 | // 128 -> 256 | |
c017a39f | 60 | // FIXME: the following casts leave the upper 128bits undefined. With GCC and ICC I've never |
61 | // seen the cast not do what I want though: after a VEX-coded SSE instruction the register's | |
62 | // upper 128bits are zero. Thus using the same register as AVX register will have the upper | |
63 | // 128bits zeroed. MSVC, though, implements _mm256_castxx128_xx256 with a 128bit move to memory | |
64 | // + 256bit load. Thus the upper 128bits are really undefined. But there is no intrinsic to do | |
65 | // what I want (i.e. alias the register, disallowing the move to memory in-between). I'm stuck, | |
66 | // do we really want to rely on specific compiler behavior here? | |
67 | template<> Vc_INTRINSIC m256 avx_cast(param128 v) { return _mm256_castps128_ps256(v); } | |
68 | template<> Vc_INTRINSIC m256 avx_cast(param128i v) { return _mm256_castps128_ps256(_mm_castsi128_ps(v)); } | |
69 | template<> Vc_INTRINSIC m256 avx_cast(param128d v) { return _mm256_castps128_ps256(_mm_castpd_ps(v)); } | |
70 | template<> Vc_INTRINSIC m256i avx_cast(param128 v) { return _mm256_castsi128_si256(_mm_castps_si128(v)); } | |
71 | template<> Vc_INTRINSIC m256i avx_cast(param128i v) { return _mm256_castsi128_si256(v); } | |
72 | template<> Vc_INTRINSIC m256i avx_cast(param128d v) { return _mm256_castsi128_si256(_mm_castpd_si128(v)); } | |
73 | template<> Vc_INTRINSIC m256d avx_cast(param128 v) { return _mm256_castpd128_pd256(_mm_castps_pd(v)); } | |
74 | template<> Vc_INTRINSIC m256d avx_cast(param128i v) { return _mm256_castpd128_pd256(_mm_castsi128_pd(v)); } | |
75 | template<> Vc_INTRINSIC m256d avx_cast(param128d v) { return _mm256_castpd128_pd256(v); } | |
76 | ||
77 | #ifdef VC_MSVC | |
78 | static Vc_INTRINSIC Vc_CONST m256 zeroExtend(param128 v) { return _mm256_permute2f128_ps (_mm256_castps128_ps256(v), _mm256_castps128_ps256(v), 0x80); } | |
79 | static Vc_INTRINSIC Vc_CONST m256i zeroExtend(param128i v) { return _mm256_permute2f128_si256(_mm256_castsi128_si256(v), _mm256_castsi128_si256(v), 0x80); } | |
80 | static Vc_INTRINSIC Vc_CONST m256d zeroExtend(param128d v) { return _mm256_permute2f128_pd (_mm256_castpd128_pd256(v), _mm256_castpd128_pd256(v), 0x80); } | |
81 | #else | |
82 | static Vc_INTRINSIC Vc_CONST m256 zeroExtend(param128 v) { return _mm256_castps128_ps256(v); } | |
83 | static Vc_INTRINSIC Vc_CONST m256i zeroExtend(param128i v) { return _mm256_castsi128_si256(v); } | |
84 | static Vc_INTRINSIC Vc_CONST m256d zeroExtend(param128d v) { return _mm256_castpd128_pd256(v); } | |
85 | #ifdef VC_ICC | |
86 | static Vc_INTRINSIC Vc_CONST m256 zeroExtend(__m128 v) { return _mm256_castps128_ps256(v); } | |
87 | static Vc_INTRINSIC Vc_CONST m256i zeroExtend(__m128i v) { return _mm256_castsi128_si256(v); } | |
88 | static Vc_INTRINSIC Vc_CONST m256d zeroExtend(__m128d v) { return _mm256_castpd128_pd256(v); } | |
89 | #endif | |
90 | #endif | |
f22341db | 91 | |
92 | // 256 -> 128 | |
c017a39f | 93 | template<> Vc_INTRINSIC m128 avx_cast(param256 v) { return _mm256_castps256_ps128(v); } |
94 | template<> Vc_INTRINSIC m128 avx_cast(param256i v) { return _mm256_castps256_ps128(_mm256_castsi256_ps(v)); } | |
95 | template<> Vc_INTRINSIC m128 avx_cast(param256d v) { return _mm256_castps256_ps128(_mm256_castpd_ps(v)); } | |
96 | template<> Vc_INTRINSIC m128i avx_cast(param256 v) { return _mm256_castsi256_si128(_mm256_castps_si256(v)); } | |
97 | template<> Vc_INTRINSIC m128i avx_cast(param256i v) { return _mm256_castsi256_si128(v); } | |
98 | template<> Vc_INTRINSIC m128i avx_cast(param256d v) { return _mm256_castsi256_si128(_mm256_castpd_si256(v)); } | |
99 | template<> Vc_INTRINSIC m128d avx_cast(param256 v) { return _mm256_castpd256_pd128(_mm256_castps_pd(v)); } | |
100 | template<> Vc_INTRINSIC m128d avx_cast(param256i v) { return _mm256_castpd256_pd128(_mm256_castsi256_pd(v)); } | |
101 | template<> Vc_INTRINSIC m128d avx_cast(param256d v) { return _mm256_castpd256_pd128(v); } | |
f22341db | 102 | |
103 | // 256 -> 256 | |
c017a39f | 104 | template<> Vc_INTRINSIC m256 avx_cast(param256 v) { return v; } |
105 | template<> Vc_INTRINSIC m256 avx_cast(param256i v) { return _mm256_castsi256_ps(v); } | |
106 | template<> Vc_INTRINSIC m256 avx_cast(param256d v) { return _mm256_castpd_ps(v); } | |
107 | template<> Vc_INTRINSIC m256i avx_cast(param256 v) { return _mm256_castps_si256(v); } | |
108 | template<> Vc_INTRINSIC m256i avx_cast(param256i v) { return v; } | |
109 | template<> Vc_INTRINSIC m256i avx_cast(param256d v) { return _mm256_castpd_si256(v); } | |
110 | template<> Vc_INTRINSIC m256d avx_cast(param256 v) { return _mm256_castps_pd(v); } | |
111 | template<> Vc_INTRINSIC m256d avx_cast(param256i v) { return _mm256_castsi256_pd(v); } | |
112 | template<> Vc_INTRINSIC m256d avx_cast(param256d v) { return v; } | |
f22341db | 113 | |
114 | // simplify splitting 256-bit registers in 128-bit registers | |
c017a39f | 115 | Vc_INTRINSIC Vc_CONST m128 lo128(param256 v) { return avx_cast<m128>(v); } |
116 | Vc_INTRINSIC Vc_CONST m128d lo128(param256d v) { return avx_cast<m128d>(v); } | |
117 | Vc_INTRINSIC Vc_CONST m128i lo128(param256i v) { return avx_cast<m128i>(v); } | |
118 | Vc_INTRINSIC Vc_CONST m128 hi128(param256 v) { return _mm256_extractf128_ps(v, 1); } | |
119 | Vc_INTRINSIC Vc_CONST m128d hi128(param256d v) { return _mm256_extractf128_pd(v, 1); } | |
120 | Vc_INTRINSIC Vc_CONST m128i hi128(param256i v) { return _mm256_extractf128_si256(v, 1); } | |
f22341db | 121 | |
122 | // simplify combining 128-bit registers in 256-bit registers | |
c017a39f | 123 | Vc_INTRINSIC Vc_CONST m256 concat(param128 a, param128 b) { return _mm256_insertf128_ps (avx_cast<m256 >(a), b, 1); } |
124 | Vc_INTRINSIC Vc_CONST m256d concat(param128d a, param128d b) { return _mm256_insertf128_pd (avx_cast<m256d>(a), b, 1); } | |
125 | Vc_INTRINSIC Vc_CONST m256i concat(param128i a, param128i b) { return _mm256_insertf128_si256(avx_cast<m256i>(a), b, 1); } | |
126 | #ifdef VC_UNCONDITIONAL_AVX2_INTRINSICS | |
127 | Vc_INTRINSIC Vc_CONST m256 concat(__m128 a, param128 b) { return _mm256_insertf128_ps (avx_cast<m256 >(a), b, 1); } | |
128 | Vc_INTRINSIC Vc_CONST m256d concat(__m128d a, param128d b) { return _mm256_insertf128_pd (avx_cast<m256d>(a), b, 1); } | |
129 | Vc_INTRINSIC Vc_CONST m256i concat(__m128i a, param128i b) { return _mm256_insertf128_si256(avx_cast<m256i>(a), b, 1); } | |
130 | Vc_INTRINSIC Vc_CONST m256 concat(param128 a, __m128 b) { return _mm256_insertf128_ps (avx_cast<m256 >(a), b, 1); } | |
131 | Vc_INTRINSIC Vc_CONST m256d concat(param128d a, __m128d b) { return _mm256_insertf128_pd (avx_cast<m256d>(a), b, 1); } | |
132 | Vc_INTRINSIC Vc_CONST m256i concat(param128i a, __m128i b) { return _mm256_insertf128_si256(avx_cast<m256i>(a), b, 1); } | |
133 | Vc_INTRINSIC Vc_CONST m256 concat(__m128 a, __m128 b) { return _mm256_insertf128_ps (avx_cast<m256 >(a), b, 1); } | |
134 | Vc_INTRINSIC Vc_CONST m256d concat(__m128d a, __m128d b) { return _mm256_insertf128_pd (avx_cast<m256d>(a), b, 1); } | |
135 | Vc_INTRINSIC Vc_CONST m256i concat(__m128i a, __m128i b) { return _mm256_insertf128_si256(avx_cast<m256i>(a), b, 1); } | |
136 | #endif | |
f22341db | 137 | |
138 | template<typename From, typename To> struct StaticCastHelper {}; | |
c017a39f | 139 | template<> struct StaticCastHelper<float , int > { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256 v) { return _mm256_cvttps_epi32(v); } }; |
140 | template<> struct StaticCastHelper<double , int > { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256d v) { return avx_cast<m256i>(_mm256_cvttpd_epi32(v)); } }; | |
141 | template<> struct StaticCastHelper<int , int > { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256i v) { return v; } }; | |
142 | template<> struct StaticCastHelper<unsigned int , int > { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256i v) { return v; } }; | |
143 | template<> struct StaticCastHelper<short , int > { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param128i v) { return concat(_mm_srai_epi32(_mm_unpacklo_epi16(v, v), 16), _mm_srai_epi32(_mm_unpackhi_epi16(v, v), 16)); } }; | |
144 | template<> struct StaticCastHelper<float , unsigned int > { static inline Vc_CONST m256i cast(param256 v) { | |
f22341db | 145 | return _mm256_castps_si256(_mm256_blendv_ps( |
146 | _mm256_castsi256_ps(_mm256_cvttps_epi32(v)), | |
c017a39f | 147 | _mm256_castsi256_ps(_mm256_add_epi32(m256i(_mm256_cvttps_epi32(_mm256_sub_ps(v, _mm256_set2power31_ps()))), _mm256_set2power31_epu32())), |
f22341db | 148 | _mm256_cmpge_ps(v, _mm256_set2power31_ps()) |
149 | )); | |
150 | ||
151 | } }; | |
c017a39f | 152 | template<> struct StaticCastHelper<double , unsigned int > { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256d v) { return avx_cast<m256i>(_mm256_cvttpd_epi32(v)); } }; |
153 | template<> struct StaticCastHelper<int , unsigned int > { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256i v) { return v; } }; | |
154 | template<> struct StaticCastHelper<unsigned int , unsigned int > { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256i v) { return v; } }; | |
155 | template<> struct StaticCastHelper<unsigned short, unsigned int > { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param128i v) { return concat(_mm_srli_epi32(_mm_unpacklo_epi16(v, v), 16), _mm_srli_epi32(_mm_unpackhi_epi16(v, v), 16)); } }; | |
156 | template<> struct StaticCastHelper<float , float > { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param256 v) { return v; } }; | |
157 | template<> struct StaticCastHelper<double , float > { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param256d v) { return avx_cast<m256>(_mm256_cvtpd_ps(v)); } }; | |
158 | template<> struct StaticCastHelper<int , float > { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param256i v) { return _mm256_cvtepi32_ps(v); } }; | |
159 | template<> struct StaticCastHelper<unsigned int , float > { static inline Vc_CONST m256 cast(param256i v) { | |
f22341db | 160 | return _mm256_blendv_ps( |
161 | _mm256_cvtepi32_ps(v), | |
162 | _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_sub_epi32(v, _mm256_set2power31_epu32())), _mm256_set2power31_ps()), | |
163 | _mm256_castsi256_ps(_mm256_cmplt_epi32(v, _mm256_setzero_si256())) | |
164 | ); | |
165 | } }; | |
c017a39f | 166 | template<> struct StaticCastHelper<short , float > { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param128i v) { return _mm256_cvtepi32_ps(StaticCastHelper<short, int>::cast(v)); } }; |
167 | template<> struct StaticCastHelper<unsigned short, float > { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param128i v) { return _mm256_cvtepi32_ps(StaticCastHelper<unsigned short, unsigned int>::cast(v)); } }; | |
168 | template<> struct StaticCastHelper<float , double > { static Vc_ALWAYS_INLINE Vc_CONST m256d cast(param256 v) { return _mm256_cvtps_pd(avx_cast<m128>(v)); } }; | |
169 | template<> struct StaticCastHelper<double , double > { static Vc_ALWAYS_INLINE Vc_CONST m256d cast(param256d v) { return v; } }; | |
170 | template<> struct StaticCastHelper<int , double > { static Vc_ALWAYS_INLINE Vc_CONST m256d cast(param256i v) { return _mm256_cvtepi32_pd(avx_cast<m128i>(v)); } }; | |
171 | template<> struct StaticCastHelper<unsigned int , double > { static Vc_ALWAYS_INLINE Vc_CONST m256d cast(param256i v) { return _mm256_cvtepi32_pd(avx_cast<m128i>(v)); } }; | |
172 | template<> struct StaticCastHelper<int , short > { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256i v) { return _mm_packs_epi32(lo128(v), hi128(v)); } }; | |
173 | template<> struct StaticCastHelper<float , short > { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256 v) { return StaticCastHelper<int, short>::cast(StaticCastHelper<float, int>::cast(v)); } }; | |
174 | template<> struct StaticCastHelper<short , short > { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param128i v) { return v; } }; | |
175 | template<> struct StaticCastHelper<unsigned short, short > { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param128i v) { return v; } }; | |
176 | template<> struct StaticCastHelper<unsigned int , unsigned short> { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256i v) { return _mm_packus_epi32(lo128(v), hi128(v)); } }; | |
177 | template<> struct StaticCastHelper<float , unsigned short> { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256 v) { return StaticCastHelper<unsigned int, unsigned short>::cast(StaticCastHelper<float, unsigned int>::cast(v)); } }; | |
178 | template<> struct StaticCastHelper<short , unsigned short> { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param128i v) { return v; } }; | |
179 | template<> struct StaticCastHelper<unsigned short, unsigned short> { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param128i v) { return v; } }; | |
180 | template<> struct StaticCastHelper<sfloat , short > { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256 v) { return StaticCastHelper<int, short>::cast(StaticCastHelper<float, int>::cast(v)); } }; | |
181 | template<> struct StaticCastHelper<sfloat , unsigned short> { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256 v) { return StaticCastHelper<unsigned int, unsigned short>::cast(StaticCastHelper<float, unsigned int>::cast(v)); } }; | |
182 | template<> struct StaticCastHelper<short , sfloat > { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param128i v) { return _mm256_cvtepi32_ps(StaticCastHelper<short, int>::cast(v)); } }; | |
183 | template<> struct StaticCastHelper<unsigned short, sfloat > { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param128i v) { return _mm256_cvtepi32_ps(StaticCastHelper<unsigned short, unsigned int>::cast(v)); } }; | |
f22341db | 184 | } // namespace AVX |
185 | } // namespace Vc | |
c017a39f | 186 | } // namespace AliRoot |
187 | ||
188 | #include "undomacros.h" | |
f22341db | 189 | |
190 | #endif // AVX_CASTS_H |